From 619b3fc5092acc0e25d1657764d7982815628681 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Wed, 18 Sep 2024 22:04:31 -0700 Subject: [PATCH 01/50] SSE RSUM/RDSUM use correct headers for SSE2, SSSE3 and SSE4 - Replace python floor with // and remove math header PiperOrigin-RevId: 676263295 --- src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-sse41-c16.c | 3 +-- src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-sse41-c32.c | 3 +-- src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-sse41-c64.c | 3 +-- src/qs8-rdsum/sse41.c.in | 6 ++---- src/qs8-rsum/gen/qs8-rsum-ssse3-u16.c | 3 +-- src/qs8-rsum/gen/qs8-rsum-ssse3-u32-acc2.c | 3 +-- src/qs8-rsum/gen/qs8-rsum-ssse3-u32.c | 3 +-- src/qs8-rsum/gen/qs8-rsum-ssse3-u64-acc2.c | 3 +-- src/qs8-rsum/gen/qs8-rsum-ssse3-u64-acc4.c | 3 +-- src/qs8-rsum/gen/qs8-rsum-ssse3-u64.c | 3 +-- src/qs8-rsum/ssse3.c.in | 3 +-- src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c16.c | 2 +- src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c32.c | 2 +- src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c64.c | 2 +- src/qu8-rdsum/ssse3.c.in | 2 +- src/qu8-rsum/gen/qu8-rsum-sse2-u16.c | 2 +- src/qu8-rsum/gen/qu8-rsum-sse2-u32-acc2.c | 2 +- src/qu8-rsum/gen/qu8-rsum-sse2-u32.c | 2 +- src/qu8-rsum/gen/qu8-rsum-sse2-u64-acc2.c | 2 +- src/qu8-rsum/gen/qu8-rsum-sse2-u64-acc4.c | 2 +- src/qu8-rsum/gen/qu8-rsum-sse2-u64.c | 2 +- src/qu8-rsum/sse2.c.in | 2 +- 22 files changed, 23 insertions(+), 35 deletions(-) diff --git a/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-sse41-c16.c b/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-sse41-c16.c index 079536ee247..7fc4fafc8ad 100644 --- a/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-sse41-c16.c +++ b/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-sse41-c16.c @@ -6,11 +6,10 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
- #include #include -#include +#include #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" diff --git a/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-sse41-c32.c b/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-sse41-c32.c index e786c06d2f0..b5d18511559 100644 --- a/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-sse41-c32.c +++ b/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-sse41-c32.c @@ -6,11 +6,10 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - #include #include -#include +#include #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" diff --git a/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-sse41-c64.c b/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-sse41-c64.c index 91ee454fd2c..9f0f95d396a 100644 --- a/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-sse41-c64.c +++ b/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-sse41-c64.c @@ -6,11 +6,10 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - #include #include -#include +#include #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" diff --git a/src/qs8-rdsum/sse41.c.in b/src/qs8-rdsum/sse41.c.in index 8890db790a0..0ed6b959224 100644 --- a/src/qs8-rdsum/sse41.c.in +++ b/src/qs8-rdsum/sse41.c.in @@ -2,12 +2,10 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - -$import math #include #include -#include +#include #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" @@ -47,7 +45,7 @@ void xnn_qs8_rdsum_ukernel_${ACCUMULATORS}p${ACCUMULATORS}x__sse41_c${CHANNELS}( // 256 int8s may be summed into an int16 before overflowing // To prevent handling the tails of the inner 256 loop, we round 256 down to // the nearest integer multiple of ACCUMULATORS. 
- $OVERFLOW = math.floor(256 / ACCUMULATORS) * ACCUMULATORS + $OVERFLOW = (256 // ACCUMULATORS) * ACCUMULATORS int r = rows; while (r > 0) { $for C in range(0, CHANNELS, 8): diff --git a/src/qs8-rsum/gen/qs8-rsum-ssse3-u16.c b/src/qs8-rsum/gen/qs8-rsum-ssse3-u16.c index a7bb97b17bd..519550404b5 100644 --- a/src/qs8-rsum/gen/qs8-rsum-ssse3-u16.c +++ b/src/qs8-rsum/gen/qs8-rsum-ssse3-u16.c @@ -6,10 +6,9 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - #include -#include +#include #include "xnnpack/common.h" #include "xnnpack/reduce.h" diff --git a/src/qs8-rsum/gen/qs8-rsum-ssse3-u32-acc2.c b/src/qs8-rsum/gen/qs8-rsum-ssse3-u32-acc2.c index 402d9c20d5a..2e681aee28e 100644 --- a/src/qs8-rsum/gen/qs8-rsum-ssse3-u32-acc2.c +++ b/src/qs8-rsum/gen/qs8-rsum-ssse3-u32-acc2.c @@ -6,10 +6,9 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - #include -#include +#include #include "xnnpack/common.h" #include "xnnpack/reduce.h" diff --git a/src/qs8-rsum/gen/qs8-rsum-ssse3-u32.c b/src/qs8-rsum/gen/qs8-rsum-ssse3-u32.c index d8427483911..219c7a069e5 100644 --- a/src/qs8-rsum/gen/qs8-rsum-ssse3-u32.c +++ b/src/qs8-rsum/gen/qs8-rsum-ssse3-u32.c @@ -6,10 +6,9 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - #include -#include +#include #include "xnnpack/common.h" #include "xnnpack/reduce.h" diff --git a/src/qs8-rsum/gen/qs8-rsum-ssse3-u64-acc2.c b/src/qs8-rsum/gen/qs8-rsum-ssse3-u64-acc2.c index f7f2e318557..8fe5f2400a7 100644 --- a/src/qs8-rsum/gen/qs8-rsum-ssse3-u64-acc2.c +++ b/src/qs8-rsum/gen/qs8-rsum-ssse3-u64-acc2.c @@ -6,10 +6,9 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
- #include -#include +#include #include "xnnpack/common.h" #include "xnnpack/reduce.h" diff --git a/src/qs8-rsum/gen/qs8-rsum-ssse3-u64-acc4.c b/src/qs8-rsum/gen/qs8-rsum-ssse3-u64-acc4.c index fb5474ed18d..8b453ebb71e 100644 --- a/src/qs8-rsum/gen/qs8-rsum-ssse3-u64-acc4.c +++ b/src/qs8-rsum/gen/qs8-rsum-ssse3-u64-acc4.c @@ -6,10 +6,9 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - #include -#include +#include #include "xnnpack/common.h" #include "xnnpack/reduce.h" diff --git a/src/qs8-rsum/gen/qs8-rsum-ssse3-u64.c b/src/qs8-rsum/gen/qs8-rsum-ssse3-u64.c index 9515b7e9def..f9d7e791fdb 100644 --- a/src/qs8-rsum/gen/qs8-rsum-ssse3-u64.c +++ b/src/qs8-rsum/gen/qs8-rsum-ssse3-u64.c @@ -6,10 +6,9 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - #include -#include +#include #include "xnnpack/common.h" #include "xnnpack/reduce.h" diff --git a/src/qs8-rsum/ssse3.c.in b/src/qs8-rsum/ssse3.c.in index a66086a2b6d..dc3de62ea1a 100644 --- a/src/qs8-rsum/ssse3.c.in +++ b/src/qs8-rsum/ssse3.c.in @@ -6,10 +6,9 @@ $assert CHANNEL_TILE % 16 == 0 $assert CHANNEL_TILE >= 16 $SIMD_TILE = CHANNEL_TILE // 16 $assert ACCUMULATORS <= SIMD_TILE - #include -#include +#include #include "xnnpack/common.h" #include "xnnpack/reduce.h" diff --git a/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c16.c b/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c16.c index 4cf7b3762d1..fcfac6d64a2 100644 --- a/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c16.c +++ b/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c16.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" diff --git a/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c32.c b/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c32.c index 7796d48801a..ebf24777ec0 100644 --- a/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c32.c +++ 
b/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c32.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" diff --git a/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c64.c b/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c64.c index 08ccf3eb01f..1485e82f3b4 100644 --- a/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c64.c +++ b/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c64.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" diff --git a/src/qu8-rdsum/ssse3.c.in b/src/qu8-rdsum/ssse3.c.in index eb3f3f0eef4..700b1a249be 100644 --- a/src/qu8-rdsum/ssse3.c.in +++ b/src/qu8-rdsum/ssse3.c.in @@ -7,7 +7,7 @@ $import math #include #include -#include +#include #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" diff --git a/src/qu8-rsum/gen/qu8-rsum-sse2-u16.c b/src/qu8-rsum/gen/qu8-rsum-sse2-u16.c index 9676bbf780d..4090ac79816 100644 --- a/src/qu8-rsum/gen/qu8-rsum-sse2-u16.c +++ b/src/qu8-rsum/gen/qu8-rsum-sse2-u16.c @@ -8,7 +8,7 @@ // LICENSE file in the root directory of this source tree. #include -#include +#include #include "xnnpack/common.h" #include "xnnpack/reduce.h" diff --git a/src/qu8-rsum/gen/qu8-rsum-sse2-u32-acc2.c b/src/qu8-rsum/gen/qu8-rsum-sse2-u32-acc2.c index 04b8ddefe36..49dfe57b80e 100644 --- a/src/qu8-rsum/gen/qu8-rsum-sse2-u32-acc2.c +++ b/src/qu8-rsum/gen/qu8-rsum-sse2-u32-acc2.c @@ -8,7 +8,7 @@ // LICENSE file in the root directory of this source tree. #include -#include +#include #include "xnnpack/common.h" #include "xnnpack/reduce.h" diff --git a/src/qu8-rsum/gen/qu8-rsum-sse2-u32.c b/src/qu8-rsum/gen/qu8-rsum-sse2-u32.c index 486481f86e7..e0019731a1b 100644 --- a/src/qu8-rsum/gen/qu8-rsum-sse2-u32.c +++ b/src/qu8-rsum/gen/qu8-rsum-sse2-u32.c @@ -8,7 +8,7 @@ // LICENSE file in the root directory of this source tree. 
#include -#include +#include #include "xnnpack/common.h" #include "xnnpack/reduce.h" diff --git a/src/qu8-rsum/gen/qu8-rsum-sse2-u64-acc2.c b/src/qu8-rsum/gen/qu8-rsum-sse2-u64-acc2.c index 44f6f2e3dc0..4e427d2627b 100644 --- a/src/qu8-rsum/gen/qu8-rsum-sse2-u64-acc2.c +++ b/src/qu8-rsum/gen/qu8-rsum-sse2-u64-acc2.c @@ -8,7 +8,7 @@ // LICENSE file in the root directory of this source tree. #include -#include +#include #include "xnnpack/common.h" #include "xnnpack/reduce.h" diff --git a/src/qu8-rsum/gen/qu8-rsum-sse2-u64-acc4.c b/src/qu8-rsum/gen/qu8-rsum-sse2-u64-acc4.c index 9f2a72e86c3..299943514d2 100644 --- a/src/qu8-rsum/gen/qu8-rsum-sse2-u64-acc4.c +++ b/src/qu8-rsum/gen/qu8-rsum-sse2-u64-acc4.c @@ -8,7 +8,7 @@ // LICENSE file in the root directory of this source tree. #include -#include +#include #include "xnnpack/common.h" #include "xnnpack/reduce.h" diff --git a/src/qu8-rsum/gen/qu8-rsum-sse2-u64.c b/src/qu8-rsum/gen/qu8-rsum-sse2-u64.c index 8affb450f0b..1f36be691d2 100644 --- a/src/qu8-rsum/gen/qu8-rsum-sse2-u64.c +++ b/src/qu8-rsum/gen/qu8-rsum-sse2-u64.c @@ -8,7 +8,7 @@ // LICENSE file in the root directory of this source tree. 
#include -#include +#include #include "xnnpack/common.h" #include "xnnpack/reduce.h" diff --git a/src/qu8-rsum/sse2.c.in b/src/qu8-rsum/sse2.c.in index f1fb76152dd..3ddcd7421b5 100644 --- a/src/qu8-rsum/sse2.c.in +++ b/src/qu8-rsum/sse2.c.in @@ -8,7 +8,7 @@ $SIMD_TILE = CHANNEL_TILE // 16 $assert ACCUMULATORS <= SIMD_TILE #include -#include +#include #include "xnnpack/common.h" #include "xnnpack/reduce.h" From 42eca640dff21522e5aebc92833c98ac96625f0b Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Thu, 19 Sep 2024 02:13:20 -0700 Subject: [PATCH 02/50] RDSUM - Replace math.floor with // PiperOrigin-RevId: 676326045 --- src/qs8-rdsum/avx2.c.in | 4 +--- src/qs8-rdsum/avx512skx.c.in | 4 +--- src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-avx2-c32.c | 1 - src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-avx2-c64.c | 1 - src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-avx512skx-c128.c | 1 - src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-avx512skx-c64.c | 1 - src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-neon-c16.c | 1 - src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-neon-c32.c | 1 - src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-neon-c64.c | 1 - src/qs8-rdsum/neon.c.in | 4 +--- src/qu8-rdsum/gen/qu8-rdsum-7p7x-neon-u16.c | 1 - src/qu8-rdsum/gen/qu8-rdsum-7p7x-neon-u32.c | 1 - src/qu8-rdsum/gen/qu8-rdsum-7p7x-neon-u64.c | 1 - src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c16.c | 1 - src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c32.c | 1 - src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c64.c | 1 - src/qu8-rdsum/neon.c.in | 4 +--- src/qu8-rdsum/ssse3.c.in | 4 +--- 18 files changed, 5 insertions(+), 28 deletions(-) diff --git a/src/qs8-rdsum/avx2.c.in b/src/qs8-rdsum/avx2.c.in index cd6553a2d2c..e93d4d15e18 100644 --- a/src/qs8-rdsum/avx2.c.in +++ b/src/qs8-rdsum/avx2.c.in @@ -2,8 +2,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
- -$import math #include #include @@ -42,7 +40,7 @@ void xnn_qs8_rdsum_ukernel_${ACCUMULATORS}p${ACCUMULATORS}x__avx2_c${CHANNELS}( // 256 int8s may be summed into an int16 before overflowing // To prevent handling the tails of the inner 256 loop, we round 256 down to // the nearest integer multiple of ACCUMULATORS. - $OVERFLOW = math.floor(256 / ACCUMULATORS) * ACCUMULATORS + $OVERFLOW = (256 // ACCUMULATORS) * ACCUMULATORS int r = rows; while (r > 0) { $for C in range(0, CHANNELS, 16): diff --git a/src/qs8-rdsum/avx512skx.c.in b/src/qs8-rdsum/avx512skx.c.in index e8b66178fc5..e8fa1dfe06e 100644 --- a/src/qs8-rdsum/avx512skx.c.in +++ b/src/qs8-rdsum/avx512skx.c.in @@ -2,8 +2,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - -$import math #include #include @@ -42,7 +40,7 @@ void xnn_qs8_rdsum_ukernel_${ACCUMULATORS}p${ACCUMULATORS}x__avx512skx_c${CHANNE // 256 int8s may be summed into an int16 before overflowing // To prevent handling the tails of the inner 256 loop, we round 256 down to // the nearest integer multiple of ACCUMULATORS. - $OVERFLOW = math.floor(256 / ACCUMULATORS) * ACCUMULATORS + $OVERFLOW = (256 // ACCUMULATORS) * ACCUMULATORS int num_batches = floor((rows + ${OVERFLOW - 1}) / ${OVERFLOW}); int r = rows; for (; num_batches > 0; --num_batches) { diff --git a/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-avx2-c32.c b/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-avx2-c32.c index d53e99e2390..b2000f765ee 100644 --- a/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-avx2-c32.c +++ b/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-avx2-c32.c @@ -6,7 +6,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
- #include #include diff --git a/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-avx2-c64.c b/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-avx2-c64.c index 54ba2f50545..4b7de2fbb58 100644 --- a/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-avx2-c64.c +++ b/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-avx2-c64.c @@ -6,7 +6,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - #include #include diff --git a/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-avx512skx-c128.c b/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-avx512skx-c128.c index 102ed303d63..2952e13c530 100644 --- a/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-avx512skx-c128.c +++ b/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-avx512skx-c128.c @@ -6,7 +6,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - #include #include diff --git a/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-avx512skx-c64.c b/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-avx512skx-c64.c index 7c74082b011..b48b84cffe4 100644 --- a/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-avx512skx-c64.c +++ b/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-avx512skx-c64.c @@ -6,7 +6,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - #include #include diff --git a/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-neon-c16.c b/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-neon-c16.c index 79b48bbb626..0f70c9d83f8 100644 --- a/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-neon-c16.c +++ b/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-neon-c16.c @@ -6,7 +6,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
- #include #include diff --git a/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-neon-c32.c b/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-neon-c32.c index 20a2d2a02c0..485824a93db 100644 --- a/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-neon-c32.c +++ b/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-neon-c32.c @@ -6,7 +6,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - #include #include diff --git a/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-neon-c64.c b/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-neon-c64.c index b1616a20e30..eaa86d8b4da 100644 --- a/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-neon-c64.c +++ b/src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-neon-c64.c @@ -6,7 +6,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - #include #include diff --git a/src/qs8-rdsum/neon.c.in b/src/qs8-rdsum/neon.c.in index c76ef0c5a80..3e03eae7287 100644 --- a/src/qs8-rdsum/neon.c.in +++ b/src/qs8-rdsum/neon.c.in @@ -2,8 +2,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - -$import math #include #include @@ -42,7 +40,7 @@ void xnn_qs8_rdsum_ukernel_${ACCUMULATORS}p${ACCUMULATORS}x__neon_c${CHANNELS}( // 256 int8s may be summed into an int16 before overflowing // To prevent handling the tails of the inner 256 loop, we round 256 down to // the nearest integer multiple of ACCUMULATORS. 
- $OVERFLOW = math.floor(256 / ACCUMULATORS) * ACCUMULATORS + $OVERFLOW = (256 // ACCUMULATORS) * ACCUMULATORS int r = rows; while (r > 0) { $for C in range(0, CHANNELS, 8): diff --git a/src/qu8-rdsum/gen/qu8-rdsum-7p7x-neon-u16.c b/src/qu8-rdsum/gen/qu8-rdsum-7p7x-neon-u16.c index db328ded504..3276cf882b3 100644 --- a/src/qu8-rdsum/gen/qu8-rdsum-7p7x-neon-u16.c +++ b/src/qu8-rdsum/gen/qu8-rdsum-7p7x-neon-u16.c @@ -6,7 +6,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - #include #include diff --git a/src/qu8-rdsum/gen/qu8-rdsum-7p7x-neon-u32.c b/src/qu8-rdsum/gen/qu8-rdsum-7p7x-neon-u32.c index 35cb0e3f8e6..42000ca6fa2 100644 --- a/src/qu8-rdsum/gen/qu8-rdsum-7p7x-neon-u32.c +++ b/src/qu8-rdsum/gen/qu8-rdsum-7p7x-neon-u32.c @@ -6,7 +6,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - #include #include diff --git a/src/qu8-rdsum/gen/qu8-rdsum-7p7x-neon-u64.c b/src/qu8-rdsum/gen/qu8-rdsum-7p7x-neon-u64.c index 5328b34bed8..f3ac9d7ff80 100644 --- a/src/qu8-rdsum/gen/qu8-rdsum-7p7x-neon-u64.c +++ b/src/qu8-rdsum/gen/qu8-rdsum-7p7x-neon-u64.c @@ -6,7 +6,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - #include #include diff --git a/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c16.c b/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c16.c index fcfac6d64a2..6e2dad59f06 100644 --- a/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c16.c +++ b/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c16.c @@ -6,7 +6,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
- #include #include diff --git a/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c32.c b/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c32.c index ebf24777ec0..954f27494b8 100644 --- a/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c32.c +++ b/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c32.c @@ -6,7 +6,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - #include #include diff --git a/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c64.c b/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c64.c index 1485e82f3b4..269af7347a6 100644 --- a/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c64.c +++ b/src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c64.c @@ -6,7 +6,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - #include #include diff --git a/src/qu8-rdsum/neon.c.in b/src/qu8-rdsum/neon.c.in index 388877ca740..5de319f1297 100644 --- a/src/qu8-rdsum/neon.c.in +++ b/src/qu8-rdsum/neon.c.in @@ -2,8 +2,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. - -$import math #include #include @@ -40,7 +38,7 @@ void xnn_qu8_rdsum_ukernel_${ACCUMULATORS}p${ACCUMULATORS}x__neon_u${CHANNELS}( // 256 uint8s may be summed into an uint16 before overflowing // To prevent handling the tails of the inner 256 loop, we round 256 down to // the nearest integer multiple of ACCUMULATORS. - $OVERFLOW = math.floor(256 / ACCUMULATORS) * ACCUMULATORS + $OVERFLOW = (256 // ACCUMULATORS) * ACCUMULATORS int r = rows; while (r > 0) { $for C in range(0, CHANNELS, 8): diff --git a/src/qu8-rdsum/ssse3.c.in b/src/qu8-rdsum/ssse3.c.in index 700b1a249be..ec984023889 100644 --- a/src/qu8-rdsum/ssse3.c.in +++ b/src/qu8-rdsum/ssse3.c.in @@ -2,8 +2,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
- -$import math #include #include @@ -42,7 +40,7 @@ void xnn_qu8_rdsum_ukernel_${ACCUMULATORS}p${ACCUMULATORS}x__ssse3_c${CHANNELS}( // 256 uint8s may be summed into an uint16 before overflowing // To prevent handling the tails of the inner 256 loop, we round 256 down to // the nearest integer multiple of ACCUMULATORS. - $OVERFLOW = math.floor(256 / ACCUMULATORS) * ACCUMULATORS + $OVERFLOW = (256 // ACCUMULATORS) * ACCUMULATORS int r = rows; __m128i vone = _mm_set1_epi8(1); From 3014fb625c73f3b1ce1f6d3e45f1e216f9cb7105 Mon Sep 17 00:00:00 2001 From: Alan Kelly Date: Thu, 19 Sep 2024 05:08:42 -0700 Subject: [PATCH 03/50] Do not enable avxvnniint8 by default. Only the very latest compilers support this. PiperOrigin-RevId: 676369522 --- BUILD.bazel | 1 + build_config/BUILD.bazel | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/BUILD.bazel b/BUILD.bazel index 0b1b97368a8..04ccf87588c 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -1797,6 +1797,7 @@ alias( ":xnn_enable_avxvnniint8_explicit_true": ":xnn_enable_avxvnniint8_explicit_true", ":xnn_enable_avxvnniint8_explicit_false": ":xnn_enable_avxvnniint8_explicit_true", "//build_config:ios_x86_64": ":xnn_enable_avxvnniint8_explicit_true", + "//build_config:ios": ":xnn_enable_avxvnniint8_explicit_true", "//conditions:default": ":avxvnniint8_enabled_by_default", }), ) diff --git a/build_config/BUILD.bazel b/build_config/BUILD.bazel index 22490743554..6352a43acdd 100644 --- a/build_config/BUILD.bazel +++ b/build_config/BUILD.bazel @@ -209,6 +209,13 @@ config_setting( }, ) +config_setting( + name = "ios", + values = { + "apple_platform_type": "ios", + }, +) + config_setting( name = "watchos_arm64_32", values = { From 2bb88b9e451895373afbe71f64289785cfb8c3d6 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Thu, 19 Sep 2024 10:36:28 -0700 Subject: [PATCH 04/50] Remove unary benchmark generators These should have been removed in a previous change that replaced these benchmarks. 
PiperOrigin-RevId: 676472974 --- scripts/generate-benchmarks.sh | 40 ------ tools/generate-vunary-benchmark.py | 197 ----------------------------- 2 files changed, 237 deletions(-) delete mode 100755 tools/generate-vunary-benchmark.py diff --git a/scripts/generate-benchmarks.sh b/scripts/generate-benchmarks.sh index ccf5d61e578..f5cb9eaa85c 100755 --- a/scripts/generate-benchmarks.sh +++ b/scripts/generate-benchmarks.sh @@ -4,46 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -### Tests for VUnary micro-kernels -tools/generate-vunary-benchmark.py --ukernel f16-vabs --output bench/f16-vabs.cc & -tools/generate-vunary-benchmark.py --ukernel f16-vclamp --output bench/f16-vclamp.cc & -tools/generate-vunary-benchmark.py --ukernel f16-velu --output bench/f16-velu.cc & -tools/generate-vunary-benchmark.py --ukernel f16-vneg --output bench/f16-vneg.cc & -tools/generate-vunary-benchmark.py --ukernel f16-vrndd --output bench/f16-vrndd.cc & -tools/generate-vunary-benchmark.py --ukernel f16-vrndne --output bench/f16-vrndne.cc & -tools/generate-vunary-benchmark.py --ukernel f16-vrndu --output bench/f16-vrndu.cc & -tools/generate-vunary-benchmark.py --ukernel f16-vrndz --output bench/f16-vrndz.cc & -tools/generate-vunary-benchmark.py --ukernel f16-vrsqrt --output bench/f16-vrsqrt.cc & -tools/generate-vunary-benchmark.py --ukernel f16-vsigmoid --output bench/f16-vsigmoid.cc & -tools/generate-vunary-benchmark.py --ukernel f16-vsqr --output bench/f16-vsqr.cc & -tools/generate-vunary-benchmark.py --ukernel f16-vsqrt --output bench/f16-vsqrt.cc & -tools/generate-vunary-benchmark.py --ukernel f16-vtanh --output bench/f16-vtanh.cc & - -tools/generate-vunary-benchmark.py --ukernel f32-vabs --output bench/f32-vabs.cc & -tools/generate-vunary-benchmark.py --ukernel f32-vclamp --output bench/f32-vclamp.cc & -tools/generate-vunary-benchmark.py --ukernel f32-velu --output bench/f32-velu.cc & 
-tools/generate-vunary-benchmark.py --ukernel f32-vgelu --output bench/f32-vgelu.cc & -tools/generate-vunary-benchmark.py --ukernel f32-vneg --output bench/f32-vneg.cc & -tools/generate-vunary-benchmark.py --ukernel f32-vrelu --output bench/f32-vrelu.cc & -tools/generate-vunary-benchmark.py --ukernel f32-vrndd --output bench/f32-vrndd.cc & -tools/generate-vunary-benchmark.py --ukernel f32-vrndne --output bench/f32-vrndne.cc & -tools/generate-vunary-benchmark.py --ukernel f32-vrndu --output bench/f32-vrndu.cc & -tools/generate-vunary-benchmark.py --ukernel f32-vrndz --output bench/f32-vrndz.cc & -tools/generate-vunary-benchmark.py --ukernel f32-vrsqrt --output bench/f32-vrsqrt.cc & -tools/generate-vunary-benchmark.py --ukernel f32-vsigmoid --output bench/f32-vsigmoid.cc & -tools/generate-vunary-benchmark.py --ukernel f32-vsqr --output bench/f32-vsqr.cc & -tools/generate-vunary-benchmark.py --ukernel f32-vsqrt --output bench/f32-vsqrt.cc & -tools/generate-vunary-benchmark.py --ukernel f32-vtanh --output bench/f32-vtanh.cc & -tools/generate-vunary-benchmark.py --ukernel f32-vlog --output bench/f32-vlog.cc & - -### Tests for VLRelu micro-kernels -tools/generate-vunary-benchmark.py --ukernel f16-vlrelu --output bench/f16-vlrelu.cc & -tools/generate-vunary-benchmark.py --ukernel f32-vlrelu --output bench/f32-vlrelu.cc & - -### Tests for VHSwish micro-kernels -tools/generate-vunary-benchmark.py --ukernel f16-vhswish --output bench/f16-vhswish.cc & -tools/generate-vunary-benchmark.py --ukernel f32-vhswish --output bench/f32-vhswish.cc & - ### Tests for Rsum micro-kernels tools/generate-rdsum-benchmark.py --spec test/f32-rdsum.yaml --output bench/f32-rdsum.cc & tools/generate-rdsum-benchmark.py --spec test/f16-f32acc-rdsum.yaml --output bench/f16-f32acc-rdsum.cc & diff --git a/tools/generate-vunary-benchmark.py b/tools/generate-vunary-benchmark.py deleted file mode 100755 index aabea34b5b3..00000000000 --- a/tools/generate-vunary-benchmark.py +++ /dev/null @@ -1,197 +0,0 @@ 
-#!/usr/bin/env python -# Copyright 2024 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -import argparse -import codecs -import os -import re -import sys -import yaml - -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -import xngen -import xnncommon - - -parser = argparse.ArgumentParser( - description="Vector unary operation microkernel benchmark generator" -) -parser.add_argument( - "-k", - "--ukernel", - required=True, - help="Microkernel", -) -parser.add_argument( - "-o", - "--output", - metavar="FILE", - required=True, - help="Output (C++ source) file", -) -parser.set_defaults(defines=list()) - -BENCHMARK_TEMPLATE = """\ -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, - datatype, params_type, init_params) -BENCHMARK_CAPTURE(${DATATYPE}_v${OP_NAME}, ukernel, arch_flags, ukernel, init_params) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime();""" - -BENCHMARK_FUNCTION_TEMPLATE = """\ -$if OP_NAME.startswith("rnd"): - void ${DATATYPE}_v${OP_NAME}(benchmark::State& state, uint64_t arch_flags, xnn_${DATATYPE}_vround_ukernel_fn ukernel, - xnn_init_${DATATYPE}_rnd_params_fn init_params = nullptr) { - ${DATATYPE}_vunary_benchmark( - state, ukernel, - init_params, - arch_flags, - /*range_min=*/${RANGE_MIN}, - /*range_max=*/${RANGE_MAX}); - } -$elif OP_NAME == "clamp": - void ${DATATYPE}_v${OP_NAME}(benchmark::State& state, uint64_t arch_flags, xnn_${DATATYPE}_v${OP_NAME}_ukernel_fn ukernel, - xnn_init_${DATATYPE}_minmax_params_fn init_params = nullptr) { - ${DATATYPE}_vunary_benchmark( - state, ukernel, - [init_params](xnn_${DATATYPE}_minmax_params* params) -> size_t { - $if DATATYPE == "f16": - init_params(params, -1.0f, 1.0f); - $else: - init_params(params, -INFINITY, INFINITY); - return sizeof(*params); - }, - arch_flags, - /*range_min=*/${RANGE_MIN}, - /*range_max=*/${RANGE_MAX}); - } -$elif OP_NAME 
in ("abs", "gelu", "log", "neg", "sqr"): - void ${DATATYPE}_v${OP_NAME}(benchmark::State& state, uint64_t arch_flags, xnn_${DATATYPE}_v${OP_NAME}_ukernel_fn ukernel, - xnn_init_${DATATYPE}_default_params_fn init_params = nullptr) { - ${DATATYPE}_vunary_benchmark( - state, ukernel, - init_params, - arch_flags, - /*range_min=*/${RANGE_MIN}, - /*range_max=*/${RANGE_MAX}); - } -$else: - void ${DATATYPE}_v${OP_NAME}(benchmark::State& state, uint64_t arch_flags, xnn_${DATATYPE}_v${OP_NAME}_ukernel_fn ukernel, - xnn_init_${DATATYPE}_${OP_NAME}_params_fn init_params = nullptr) { - ${DATATYPE}_vunary_benchmark( - state, ukernel, - $if OP_NAME == "lrelu": - [init_params](xnn_${DATATYPE}_${OP_NAME}_params* params) -> size_t { - init_params(params, 0.01f); - return sizeof(*params); - }, - $elif OP_NAME == "elu": - $if DATATYPE == "f16": - [init_params](xnn_${DATATYPE}_${OP_NAME}_params* params) -> size_t { - init_params(params, - /*prescale=*/1.0f, - /*alpha=*/1.0f, - /*beta=*/1.0f); - return sizeof(*params); - }, - $else: - [init_params](xnn_${DATATYPE}_${OP_NAME}_params* params) -> size_t { - init_params(params, /*prescale=*/1.0f, /*alpha=*/1.0f, /*beta=*/1.0f); - return sizeof(*params); - }, - $else: - init_params, - arch_flags, - /*range_min=*/${RANGE_MIN}, - /*range_max=*/${RANGE_MAX}); - } - -""" - -RANGE_FOR_OP_NAME = { - "f16_tanh": (-5.0, 5.0), - "f32_clamp": (0.0, 10.0), - "f32_elu": (-20.0, 10.0), - "f16_elu": (-9.0, 9.0), - "f32_log": (0.0, 10.0), - "f32_lrelu": (-5.0, 5.0), - "f16_lrelu": (-5.0, 5.0), - "f32_rsqrt": (1e-5, 10.0), - "f16_rsqrt": (1e-5, 10.0), - "f32_sqrt": (0.0, 10.0), - "f16_sqrt": (0.0, 1.0), -} - -def main(args): - options = parser.parse_args(args) - - # Extract the datatype and op from the file name. - datatype, op_name = options.ukernel.split('-') - - benchmarks = """\ -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
-// -// Auto-generated file. Do not edit! -// Microkernel: {microkernel} -// Generator: {generator} - -#include -#include - -#include -#include "bench/{datatype}-vunary-benchmark.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" - -""".format(microkernel=options.ukernel, generator=sys.argv[0], datatype=datatype) - - op_name = op_name[1:] - - # Create the benchmark wrapper function. - range_min, range_max = RANGE_FOR_OP_NAME.get( - f"{datatype}_{op_name}", (-10.0, 10.0) - ) - benchmarks += xngen.preprocess( - BENCHMARK_FUNCTION_TEMPLATE, - { - "DATATYPE": datatype, - "OP_NAME": op_name, - "RANGE_MIN": range_min, - "RANGE_MAX": range_max, - }, - ) - - benchmarks += xnncommon.make_multiline_macro(xngen.preprocess( - BENCHMARK_TEMPLATE, - { - "OP_NAME": op_name, - "DATATYPE": datatype, - }, - )) - - folder = options.ukernel - if "rnd" in folder: - folder = folder[0:8] - - benchmarks += f'#include "{xnncommon.xnnpack_src()}/{folder}/{options.ukernel}.h"\n' - benchmarks += "#undef XNN_UKERNEL_WITH_PARAMS\n" - - # Footer with `main` function. - benchmarks += "\n\n" + """\ -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif -""" - - # Finally, write the file to disk. 
- xnncommon.overwrite_if_changed(options.output, benchmarks) - - -if __name__ == "__main__": - main(sys.argv[1:]) From c8015fe38750b382b29d7c669450d4bb64019ceb Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Thu, 19 Sep 2024 10:56:27 -0700 Subject: [PATCH 05/50] Remove audio DSP related microkernels PiperOrigin-RevId: 676480843 --- BUILD.bazel | 7 - CMakeLists.txt | 19 - bench/BUILD.bazel | 10 - bench/cs16-bfly4.cc | 116 -- bench/cs16-fftr.cc | 73 -- bench/cs16-vsquareabs.cc | 127 -- bench/i16-vlshift.cc | 94 -- bench/s16-rmaxabs.cc | 101 -- bench/s16-window.cc | 144 -- bench/u32-filterbank-accumulate.cc | 86 -- bench/u32-filterbank-subtract.cc | 65 - bench/u32-vlog.cc | 75 -- bench/u64-u32-vsqrtshift.cc | 67 - cmake/gen/aarch32_microkernels.cmake | 10 +- cmake/gen/hexagon_microkernels.cmake | 8 +- cmake/gen/neon_microkernels.cmake | 31 - cmake/gen/scalar_microkernels.cmake | 31 - gen/aarch32_microkernels.bzl | 8 - gen/hexagon_microkernels.bzl | 6 - gen/neon_microkernels.bzl | 31 - gen/scalar_microkernels.bzl | 31 - scripts/generate-cs16-bfly4.sh | 12 - scripts/generate-cs16-fftr.sh | 12 - scripts/generate-cs16-vsquareabs.sh | 27 - scripts/generate-i16-vlshift.sh | 19 - scripts/generate-s16-rmaxabs.sh | 19 - scripts/generate-s16-window.sh | 29 - scripts/generate-tests.sh | 27 - scripts/generate-u32-filterbank-accumulate.sh | 14 - scripts/generate-u32-vlog.sh | 13 - src/cs16-bfly4/cs16-bfly4-neon-x1.c | 138 -- src/cs16-bfly4/cs16-bfly4-neon-x4.c | 129 -- .../cs16-bfly4-samples1-asm-aarch32-neon-x1.S | 74 -- .../cs16-bfly4-samples1-asm-aarch32-neon-x2.S | 102 -- .../cs16-bfly4-samples1-asm-aarch32-neon-x4.S | 110 -- src/cs16-bfly4/cs16-bfly4-samples1-neon.c | 57 - src/cs16-bfly4/cs16-bfly4-samples1-scalar.c | 75 -- src/cs16-bfly4/cs16-bfly4-samples4-neon.c | 106 -- src/cs16-bfly4/cs16-bfly4-samples4-scalar.c | 269 ---- src/cs16-bfly4/gen/cs16-bfly4-scalar-x1.c | 178 --- src/cs16-bfly4/gen/cs16-bfly4-scalar-x2.c | 300 ----- 
src/cs16-bfly4/gen/cs16-bfly4-scalar-x4.c | 410 ------ src/cs16-bfly4/scalar.c.in | 269 ---- src/cs16-fftr/cs16-fftr-asm-aarch32-neon-x1.S | 93 -- src/cs16-fftr/cs16-fftr-asm-aarch32-neon-x4.S | 101 -- src/cs16-fftr/cs16-fftr-neon-x4.c | 85 -- src/cs16-fftr/gen/cs16-fftr-scalar-x1.c | 75 -- src/cs16-fftr/gen/cs16-fftr-scalar-x2.c | 123 -- src/cs16-fftr/gen/cs16-fftr-scalar-x4.c | 163 --- src/cs16-fftr/scalar.c.in | 109 -- .../gen/cs16-vsquareabs-hexagon-x10.c | 64 - .../gen/cs16-vsquareabs-hexagon-x12.c | 67 - .../gen/cs16-vsquareabs-hexagon-x2.c | 45 - .../gen/cs16-vsquareabs-hexagon-x4.c | 55 - .../gen/cs16-vsquareabs-hexagon-x6.c | 58 - .../gen/cs16-vsquareabs-hexagon-x8.c | 61 - .../gen/cs16-vsquareabs-neon-mlal-ld128-x12.c | 65 - .../gen/cs16-vsquareabs-neon-mlal-ld128-x16.c | 69 - .../gen/cs16-vsquareabs-neon-mlal-ld128-x4.c | 48 - .../gen/cs16-vsquareabs-neon-mlal-ld128-x8.c | 61 - .../gen/cs16-vsquareabs-scalar-x1.c | 39 - .../gen/cs16-vsquareabs-scalar-x2.c | 56 - .../gen/cs16-vsquareabs-scalar-x3.c | 63 - .../gen/cs16-vsquareabs-scalar-x4.c | 68 - src/cs16-vsquareabs/hexagon.c.in | 55 - src/cs16-vsquareabs/neon.c.in | 61 - src/cs16-vsquareabs/scalar.c.in | 80 -- src/i16-vlshift/gen/i16-vlshift-neon-u16.c | 69 - src/i16-vlshift/gen/i16-vlshift-neon-u24.c | 72 - src/i16-vlshift/gen/i16-vlshift-neon-u32.c | 75 -- src/i16-vlshift/gen/i16-vlshift-neon-u8.c | 59 - src/i16-vlshift/gen/i16-vlshift-scalar-u1.c | 36 - src/i16-vlshift/gen/i16-vlshift-scalar-u2.c | 48 - src/i16-vlshift/gen/i16-vlshift-scalar-u3.c | 51 - src/i16-vlshift/gen/i16-vlshift-scalar-u4.c | 54 - src/i16-vlshift/neon.c.in | 69 - src/i16-vlshift/scalar.c.in | 46 - src/s16-rmaxabs/gen/s16-rmaxabs-neon-x16.c | 66 - src/s16-rmaxabs/gen/s16-rmaxabs-neon-x24.c | 71 - src/s16-rmaxabs/gen/s16-rmaxabs-neon-x32.c | 76 -- src/s16-rmaxabs/gen/s16-rmaxabs-neon-x8.c | 53 - src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x1.c | 37 - src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x2.c | 52 - 
src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x3.c | 59 - src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x4.c | 64 - src/s16-rmaxabs/neon.c.in | 71 - src/s16-rmaxabs/scalar.c.in | 72 - src/s16-window/gen/s16-window-neon-u16.c | 100 -- src/s16-window/gen/s16-window-neon-u24.c | 108 -- src/s16-window/gen/s16-window-neon-u32.c | 116 -- src/s16-window/gen/s16-window-neon-u8.c | 77 -- src/s16-window/gen/s16-window-scalar-u1.c | 47 - src/s16-window/gen/s16-window-scalar-u2.c | 73 -- src/s16-window/gen/s16-window-scalar-u3.c | 82 -- src/s16-window/gen/s16-window-scalar-u4.c | 89 -- .../gen/s16-window-shift12-neon-u16.c | 97 -- .../gen/s16-window-shift12-neon-u24.c | 105 -- .../gen/s16-window-shift12-neon-u32.c | 113 -- .../gen/s16-window-shift12-neon-u8.c | 74 -- .../gen/s16-window-shift15-neon-u16.c | 81 -- .../gen/s16-window-shift15-neon-u24.c | 85 -- .../gen/s16-window-shift15-neon-u32.c | 89 -- .../gen/s16-window-shift15-neon-u8.c | 68 - src/s16-window/neon.c.in | 135 -- src/s16-window/scalar.c.in | 96 -- .../gen/u32-filterbank-accumulate-neon-x1.c | 62 - .../gen/u32-filterbank-accumulate-neon-x2.c | 71 - .../gen/u32-filterbank-accumulate-scalar-x1.c | 70 - src/u32-filterbank-accumulate/neon.c.in | 77 -- src/u32-filterbank-accumulate/scalar.c.in | 67 - ...filterbank-accumulate-asm-aarch32-arm-x1.S | 92 -- ...ilterbank-accumulate-asm-aarch32-neon-x1.S | 85 -- ...ilterbank-accumulate-asm-aarch32-neon-x2.S | 80 -- .../u32-filterbank-subtract-scalar-x2.c | 68 - src/u32-vlog/gen/u32-vlog-scalar-x1.c | 42 - src/u32-vlog/gen/u32-vlog-scalar-x2.c | 61 - src/u32-vlog/gen/u32-vlog-scalar-x3.c | 67 - src/u32-vlog/gen/u32-vlog-scalar-x4.c | 73 -- src/u32-vlog/scalar.c.in | 56 - ...qrtshift-scalar-cvtu32-sqrt-cvtu32f64-u1.c | 65 - src/u64-u32-vsqrtshift/u64-u32-vsqrtshift.h | 28 - src/xnnpack/filterbank.h | 53 - src/xnnpack/rmaxabs.h | 37 - src/xnnpack/vlshift.h | 38 - src/xnnpack/vsquareabs.h | 44 - src/xnnpack/window.h | 50 - test/BUILD.bazel | 82 -- test/bfly4-microkernel-tester.h | 251 ---- 
test/cs16-bfly4.cc | 394 ------ test/cs16-bfly4.yaml | 22 - test/cs16-fftr.cc | 66 - test/cs16-fftr.yaml | 15 - test/cs16-vsquareabs.cc | 469 ------- test/cs16-vsquareabs.yaml | 24 - test/fftr-microkernel-tester.h | 170 --- ...filterbank-accumulate-microkernel-tester.h | 93 -- test/filterbank-subtract-microkernel-tester.h | 134 -- test/i16-vlshift.cc | 431 ------ test/i16-vlshift.yaml | 17 - test/rmaxabs-microkernel-tester.h | 75 -- test/s16-rmaxabs.cc | 271 ---- test/s16-rmaxabs.yaml | 17 - test/s16-window.cc | 1166 ----------------- test/s16-window.yaml | 25 - test/u32-filterbank-accumulate.cc | 123 -- test/u32-filterbank-accumulate.yaml | 17 - test/u32-filterbank-subtract.cc | 55 - test/u32-filterbank-subtract.yaml | 8 - test/u32-vlog.cc | 231 ---- test/u32-vlog.yaml | 11 - test/u64-u32-vsqrtshift.cc | 49 - test/vlshift-microkernel-tester.h | 99 -- test/vsquareabs-microkernel-tester.h | 76 -- test/window-microkernel-tester.h | 121 -- tools/generate-bfly4-test.py | 188 --- tools/generate-fftr-test.py | 112 -- tools/generate-filterbank-accumulate-test.py | 127 -- tools/generate-filterbank-subtract-test.py | 153 --- tools/generate-rmaxabs-test.py | 143 -- tools/generate-vlshift-test.py | 166 --- tools/generate-vsquareabs-test.py | 143 -- tools/generate-window-test.py | 228 ---- 162 files changed, 2 insertions(+), 15184 deletions(-) delete mode 100644 bench/cs16-bfly4.cc delete mode 100644 bench/cs16-fftr.cc delete mode 100644 bench/cs16-vsquareabs.cc delete mode 100644 bench/i16-vlshift.cc delete mode 100644 bench/s16-rmaxabs.cc delete mode 100644 bench/s16-window.cc delete mode 100644 bench/u32-filterbank-accumulate.cc delete mode 100644 bench/u32-filterbank-subtract.cc delete mode 100644 bench/u32-vlog.cc delete mode 100644 bench/u64-u32-vsqrtshift.cc delete mode 100755 scripts/generate-cs16-bfly4.sh delete mode 100755 scripts/generate-cs16-fftr.sh delete mode 100755 scripts/generate-cs16-vsquareabs.sh delete mode 100755 scripts/generate-i16-vlshift.sh delete 
mode 100755 scripts/generate-s16-rmaxabs.sh delete mode 100755 scripts/generate-s16-window.sh delete mode 100755 scripts/generate-u32-filterbank-accumulate.sh delete mode 100755 scripts/generate-u32-vlog.sh delete mode 100644 src/cs16-bfly4/cs16-bfly4-neon-x1.c delete mode 100644 src/cs16-bfly4/cs16-bfly4-neon-x4.c delete mode 100644 src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x1.S delete mode 100644 src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x2.S delete mode 100644 src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x4.S delete mode 100644 src/cs16-bfly4/cs16-bfly4-samples1-neon.c delete mode 100644 src/cs16-bfly4/cs16-bfly4-samples1-scalar.c delete mode 100644 src/cs16-bfly4/cs16-bfly4-samples4-neon.c delete mode 100644 src/cs16-bfly4/cs16-bfly4-samples4-scalar.c delete mode 100644 src/cs16-bfly4/gen/cs16-bfly4-scalar-x1.c delete mode 100644 src/cs16-bfly4/gen/cs16-bfly4-scalar-x2.c delete mode 100644 src/cs16-bfly4/gen/cs16-bfly4-scalar-x4.c delete mode 100644 src/cs16-bfly4/scalar.c.in delete mode 100644 src/cs16-fftr/cs16-fftr-asm-aarch32-neon-x1.S delete mode 100644 src/cs16-fftr/cs16-fftr-asm-aarch32-neon-x4.S delete mode 100644 src/cs16-fftr/cs16-fftr-neon-x4.c delete mode 100644 src/cs16-fftr/gen/cs16-fftr-scalar-x1.c delete mode 100644 src/cs16-fftr/gen/cs16-fftr-scalar-x2.c delete mode 100644 src/cs16-fftr/gen/cs16-fftr-scalar-x4.c delete mode 100644 src/cs16-fftr/scalar.c.in delete mode 100644 src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x10.c delete mode 100644 src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x12.c delete mode 100644 src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x2.c delete mode 100644 src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x4.c delete mode 100644 src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x6.c delete mode 100644 src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x8.c delete mode 100644 src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x12.c delete mode 100644 
src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x16.c delete mode 100644 src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x4.c delete mode 100644 src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x8.c delete mode 100644 src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x1.c delete mode 100644 src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x2.c delete mode 100644 src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x3.c delete mode 100644 src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x4.c delete mode 100644 src/cs16-vsquareabs/hexagon.c.in delete mode 100644 src/cs16-vsquareabs/neon.c.in delete mode 100644 src/cs16-vsquareabs/scalar.c.in delete mode 100644 src/i16-vlshift/gen/i16-vlshift-neon-u16.c delete mode 100644 src/i16-vlshift/gen/i16-vlshift-neon-u24.c delete mode 100644 src/i16-vlshift/gen/i16-vlshift-neon-u32.c delete mode 100644 src/i16-vlshift/gen/i16-vlshift-neon-u8.c delete mode 100644 src/i16-vlshift/gen/i16-vlshift-scalar-u1.c delete mode 100644 src/i16-vlshift/gen/i16-vlshift-scalar-u2.c delete mode 100644 src/i16-vlshift/gen/i16-vlshift-scalar-u3.c delete mode 100644 src/i16-vlshift/gen/i16-vlshift-scalar-u4.c delete mode 100644 src/i16-vlshift/neon.c.in delete mode 100644 src/i16-vlshift/scalar.c.in delete mode 100644 src/s16-rmaxabs/gen/s16-rmaxabs-neon-x16.c delete mode 100644 src/s16-rmaxabs/gen/s16-rmaxabs-neon-x24.c delete mode 100644 src/s16-rmaxabs/gen/s16-rmaxabs-neon-x32.c delete mode 100644 src/s16-rmaxabs/gen/s16-rmaxabs-neon-x8.c delete mode 100644 src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x1.c delete mode 100644 src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x2.c delete mode 100644 src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x3.c delete mode 100644 src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x4.c delete mode 100644 src/s16-rmaxabs/neon.c.in delete mode 100644 src/s16-rmaxabs/scalar.c.in delete mode 100644 src/s16-window/gen/s16-window-neon-u16.c delete mode 100644 src/s16-window/gen/s16-window-neon-u24.c delete mode 100644 
src/s16-window/gen/s16-window-neon-u32.c delete mode 100644 src/s16-window/gen/s16-window-neon-u8.c delete mode 100644 src/s16-window/gen/s16-window-scalar-u1.c delete mode 100644 src/s16-window/gen/s16-window-scalar-u2.c delete mode 100644 src/s16-window/gen/s16-window-scalar-u3.c delete mode 100644 src/s16-window/gen/s16-window-scalar-u4.c delete mode 100644 src/s16-window/gen/s16-window-shift12-neon-u16.c delete mode 100644 src/s16-window/gen/s16-window-shift12-neon-u24.c delete mode 100644 src/s16-window/gen/s16-window-shift12-neon-u32.c delete mode 100644 src/s16-window/gen/s16-window-shift12-neon-u8.c delete mode 100644 src/s16-window/gen/s16-window-shift15-neon-u16.c delete mode 100644 src/s16-window/gen/s16-window-shift15-neon-u24.c delete mode 100644 src/s16-window/gen/s16-window-shift15-neon-u32.c delete mode 100644 src/s16-window/gen/s16-window-shift15-neon-u8.c delete mode 100644 src/s16-window/neon.c.in delete mode 100644 src/s16-window/scalar.c.in delete mode 100644 src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-neon-x1.c delete mode 100644 src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-neon-x2.c delete mode 100644 src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-scalar-x1.c delete mode 100644 src/u32-filterbank-accumulate/neon.c.in delete mode 100644 src/u32-filterbank-accumulate/scalar.c.in delete mode 100644 src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-arm-x1.S delete mode 100644 src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x1.S delete mode 100644 src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x2.S delete mode 100644 src/u32-filterbank-subtract/u32-filterbank-subtract-scalar-x2.c delete mode 100644 src/u32-vlog/gen/u32-vlog-scalar-x1.c delete mode 100644 src/u32-vlog/gen/u32-vlog-scalar-x2.c delete mode 100644 src/u32-vlog/gen/u32-vlog-scalar-x3.c delete mode 100644 src/u32-vlog/gen/u32-vlog-scalar-x4.c delete mode 100644 
src/u32-vlog/scalar.c.in delete mode 100644 src/u64-u32-vsqrtshift/u64-u32-vsqrtshift-scalar-cvtu32-sqrt-cvtu32f64-u1.c delete mode 100644 src/u64-u32-vsqrtshift/u64-u32-vsqrtshift.h delete mode 100644 src/xnnpack/filterbank.h delete mode 100644 src/xnnpack/rmaxabs.h delete mode 100644 src/xnnpack/vlshift.h delete mode 100644 src/xnnpack/vsquareabs.h delete mode 100644 src/xnnpack/window.h delete mode 100644 test/bfly4-microkernel-tester.h delete mode 100644 test/cs16-bfly4.cc delete mode 100644 test/cs16-bfly4.yaml delete mode 100644 test/cs16-fftr.cc delete mode 100644 test/cs16-fftr.yaml delete mode 100644 test/cs16-vsquareabs.cc delete mode 100644 test/cs16-vsquareabs.yaml delete mode 100644 test/fftr-microkernel-tester.h delete mode 100644 test/filterbank-accumulate-microkernel-tester.h delete mode 100644 test/filterbank-subtract-microkernel-tester.h delete mode 100644 test/i16-vlshift.cc delete mode 100644 test/i16-vlshift.yaml delete mode 100644 test/rmaxabs-microkernel-tester.h delete mode 100644 test/s16-rmaxabs.cc delete mode 100644 test/s16-rmaxabs.yaml delete mode 100644 test/s16-window.cc delete mode 100644 test/s16-window.yaml delete mode 100644 test/u32-filterbank-accumulate.cc delete mode 100644 test/u32-filterbank-accumulate.yaml delete mode 100644 test/u32-filterbank-subtract.cc delete mode 100644 test/u32-filterbank-subtract.yaml delete mode 100644 test/u32-vlog.cc delete mode 100644 test/u32-vlog.yaml delete mode 100644 test/u64-u32-vsqrtshift.cc delete mode 100644 test/vlshift-microkernel-tester.h delete mode 100644 test/vsquareabs-microkernel-tester.h delete mode 100644 test/window-microkernel-tester.h delete mode 100755 tools/generate-bfly4-test.py delete mode 100755 tools/generate-fftr-test.py delete mode 100755 tools/generate-filterbank-accumulate-test.py delete mode 100755 tools/generate-filterbank-subtract-test.py delete mode 100755 tools/generate-rmaxabs-test.py delete mode 100755 tools/generate-vlshift-test.py delete mode 100755 
tools/generate-vsquareabs-test.py delete mode 100755 tools/generate-window-test.py diff --git a/BUILD.bazel b/BUILD.bazel index 04ccf87588c..bff2ee97e90 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -274,7 +274,6 @@ MICROKERNEL_DEFS = [ "src/s32-vmul/s32-vmulc.h", "src/u8-maxpool/u8-maxpool-minmax.h", "src/u8-vclamp/u8-vclamp.h", - "src/u64-u32-vsqrtshift/u64-u32-vsqrtshift.h", "src/x8-packq/x8-packq.h", "src/x8-packw/x8-packw.h", "src/x16-packw/x16-packw.h", @@ -310,9 +309,7 @@ MICROKERNEL_HDRS = [ "src/xnnpack/avgpool.h", "src/xnnpack/conv.h", "src/xnnpack/dwconv.h", - "src/xnnpack/fft.h", "src/xnnpack/fill.h", - "src/xnnpack/filterbank.h", "src/xnnpack/gavgpool.h", "src/xnnpack/gemm.h", "src/xnnpack/ibilinear.h", @@ -332,7 +329,6 @@ MICROKERNEL_HDRS = [ "src/xnnpack/raddextexp.h", "src/xnnpack/raddstoreexpminusmax.h", "src/xnnpack/reduce.h", - "src/xnnpack/rmaxabs.h", "src/xnnpack/spmm.h", "src/xnnpack/transpose.h", "src/xnnpack/unpool.h", @@ -341,13 +337,10 @@ MICROKERNEL_HDRS = [ "src/xnnpack/vhswish.h", "src/xnnpack/vlog.h", "src/xnnpack/vlrelu.h", - "src/xnnpack/vlshift.h", "src/xnnpack/vmulcaddc.h", "src/xnnpack/vscaleexpminusmax.h", "src/xnnpack/vscaleextexp.h", - "src/xnnpack/vsquareabs.h", "src/xnnpack/vunary.h", - "src/xnnpack/window.h", "src/xnnpack/zerob.h", "src/xnnpack/zip.h", ] + MICROKERNEL_DEFS diff --git a/CMakeLists.txt b/CMakeLists.txt index b1304e0bc71..2728c26380c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1463,9 +1463,6 @@ IF(XNNPACK_BUILD_TESTS) # ---[ Build microkernel-level unit tests SET(MICROKERNEL_UNIT_TESTS - cs16-bfly4 - cs16-fftr - cs16-vsquareabs f16-conv-hwc2chw f16-f32acc-rdsum f16-f32acc-rsum @@ -1499,7 +1496,6 @@ IF(XNNPACK_BUILD_TESTS) f32-vmulcaddc-minmax f32-vscaleexpminusmax f32-vscaleextexp - i16-vlshift indirection packing qs8-gavgpool-minmax-fp32 @@ -1516,12 +1512,7 @@ IF(XNNPACK_BUILD_TESTS) qu8-requantization qu8-vhswish qu8-vlrelu - s16-rmaxabs - s16-window s8-ibilinear - u32-filterbank-accumulate - 
u32-filterbank-subtract - u32-vlog u8-ibilinear u8-lut32norm u8-rmax @@ -2027,9 +2018,6 @@ IF(XNNPACK_BUILD_BENCHMARKS) # ---[ Build microkernel-level microbenchmarks SET(MICROKERNEL_BENCHMARKS bf16-gemm - cs16-bfly4 - cs16-fftr - cs16-vsquareabs f16-conv-hwc2chw f16-dwconv f16-dwconv2d-chw @@ -2078,7 +2066,6 @@ IF(XNNPACK_BUILD_BENCHMARKS) f32-vcmul f32-vscaleexpminusmax f32-vscaleextexp - i16-vlshift qd8-f16-qb4w-gemm qd8-f16-qc4w-gemm qd8-f16-qc8w-gemm @@ -2111,12 +2098,6 @@ IF(XNNPACK_BUILD_BENCHMARKS) qu8-vcvt qu8-vmul qu8-vmulc - s16-rmaxabs - s16-window - u32-filterbank-accumulate - u32-filterbank-subtract - u32-vlog - u64-u32-vsqrtshift x16-packw x32-packw x8-lut diff --git a/bench/BUILD.bazel b/bench/BUILD.bazel index f2570c87375..a46f19f2103 100644 --- a/bench/BUILD.bazel +++ b/bench/BUILD.bazel @@ -265,16 +265,6 @@ xnnpack_benchmark( "f32_vscaleextexp", "f16_vcmul", "f32_vcmul", - "s16_rmaxabs", - "s16_window", - "u32_filterbank_accumulate", - "u32_filterbank_subtract", - "u32_vlog", - "u64_u32_vsqrtshift", - "i16_vlshift", - "cs16_vsquareabs", - "cs16_bfly4", - "cs16_fftr", "x8_lut", ]] diff --git a/bench/cs16-bfly4.cc b/bench/cs16-bfly4.cc deleted file mode 100644 index 569ba5b6988..00000000000 --- a/bench/cs16-bfly4.cc +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- - -#include -#include -#include -#include -#include - -#include "bench/utils.h" -#include - -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/common.h" -#include "xnnpack/fft.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" - - -void cs16_bfly4( - benchmark::State& state, - xnn_cs16_bfly4_ukernel_fn bfly4, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if ((isa_check != nullptr) && !isa_check(state)) { - return; - } - const size_t fft_size = state.range(0); - const size_t batch = state.range(1); - const size_t samples = state.range(2); - const size_t stride = state.range(3); - - assert(fft_size == samples * stride * 4); // 4 for bfly4. - - std::vector> output(fft_size * 2); - std::vector> twiddle(fft_size * 3 / 4 * 2); - - std::iota(output.begin(), output.end(), 0); - std::iota(twiddle.begin(), twiddle.end(), 0); - - for (auto _ : state) { - bfly4(batch, samples * sizeof(int16_t) * 2, output.data(), twiddle.data(), stride * sizeof(int16_t) * 2); - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } -} - -static void BenchmarkKernelSize(benchmark::internal::Benchmark* b) -{ - b->ArgNames({"fft_size", "batch", "samples", "stride"}); - b->Args({256, 1, 1, 64}); - b->Args({256, 4, 1, 64}); - b->Args({256, 1, 4, 16}); - b->Args({256, 4, 4, 16}); - b->Args({256, 1, 16, 4}); - b->Args({256, 4, 16, 4}); - b->Args({256, 1, 64, 1}); -} - -static void BenchmarkSamples1KernelSize(benchmark::internal::Benchmark* b) -{ - b->ArgNames({"fft_size", "batch", "samples", "stride"}); - b->Args({256, 1, 1, 64}); - b->Args({256, 4, 1, 64}); - b->Args({256, 16, 1, 64}); - b->Args({256, 64, 1, 64}); -} -static void BenchmarkSamples4KernelSize(benchmark::internal::Benchmark* b) -{ - b->ArgNames({"fft_size", "batch", "samples", "stride"}); - b->Args({256, 1, 4, 16}); - b->Args({256, 4, 4, 16}); - b->Args({256, 16, 
4, 16}); -} - -#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY -BENCHMARK_CAPTURE(cs16_bfly4, samples1__asm_aarch32_neon_x1, xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x1) - ->Apply(BenchmarkSamples1KernelSize)->UseRealTime(); -BENCHMARK_CAPTURE(cs16_bfly4, samples1__asm_aarch32_neon_x2, xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x2) - ->Apply(BenchmarkSamples1KernelSize)->UseRealTime(); -BENCHMARK_CAPTURE(cs16_bfly4, samples1__asm_aarch32_neon_x4, xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x4) - ->Apply(BenchmarkSamples1KernelSize)->UseRealTime(); -#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 -BENCHMARK_CAPTURE(cs16_bfly4, samples1__neon, xnn_cs16_bfly4_samples1_ukernel__neon) - ->Apply(BenchmarkSamples1KernelSize)->UseRealTime(); -BENCHMARK_CAPTURE(cs16_bfly4, samples4__neon, xnn_cs16_bfly4_samples4_ukernel__neon) - ->Apply(BenchmarkSamples4KernelSize)->UseRealTime(); -BENCHMARK_CAPTURE(cs16_bfly4, neon_x1, xnn_cs16_bfly4_ukernel__neon_x1) - ->Apply(BenchmarkKernelSize)->UseRealTime(); -BENCHMARK_CAPTURE(cs16_bfly4, neon_x4, xnn_cs16_bfly4_ukernel__neon_x4) - ->Apply(BenchmarkKernelSize)->UseRealTime(); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - -BENCHMARK_CAPTURE(cs16_bfly4, samples1__scalar, xnn_cs16_bfly4_samples1_ukernel__scalar) - ->Apply(BenchmarkSamples1KernelSize)->UseRealTime(); -BENCHMARK_CAPTURE(cs16_bfly4, samples4__scalar, xnn_cs16_bfly4_samples4_ukernel__scalar) - ->Apply(BenchmarkSamples4KernelSize)->UseRealTime(); -BENCHMARK_CAPTURE(cs16_bfly4, scalar_x1, xnn_cs16_bfly4_ukernel__scalar_x1) - ->Apply(BenchmarkKernelSize)->UseRealTime(); -BENCHMARK_CAPTURE(cs16_bfly4, scalar_x2, xnn_cs16_bfly4_ukernel__scalar_x2) - ->Apply(BenchmarkKernelSize)->UseRealTime(); -BENCHMARK_CAPTURE(cs16_bfly4, scalar_x4, xnn_cs16_bfly4_ukernel__scalar_x4) - ->Apply(BenchmarkKernelSize)->UseRealTime(); - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/cs16-fftr.cc b/bench/cs16-fftr.cc 
deleted file mode 100644 index 8eabc029f13..00000000000 --- a/bench/cs16-fftr.cc +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include - -#include "bench/utils.h" -#include - -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/common.h" -#include "xnnpack/fft.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" - - -void cs16_fftr( - benchmark::State& state, - xnn_cs16_fftr_ukernel_fn fftr, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if ((isa_check != nullptr) && !isa_check(state)) { - return; - } - const size_t samples = state.range(0); - - assert(samples % 2 == 0); - const size_t sample_size = samples * 2 + 2; - - std::vector> data(sample_size + XNN_EXTRA_BYTES / sizeof(int16_t)); - std::vector> twiddle(samples); - - std::iota(data.begin(), data.end(), 0); - std::iota(twiddle.begin(), twiddle.end(), 2); - - for (auto _ : state) { - fftr(samples, data.data(), twiddle.data()); - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } -} - -static void BenchmarkKernelSize(benchmark::internal::Benchmark* b) -{ - b->ArgNames({"samples"}); - b->Args({256}); - b->Args({1024}); -} -#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY -BENCHMARK_CAPTURE(cs16_fftr, cs16_aarch32_neon_x1, xnn_cs16_fftr_ukernel__asm_aarch32_neon_x1)->Apply(BenchmarkKernelSize)->UseRealTime(); -BENCHMARK_CAPTURE(cs16_fftr, cs16_aarch32_neon_x4, xnn_cs16_fftr_ukernel__asm_aarch32_neon_x4)->Apply(BenchmarkKernelSize)->UseRealTime(); -#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 -BENCHMARK_CAPTURE(cs16_fftr, cs16_neon_x4, xnn_cs16_fftr_ukernel__neon_x4)->Apply(BenchmarkKernelSize)->UseRealTime(); -#endif // 
XNN_ARCH_ARM || XNN_ARCH_ARM64 - -BENCHMARK_CAPTURE(cs16_fftr, cs16_scalar_x1, xnn_cs16_fftr_ukernel__scalar_x1)->Apply(BenchmarkKernelSize)->UseRealTime(); -BENCHMARK_CAPTURE(cs16_fftr, cs16_scalar_x2, xnn_cs16_fftr_ukernel__scalar_x2)->Apply(BenchmarkKernelSize)->UseRealTime(); -BENCHMARK_CAPTURE(cs16_fftr, cs16_scalar_x4, xnn_cs16_fftr_ukernel__scalar_x4)->Apply(BenchmarkKernelSize)->UseRealTime(); - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/cs16-vsquareabs.cc b/bench/cs16-vsquareabs.cc deleted file mode 100644 index 92bd05e0f35..00000000000 --- a/bench/cs16-vsquareabs.cc +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include - -#include "bench/utils.h" -#include - -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/common.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vsquareabs.h" - - -void cs16_vsquareabs( - benchmark::State& state, - xnn_cs16_vsquareabs_ukernel_fn vsquareabs, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if ((isa_check != nullptr) && !isa_check(state)) { - return; - } - const size_t num_elements = state.range(0); - - std::vector> input( - num_elements * 2 + XNN_EXTRA_BYTES / sizeof(int16_t)); - std::vector> output(num_elements); - std::iota(input.begin(), input.end(), 0); - std::iota(output.begin(), output.end(), 0); - - for (auto _ : state) { - vsquareabs(num_elements * sizeof(int16_t) * 2, input.data(), output.data()); - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - const size_t elements_per_iteration = num_elements; - state.counters["elements"] = - 
benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate); - - const size_t bytes_per_iteration = num_elements * (sizeof(std::complex) + sizeof(uint32_t)); - state.counters["bytes"] = - benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); -} - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_neon_x4, - xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x4, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters, uint32_t>) - ->UseRealTime(); - BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_neon_x8, - xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x8, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters, uint32_t>) - ->UseRealTime(); - BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_neon_x12, - xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x12, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters, uint32_t>) - ->UseRealTime(); - BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_neon_x16, - xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x16, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters, uint32_t>) - ->UseRealTime(); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - -#if XNN_ARCH_HEXAGON - BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x2, - xnn_cs16_vsquareabs_ukernel__hexagon_x2) - ->Apply(benchmark::utils::UnaryElementwiseParameters, uint32_t>) - ->UseRealTime(); - BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x4, - xnn_cs16_vsquareabs_ukernel__hexagon_x4) - ->Apply(benchmark::utils::UnaryElementwiseParameters, uint32_t>) - ->UseRealTime(); - BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x6, - xnn_cs16_vsquareabs_ukernel__hexagon_x6) - ->Apply(benchmark::utils::UnaryElementwiseParameters, uint32_t>) - ->UseRealTime(); - BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x8, - xnn_cs16_vsquareabs_ukernel__hexagon_x8) - 
->Apply(benchmark::utils::UnaryElementwiseParameters, uint32_t>) - ->UseRealTime(); - BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x10, - xnn_cs16_vsquareabs_ukernel__hexagon_x10) - ->Apply(benchmark::utils::UnaryElementwiseParameters, uint32_t>) - ->UseRealTime(); - BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x12, - xnn_cs16_vsquareabs_ukernel__hexagon_x12) - ->Apply(benchmark::utils::UnaryElementwiseParameters, uint32_t>) - ->UseRealTime(); -#endif // XNN_ARCH_HEXAGON - -BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_scalar_x1, - xnn_cs16_vsquareabs_ukernel__scalar_x1) - ->Apply(benchmark::utils::UnaryElementwiseParameters, uint32_t>) - ->UseRealTime(); -BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_scalar_x2, - xnn_cs16_vsquareabs_ukernel__scalar_x2) - ->Apply(benchmark::utils::UnaryElementwiseParameters, uint32_t>) - ->UseRealTime(); -BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_scalar_x3, - xnn_cs16_vsquareabs_ukernel__scalar_x3) - ->Apply(benchmark::utils::UnaryElementwiseParameters, uint32_t>) - ->UseRealTime(); -BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_scalar_x4, - xnn_cs16_vsquareabs_ukernel__scalar_x4) - ->Apply(benchmark::utils::UnaryElementwiseParameters, uint32_t>) - ->UseRealTime(); - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/i16-vlshift.cc b/bench/i16-vlshift.cc deleted file mode 100644 index 1e63f548c8a..00000000000 --- a/bench/i16-vlshift.cc +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include - -#include "bench/utils.h" -#include - -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/common.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/vlshift.h" - - -void i16_vlshift( - benchmark::State& state, - xnn_i16_vlshift_ukernel_fn vlshift, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if (isa_check != nullptr && !isa_check(state)) { - return; - } - const size_t batch = state.range(0); - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto u16rng = std::bind(std::uniform_int_distribution(), std::ref(rng)); - - std::vector> input(batch + XNN_EXTRA_BYTES / sizeof(uint16_t)); - std::vector> output(batch); - - std::generate(input.begin(), input.end(), std::ref(u16rng)); - std::fill(output.begin(), output.end(), UINT16_C(0xDEAD)); - - for (auto _ : state) { - vlshift(batch, input.data(), output.data(), 4 /* shift */); - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } -} - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - BENCHMARK_CAPTURE(i16_vlshift, i16_neon_u8, - xnn_i16_vlshift_ukernel__neon_u8, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(i16_vlshift, i16_neon_u16, - xnn_i16_vlshift_ukernel__neon_u16, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(i16_vlshift, i16_neon_u24, - xnn_i16_vlshift_ukernel__neon_u24, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(i16_vlshift, i16_neon_u32, - xnn_i16_vlshift_ukernel__neon_u32, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - 
-BENCHMARK_CAPTURE(i16_vlshift, i16_scalar_u1, - xnn_i16_vlshift_ukernel__scalar_u1) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); -BENCHMARK_CAPTURE(i16_vlshift, i16_scalar_u2, - xnn_i16_vlshift_ukernel__scalar_u2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); -BENCHMARK_CAPTURE(i16_vlshift, i16_scalar_u3, - xnn_i16_vlshift_ukernel__scalar_u3) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); -BENCHMARK_CAPTURE(i16_vlshift, i16_scalar_u4, - xnn_i16_vlshift_ukernel__scalar_u4) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/s16-rmaxabs.cc b/bench/s16-rmaxabs.cc deleted file mode 100644 index 9a83c6b2bb9..00000000000 --- a/bench/s16-rmaxabs.cc +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include - -#include "bench/utils.h" -#include - -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/common.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/rmaxabs.h" - - -void s16_rmaxabs( - benchmark::State& state, - xnn_s16_rmaxabs_ukernel_fn rmaxabs, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if (isa_check != nullptr && !isa_check(state)) { - return; - } - const size_t channels = state.range(0); - - std::vector> input( - (channels) + XNN_EXTRA_BYTES / sizeof(int16_t)); - std::iota(input.begin(), input.end(), 0); - - uint16_t output = UINT16_C(0); - for (auto _ : state) { - rmaxabs(channels, input.data(), &output); - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } -} - -static void BenchmarkBatch(benchmark::internal::Benchmark* b) -{ - b->ArgNames({"batch"}); - b->Args({32}); - b->Args({64}); - b->Args({216}); - b->Args({400}); - b->Args({1000}); - b->Args({10000}); - b->Args({100000}); -} - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - BENCHMARK_CAPTURE(s16_rmaxabs, s16_neon_x8, - xnn_s16_rmaxabs_ukernel__neon_x8, - benchmark::utils::CheckNEON) - ->Apply(BenchmarkBatch) - ->UseRealTime(); - BENCHMARK_CAPTURE(s16_rmaxabs, s16_neon_x16, - xnn_s16_rmaxabs_ukernel__neon_x16, - benchmark::utils::CheckNEON) - ->Apply(BenchmarkBatch) - ->UseRealTime(); - BENCHMARK_CAPTURE(s16_rmaxabs, s16_neon_x24, - xnn_s16_rmaxabs_ukernel__neon_x24, - benchmark::utils::CheckNEON) - ->Apply(BenchmarkBatch) - ->UseRealTime(); - BENCHMARK_CAPTURE(s16_rmaxabs, s16_neon_x32, - xnn_s16_rmaxabs_ukernel__neon_x32, - benchmark::utils::CheckNEON) - ->Apply(BenchmarkBatch) - ->UseRealTime(); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - -BENCHMARK_CAPTURE(s16_rmaxabs, s16_scalar_x1, - xnn_s16_rmaxabs_ukernel__scalar_x1) - ->Apply(BenchmarkBatch) - ->UseRealTime(); -BENCHMARK_CAPTURE(s16_rmaxabs, 
s16_scalar_x2, - xnn_s16_rmaxabs_ukernel__scalar_x2) - ->Apply(BenchmarkBatch) - ->UseRealTime(); -BENCHMARK_CAPTURE(s16_rmaxabs, s16_scalar_x3, - xnn_s16_rmaxabs_ukernel__scalar_x3) - ->Apply(BenchmarkBatch) - ->UseRealTime(); -BENCHMARK_CAPTURE(s16_rmaxabs, s16_scalar_x4, - xnn_s16_rmaxabs_ukernel__scalar_x4) - ->Apply(BenchmarkBatch) - ->UseRealTime(); - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/s16-window.cc b/bench/s16-window.cc deleted file mode 100644 index fc826b88c3a..00000000000 --- a/bench/s16-window.cc +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include - -#include "bench/utils.h" -#include - -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/common.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/window.h" - - -void s16_window( - benchmark::State& state, - xnn_s16_window_ukernel_fn window, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if (isa_check != nullptr && !isa_check(state)) { - return; - } - const size_t rows = state.range(0); - const size_t channels = state.range(1); - - std::vector> input( - (rows * channels) + XNN_EXTRA_BYTES / sizeof(int16_t)); - std::vector> weights( - channels + XNN_EXTRA_BYTES / sizeof(int16_t)); - std::vector> output( - (rows * channels) + XNN_EXTRA_BYTES / sizeof(int16_t)); - std::iota(input.begin(), input.end(), 0); - std::fill(weights.begin(), weights.end(), 0); - std::iota(output.begin(), output.end(), 0); - - for (auto _ : state) { - window(rows, channels * sizeof(int16_t), input.data(), weights.data(), output.data(), 12 /* shift */); - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } -} - -static void 
BenchmarkKernelSize(benchmark::internal::Benchmark* b) -{ - b->ArgNames({"rows", "channels"}); - b->Args({1, 400}); - b->Args({10, 400}); -} - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - BENCHMARK_CAPTURE(s16_window, s16_neon_u8, - xnn_s16_window_ukernel__neon_u8, - benchmark::utils::CheckNEON) - ->Apply(BenchmarkKernelSize) - ->UseRealTime(); - BENCHMARK_CAPTURE(s16_window, s16_neon_u16, - xnn_s16_window_ukernel__neon_u16, - benchmark::utils::CheckNEON) - ->Apply(BenchmarkKernelSize) - ->UseRealTime(); - BENCHMARK_CAPTURE(s16_window, s16_neon_u24, - xnn_s16_window_ukernel__neon_u24, - benchmark::utils::CheckNEON) - ->Apply(BenchmarkKernelSize) - ->UseRealTime(); - BENCHMARK_CAPTURE(s16_window, s16_neon_u32, - xnn_s16_window_ukernel__neon_u32, - benchmark::utils::CheckNEON) - ->Apply(BenchmarkKernelSize) - ->UseRealTime(); - - BENCHMARK_CAPTURE(s16_window, s16_shift12_neon_u8, - xnn_s16_window_shift12_ukernel__neon_u8, - benchmark::utils::CheckNEON) - ->Apply(BenchmarkKernelSize) - ->UseRealTime(); - BENCHMARK_CAPTURE(s16_window, s16_shift12_neon_u16, - xnn_s16_window_shift12_ukernel__neon_u16, - benchmark::utils::CheckNEON) - ->Apply(BenchmarkKernelSize) - ->UseRealTime(); - BENCHMARK_CAPTURE(s16_window, s16_shift12_neon_u24, - xnn_s16_window_shift12_ukernel__neon_u24, - benchmark::utils::CheckNEON) - ->Apply(BenchmarkKernelSize) - ->UseRealTime(); - BENCHMARK_CAPTURE(s16_window, s16_shift12_neon_u32, - xnn_s16_window_shift12_ukernel__neon_u32, - benchmark::utils::CheckNEON) - ->Apply(BenchmarkKernelSize) - ->UseRealTime(); - - BENCHMARK_CAPTURE(s16_window, s16_shift15_neon_u8, - xnn_s16_window_shift15_ukernel__neon_u8, - benchmark::utils::CheckNEON) - ->Apply(BenchmarkKernelSize) - ->UseRealTime(); - BENCHMARK_CAPTURE(s16_window, s16_shift15_neon_u16, - xnn_s16_window_shift15_ukernel__neon_u16, - benchmark::utils::CheckNEON) - ->Apply(BenchmarkKernelSize) - ->UseRealTime(); - BENCHMARK_CAPTURE(s16_window, s16_shift15_neon_u24, - 
xnn_s16_window_shift15_ukernel__neon_u24, - benchmark::utils::CheckNEON) - ->Apply(BenchmarkKernelSize) - ->UseRealTime(); - BENCHMARK_CAPTURE(s16_window, s16_shift15_neon_u32, - xnn_s16_window_shift15_ukernel__neon_u32, - benchmark::utils::CheckNEON) - ->Apply(BenchmarkKernelSize) - ->UseRealTime(); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - -BENCHMARK_CAPTURE(s16_window, s16_scalar_u1, - xnn_s16_window_ukernel__scalar_u1) - ->Apply(BenchmarkKernelSize) - ->UseRealTime(); -BENCHMARK_CAPTURE(s16_window, s16_scalar_u2, - xnn_s16_window_ukernel__scalar_u2) - ->Apply(BenchmarkKernelSize) - ->UseRealTime(); -BENCHMARK_CAPTURE(s16_window, s16_scalar_u3, - xnn_s16_window_ukernel__scalar_u3) - ->Apply(BenchmarkKernelSize) - ->UseRealTime(); -BENCHMARK_CAPTURE(s16_window, s16_scalar_u4, - xnn_s16_window_ukernel__scalar_u4) - ->Apply(BenchmarkKernelSize) - ->UseRealTime(); - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/u32-filterbank-accumulate.cc b/bench/u32-filterbank-accumulate.cc deleted file mode 100644 index fea7bde166a..00000000000 --- a/bench/u32-filterbank-accumulate.cc +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include - -#include "bench/utils.h" -#include - -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/common.h" -#include "xnnpack/filterbank.h" -#include "xnnpack/microfnptr.h" - - -void filterbank_accumulate( - benchmark::State& state, - xnn_u32_filterbank_accumulate_ukernel_fn filterbank_accumulate, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if (isa_check != nullptr && !isa_check(state)) { - return; - } - const size_t rows = state.range(0); - const size_t batch = state.range(1); - const size_t input_size = (rows + 1) * batch; - - std::vector> input(input_size); - std::vector> weight_widths(rows + 1); - std::vector> weights(input_size * 2); - std::vector> output(rows); - std::iota(input.begin(), input.end(), 0); - std::fill(weight_widths.begin(), weight_widths.end(), static_cast(batch)); - std::iota(weights.begin(), weights.end(), 0); - std::iota(output.begin(), output.end(), 0); - - for (auto _ : state) { - filterbank_accumulate(rows, input.data(), weight_widths.data(), weights.data(), output.data()); - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } -} - -static void BenchmarkKernelSize(benchmark::internal::Benchmark* b) -{ - b->ArgNames({"rows", "batch"}); - b->Args({1, 237}); - b->Args({5, 1}); - b->Args({10, 2}); - b->Args({7, 3}); - b->Args({5, 4}); - b->Args({5, 5}); - b->Args({3, 6}); - b->Args({4, 7}); - b->Args({2, 8}); - b->Args({2, 9}); - b->Args({2, 10}); - b->Args({3, 11}); - b->Args({1, 13}); -} - -#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY -BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_arm_x1, xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_arm_x1)->Apply(BenchmarkKernelSize)->UseRealTime(); -BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_neon_x1, xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_neon_x1, 
benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime(); -BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_neon_x2, xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_neon_x2, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime(); -#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 -BENCHMARK_CAPTURE(filterbank_accumulate, u32_neon_x1, xnn_u32_filterbank_accumulate_ukernel__neon_x1, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime(); -BENCHMARK_CAPTURE(filterbank_accumulate, u32_neon_x2, xnn_u32_filterbank_accumulate_ukernel__neon_x2, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime(); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - -BENCHMARK_CAPTURE(filterbank_accumulate, u32_scalar_x1, xnn_u32_filterbank_accumulate_ukernel__scalar_x1)->Apply(BenchmarkKernelSize)->UseRealTime(); - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/u32-filterbank-subtract.cc b/bench/u32-filterbank-subtract.cc deleted file mode 100644 index aeb3b22adb6..00000000000 --- a/bench/u32-filterbank-subtract.cc +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include - -#include "bench/utils.h" -#include - -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/common.h" -#include "xnnpack/filterbank.h" -#include "xnnpack/microfnptr.h" - - -void filterbank_subtract( - benchmark::State& state, - xnn_u32_filterbank_subtract_ukernel_fn filterbank_subtract, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if (isa_check != nullptr && !isa_check(state)) { - return; - } - const size_t batch = state.range(0); - - std::vector> input(batch + XNN_EXTRA_BYTES / sizeof(uint32_t)); - std::vector> noise_estimate(batch + XNN_EXTRA_BYTES / sizeof(uint32_t)); - std::vector> output(batch); - std::iota(input.begin(), input.end(), 0); - std::iota(noise_estimate.begin(), noise_estimate.end(), 1); - std::iota(output.begin(), output.end(), 0); - - for (auto _ : state) { - filterbank_subtract(batch, input.data(), - 655, 655, 15729, 15729, 819, 0, 14, - noise_estimate.data(), output.data()); - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } -} - -static void BenchmarkKernelSize(benchmark::internal::Benchmark* b) -{ - b->ArgNames({"batch"}); - b->Args({48}); - b->Args({480}); - b->Args({1000}); - b->Args({10000}); - b->Args({48000}); -} - -BENCHMARK_CAPTURE(filterbank_subtract, u32_scalar_x1, xnn_u32_filterbank_subtract_ukernel__scalar_x2)->Apply(BenchmarkKernelSize)->UseRealTime(); - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/u32-vlog.cc b/bench/u32-vlog.cc deleted file mode 100644 index 2b9cab61377..00000000000 --- a/bench/u32-vlog.cc +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include - -#include "bench/utils.h" -#include - -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/common.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/vlog.h" - - -void u32_vlog( - benchmark::State& state, - xnn_u32_vlog_ukernel_fn vlog, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if (isa_check != nullptr && !isa_check(state)) { - return; - } - const size_t num_elements = state.range(0); - - std::vector> input( - num_elements + XNN_EXTRA_BYTES / sizeof(uint32_t)); - std::vector> output(num_elements); - std::iota(input.begin(), input.end(), 0); - std::iota(output.begin(), output.end(), 0); - - for (auto _ : state) { - vlog(num_elements, input.data(), 4, 16, output.data()); - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - const size_t elements_per_iteration = num_elements; - state.counters["elements"] = - benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate); - - const size_t bytes_per_iteration = num_elements * (sizeof(uint32_t) + sizeof(uint16_t)); - state.counters["bytes"] = - benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); -} - -BENCHMARK_CAPTURE(u32_vlog, scalar_x1, - xnn_u32_vlog_ukernel__scalar_x1) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); -BENCHMARK_CAPTURE(u32_vlog, scalar_x2, - xnn_u32_vlog_ukernel__scalar_x2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); -BENCHMARK_CAPTURE(u32_vlog, scalar_x3, - xnn_u32_vlog_ukernel__scalar_x3) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); -BENCHMARK_CAPTURE(u32_vlog, scalar_x4, - xnn_u32_vlog_ukernel__scalar_x4) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - -#ifndef 
XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/u64-u32-vsqrtshift.cc b/bench/u64-u32-vsqrtshift.cc deleted file mode 100644 index 5c253a0f401..00000000000 --- a/bench/u64-u32-vsqrtshift.cc +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include - -#include -#include "bench/utils.h" - -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/common.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vunary.h" - - -static void u64_u32_vsqrtshift( - benchmark::State& state, - xnn_u64_u32_vsqrtshift_ukernel_fn vsqrtshift, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if (isa_check != nullptr && !isa_check(state)) { - return; - } - const size_t num_elements = state.range(0); - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto u64rng = std::bind(std::uniform_int_distribution(), std::ref(rng)); - - std::vector> x(num_elements + XNN_EXTRA_BYTES / sizeof(uint64_t)); - std::vector> y(num_elements); - std::generate(x.begin(), x.end(), std::ref(u64rng)); - std::fill(y.begin(), y.end(), UINT32_C(0xDEADBEEF)); - - for (auto _ : state) { - vsqrtshift(num_elements * sizeof(uint64_t), x.data(), y.data(), 1 /* shift */); - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - const size_t elements_per_iteration = num_elements; - state.counters["elements"] = - benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate); - - const size_t bytes_per_iteration = num_elements * (sizeof(uint64_t) + sizeof(uint32_t)); - state.counters["bytes"] = - benchmark::Counter(uint64_t(state.iterations()) * 
bytes_per_iteration, benchmark::Counter::kIsRate); -} - -BENCHMARK_CAPTURE(u64_u32_vsqrtshift, scalar_cvtu32_sqrt_cvtu32f64_u1, - xnn_u64_u32_vsqrtshift_ukernel__scalar_cvtu32_sqrt_cvtu32f64_u1) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/cmake/gen/aarch32_microkernels.cmake b/cmake/gen/aarch32_microkernels.cmake index 3e5db830b14..5a286d57f23 100644 --- a/cmake/gen/aarch32_microkernels.cmake +++ b/cmake/gen/aarch32_microkernels.cmake @@ -75,11 +75,6 @@ SET(PROD_AARCH32_ASM_MICROKERNEL_SRCS src/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S) SET(NON_PROD_AARCH32_ASM_MICROKERNEL_SRCS - src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x1.S - src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x2.S - src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x4.S - src/cs16-fftr/cs16-fftr-asm-aarch32-neon-x1.S - src/cs16-fftr/cs16-fftr-asm-aarch32-neon-x4.S src/f32-gemm/gen/f32-gemm-4x4-asm-aarch32-vfp-ld64.S src/f32-gemm/gen/f32-gemm-4x4-minmax-asm-aarch32-vfp-ld64.S src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-ld64.S @@ -95,9 +90,6 @@ SET(NON_PROD_AARCH32_ASM_MICROKERNEL_SRCS src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a53.S src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-ld64-prfm.S - src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35-prfm.S - src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-arm-x1.S - src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x1.S - src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x2.S) + src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35-prfm.S) 
SET(ALL_AARCH32_ASM_MICROKERNEL_SRCS ${PROD_AARCH32_ASM_MICROKERNEL_SRCS} + ${NON_PROD_AARCH32_ASM_MICROKERNEL_SRCS}) diff --git a/cmake/gen/hexagon_microkernels.cmake b/cmake/gen/hexagon_microkernels.cmake index 7b8be2a9ada..b4d9ba1959c 100644 --- a/cmake/gen/hexagon_microkernels.cmake +++ b/cmake/gen/hexagon_microkernels.cmake @@ -11,12 +11,6 @@ SET(PROD_HEXAGON_MICROKERNEL_SRCS) -SET(NON_PROD_HEXAGON_MICROKERNEL_SRCS - src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x2.c - src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x4.c - src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x6.c - src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x8.c - src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x10.c - src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x12.c) +SET(NON_PROD_HEXAGON_MICROKERNEL_SRCS) SET(ALL_HEXAGON_MICROKERNEL_SRCS ${PROD_HEXAGON_MICROKERNEL_SRCS} + ${NON_PROD_HEXAGON_MICROKERNEL_SRCS}) diff --git a/cmake/gen/neon_microkernels.cmake b/cmake/gen/neon_microkernels.cmake index 2cb16c475a8..44e2884daa3 100644 --- a/cmake/gen/neon_microkernels.cmake +++ b/cmake/gen/neon_microkernels.cmake @@ -183,15 +183,6 @@ SET(PROD_NEON_MICROKERNEL_SRCS src/xx-pad/xx-pad-p16-neon-u16.c) SET(NON_PROD_NEON_MICROKERNEL_SRCS - src/cs16-bfly4/cs16-bfly4-neon-x1.c - src/cs16-bfly4/cs16-bfly4-neon-x4.c - src/cs16-bfly4/cs16-bfly4-samples1-neon.c - src/cs16-bfly4/cs16-bfly4-samples4-neon.c - src/cs16-fftr/cs16-fftr-neon-x4.c - src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x4.c - src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x8.c - src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x12.c - src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x16.c src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u8.c src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u24.c src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u32.c @@ -490,10 +481,6 @@ SET(NON_PROD_NEON_MICROKERNEL_SRCS src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-p5-nr2recps-u20.c 
src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-p5-nr2recps-u24.c src/f32-vtanh/gen/f32-vtanh-neon-rational-9-8-nr.c - src/i16-vlshift/gen/i16-vlshift-neon-u8.c - src/i16-vlshift/gen/i16-vlshift-neon-u16.c - src/i16-vlshift/gen/i16-vlshift-neon-u24.c - src/i16-vlshift/gen/i16-vlshift-neon-u32.c src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x16-minmax-neon-mlal-lane-prfm.c src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-2x16-minmax-neon-mlal-lane-prfm.c src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-2x16-minmax-neon-mlal-lane.c @@ -897,24 +884,6 @@ SET(NON_PROD_NEON_MICROKERNEL_SRCS src/qu8-vmulc/gen/qu8-vmulc-minmax-rndnu-neon-ld128-u16.c src/s8-maxpool/s8-maxpool-2p2x-minmax-neon-c16.c src/s8-maxpool/s8-maxpool-4p3x-minmax-neon-c16.c - src/s16-rmaxabs/gen/s16-rmaxabs-neon-x8.c - src/s16-rmaxabs/gen/s16-rmaxabs-neon-x16.c - src/s16-rmaxabs/gen/s16-rmaxabs-neon-x24.c - src/s16-rmaxabs/gen/s16-rmaxabs-neon-x32.c - src/s16-window/gen/s16-window-neon-u8.c - src/s16-window/gen/s16-window-neon-u16.c - src/s16-window/gen/s16-window-neon-u24.c - src/s16-window/gen/s16-window-neon-u32.c - src/s16-window/gen/s16-window-shift12-neon-u8.c - src/s16-window/gen/s16-window-shift12-neon-u16.c - src/s16-window/gen/s16-window-shift12-neon-u24.c - src/s16-window/gen/s16-window-shift12-neon-u32.c - src/s16-window/gen/s16-window-shift15-neon-u8.c - src/s16-window/gen/s16-window-shift15-neon-u16.c - src/s16-window/gen/s16-window-shift15-neon-u24.c - src/s16-window/gen/s16-window-shift15-neon-u32.c - src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-neon-x1.c - src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-neon-x2.c src/x8-transposec/gen/x8-transposec-8x8-multi-dec-zip-neon.c src/x8-transposec/gen/x8-transposec-8x8-multi-mov-zip-neon.c src/x8-transposec/gen/x8-transposec-8x8-multi-switch-zip-neon.c diff --git a/cmake/gen/scalar_microkernels.cmake b/cmake/gen/scalar_microkernels.cmake index 4d4de2a1c48..3c3fe6f9765 100644 --- a/cmake/gen/scalar_microkernels.cmake +++ 
b/cmake/gen/scalar_microkernels.cmake @@ -265,18 +265,6 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS src/xx-transposev/xx-transposev-1x1-scalar-memcpy.c) SET(NON_PROD_SCALAR_MICROKERNEL_SRCS - src/cs16-bfly4/cs16-bfly4-samples1-scalar.c - src/cs16-bfly4/cs16-bfly4-samples4-scalar.c - src/cs16-bfly4/gen/cs16-bfly4-scalar-x1.c - src/cs16-bfly4/gen/cs16-bfly4-scalar-x2.c - src/cs16-bfly4/gen/cs16-bfly4-scalar-x4.c - src/cs16-fftr/gen/cs16-fftr-scalar-x1.c - src/cs16-fftr/gen/cs16-fftr-scalar-x2.c - src/cs16-fftr/gen/cs16-fftr-scalar-x4.c - src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x1.c - src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x2.c - src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x3.c - src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x4.c src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u2.c src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u3.c src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u1.c @@ -608,10 +596,6 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/f32-vsigmoid/gen/f32-vsigmoid-scalar-rr2-p5-div-u4.c src/f32-vsqrt/gen/f32-vsqrt-scalar-sqrt-u2.c src/f32-vsqrt/gen/f32-vsqrt-scalar-sqrt-u4.c - src/i16-vlshift/gen/i16-vlshift-scalar-u1.c - src/i16-vlshift/gen/i16-vlshift-scalar-u2.c - src/i16-vlshift/gen/i16-vlshift-scalar-u3.c - src/i16-vlshift/gen/i16-vlshift-scalar-u4.c src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-1x2-minmax-scalar.c src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-1x4-minmax-scalar.c src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-1x8-minmax-scalar.c @@ -940,23 +924,8 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-scalar-u2.c src/s8-ibilinear/gen/s8-ibilinear-scalar-c2.c src/s8-ibilinear/gen/s8-ibilinear-scalar-c4.c - src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x1.c - src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x2.c - src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x3.c - src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x4.c - src/s16-window/gen/s16-window-scalar-u1.c - src/s16-window/gen/s16-window-scalar-u2.c - src/s16-window/gen/s16-window-scalar-u3.c - 
src/s16-window/gen/s16-window-scalar-u4.c src/u8-ibilinear/gen/u8-ibilinear-scalar-c2.c src/u8-ibilinear/gen/u8-ibilinear-scalar-c4.c - src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-scalar-x1.c - src/u32-filterbank-subtract/u32-filterbank-subtract-scalar-x2.c - src/u32-vlog/gen/u32-vlog-scalar-x1.c - src/u32-vlog/gen/u32-vlog-scalar-x2.c - src/u32-vlog/gen/u32-vlog-scalar-x3.c - src/u32-vlog/gen/u32-vlog-scalar-x4.c - src/u64-u32-vsqrtshift/u64-u32-vsqrtshift-scalar-cvtu32-sqrt-cvtu32f64-u1.c src/x8-lut/gen/x8-lut-scalar-u1.c src/x8-lut/gen/x8-lut-scalar-u2.c src/x8-lut/gen/x8-lut-scalar-u8.c diff --git a/gen/aarch32_microkernels.bzl b/gen/aarch32_microkernels.bzl index f9efb43e405..6ce699962c1 100644 --- a/gen/aarch32_microkernels.bzl +++ b/gen/aarch32_microkernels.bzl @@ -72,11 +72,6 @@ PROD_AARCH32_ASM_MICROKERNEL_SRCS = [ ] NON_PROD_AARCH32_ASM_MICROKERNEL_SRCS = [ - "src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x1.S", - "src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x2.S", - "src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x4.S", - "src/cs16-fftr/cs16-fftr-asm-aarch32-neon-x1.S", - "src/cs16-fftr/cs16-fftr-asm-aarch32-neon-x4.S", "src/f32-gemm/gen/f32-gemm-4x4-asm-aarch32-vfp-ld64.S", "src/f32-gemm/gen/f32-gemm-4x4-minmax-asm-aarch32-vfp-ld64.S", "src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-ld64.S", @@ -93,9 +88,6 @@ NON_PROD_AARCH32_ASM_MICROKERNEL_SRCS = [ "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a53.S", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-ld64-prfm.S", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35-prfm.S", - "src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-arm-x1.S", - "src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x1.S", - "src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x2.S", ] 
AARCH32_ASM_MICROKERNEL_SRCS = PROD_AARCH32_ASM_MICROKERNEL_SRCS + NON_PROD_AARCH32_ASM_MICROKERNEL_SRCS diff --git a/gen/hexagon_microkernels.bzl b/gen/hexagon_microkernels.bzl index f72d3b7c683..0d392adc162 100644 --- a/gen/hexagon_microkernels.bzl +++ b/gen/hexagon_microkernels.bzl @@ -9,12 +9,6 @@ PROD_HEXAGON_MICROKERNEL_SRCS = [ ] NON_PROD_HEXAGON_MICROKERNEL_SRCS = [ - "src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x2.c", - "src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x4.c", - "src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x6.c", - "src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x8.c", - "src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x10.c", - "src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x12.c", ] ALL_HEXAGON_MICROKERNEL_SRCS = PROD_HEXAGON_MICROKERNEL_SRCS + NON_PROD_HEXAGON_MICROKERNEL_SRCS diff --git a/gen/neon_microkernels.bzl b/gen/neon_microkernels.bzl index bdb30f7efe8..a7d12c81704 100644 --- a/gen/neon_microkernels.bzl +++ b/gen/neon_microkernels.bzl @@ -180,15 +180,6 @@ PROD_NEON_MICROKERNEL_SRCS = [ ] NON_PROD_NEON_MICROKERNEL_SRCS = [ - "src/cs16-bfly4/cs16-bfly4-neon-x1.c", - "src/cs16-bfly4/cs16-bfly4-neon-x4.c", - "src/cs16-bfly4/cs16-bfly4-samples1-neon.c", - "src/cs16-bfly4/cs16-bfly4-samples4-neon.c", - "src/cs16-fftr/cs16-fftr-neon-x4.c", - "src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x4.c", - "src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x8.c", - "src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x12.c", - "src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x16.c", "src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u8.c", "src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u24.c", "src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u32.c", @@ -487,10 +478,6 @@ NON_PROD_NEON_MICROKERNEL_SRCS = [ "src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-p5-nr2recps-u20.c", "src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-p5-nr2recps-u24.c", "src/f32-vtanh/gen/f32-vtanh-neon-rational-9-8-nr.c", - 
"src/i16-vlshift/gen/i16-vlshift-neon-u8.c", - "src/i16-vlshift/gen/i16-vlshift-neon-u16.c", - "src/i16-vlshift/gen/i16-vlshift-neon-u24.c", - "src/i16-vlshift/gen/i16-vlshift-neon-u32.c", "src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x16-minmax-neon-mlal-lane-prfm.c", "src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-2x16-minmax-neon-mlal-lane-prfm.c", "src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-2x16-minmax-neon-mlal-lane.c", @@ -894,24 +881,6 @@ NON_PROD_NEON_MICROKERNEL_SRCS = [ "src/qu8-vmulc/gen/qu8-vmulc-minmax-rndnu-neon-ld128-u16.c", "src/s8-maxpool/s8-maxpool-2p2x-minmax-neon-c16.c", "src/s8-maxpool/s8-maxpool-4p3x-minmax-neon-c16.c", - "src/s16-rmaxabs/gen/s16-rmaxabs-neon-x8.c", - "src/s16-rmaxabs/gen/s16-rmaxabs-neon-x16.c", - "src/s16-rmaxabs/gen/s16-rmaxabs-neon-x24.c", - "src/s16-rmaxabs/gen/s16-rmaxabs-neon-x32.c", - "src/s16-window/gen/s16-window-neon-u8.c", - "src/s16-window/gen/s16-window-neon-u16.c", - "src/s16-window/gen/s16-window-neon-u24.c", - "src/s16-window/gen/s16-window-neon-u32.c", - "src/s16-window/gen/s16-window-shift12-neon-u8.c", - "src/s16-window/gen/s16-window-shift12-neon-u16.c", - "src/s16-window/gen/s16-window-shift12-neon-u24.c", - "src/s16-window/gen/s16-window-shift12-neon-u32.c", - "src/s16-window/gen/s16-window-shift15-neon-u8.c", - "src/s16-window/gen/s16-window-shift15-neon-u16.c", - "src/s16-window/gen/s16-window-shift15-neon-u24.c", - "src/s16-window/gen/s16-window-shift15-neon-u32.c", - "src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-neon-x1.c", - "src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-neon-x2.c", "src/x8-transposec/gen/x8-transposec-8x8-multi-dec-zip-neon.c", "src/x8-transposec/gen/x8-transposec-8x8-multi-mov-zip-neon.c", "src/x8-transposec/gen/x8-transposec-8x8-multi-switch-zip-neon.c", diff --git a/gen/scalar_microkernels.bzl b/gen/scalar_microkernels.bzl index 323f8931138..ed08da2bc7a 100644 --- a/gen/scalar_microkernels.bzl +++ b/gen/scalar_microkernels.bzl @@ -262,18 +262,6 @@ 
PROD_SCALAR_MICROKERNEL_SRCS = [ ] NON_PROD_SCALAR_MICROKERNEL_SRCS = [ - "src/cs16-bfly4/cs16-bfly4-samples1-scalar.c", - "src/cs16-bfly4/cs16-bfly4-samples4-scalar.c", - "src/cs16-bfly4/gen/cs16-bfly4-scalar-x1.c", - "src/cs16-bfly4/gen/cs16-bfly4-scalar-x2.c", - "src/cs16-bfly4/gen/cs16-bfly4-scalar-x4.c", - "src/cs16-fftr/gen/cs16-fftr-scalar-x1.c", - "src/cs16-fftr/gen/cs16-fftr-scalar-x2.c", - "src/cs16-fftr/gen/cs16-fftr-scalar-x4.c", - "src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x1.c", - "src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x2.c", - "src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x3.c", - "src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x4.c", "src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u2.c", "src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u3.c", "src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u1.c", @@ -605,10 +593,6 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/f32-vsigmoid/gen/f32-vsigmoid-scalar-rr2-p5-div-u4.c", "src/f32-vsqrt/gen/f32-vsqrt-scalar-sqrt-u2.c", "src/f32-vsqrt/gen/f32-vsqrt-scalar-sqrt-u4.c", - "src/i16-vlshift/gen/i16-vlshift-scalar-u1.c", - "src/i16-vlshift/gen/i16-vlshift-scalar-u2.c", - "src/i16-vlshift/gen/i16-vlshift-scalar-u3.c", - "src/i16-vlshift/gen/i16-vlshift-scalar-u4.c", "src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-1x2-minmax-scalar.c", "src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-1x4-minmax-scalar.c", "src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-1x8-minmax-scalar.c", @@ -937,23 +921,8 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-scalar-u2.c", "src/s8-ibilinear/gen/s8-ibilinear-scalar-c2.c", "src/s8-ibilinear/gen/s8-ibilinear-scalar-c4.c", - "src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x1.c", - "src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x2.c", - "src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x3.c", - "src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x4.c", - "src/s16-window/gen/s16-window-scalar-u1.c", - "src/s16-window/gen/s16-window-scalar-u2.c", - "src/s16-window/gen/s16-window-scalar-u3.c", - 
"src/s16-window/gen/s16-window-scalar-u4.c", "src/u8-ibilinear/gen/u8-ibilinear-scalar-c2.c", "src/u8-ibilinear/gen/u8-ibilinear-scalar-c4.c", - "src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-scalar-x1.c", - "src/u32-filterbank-subtract/u32-filterbank-subtract-scalar-x2.c", - "src/u32-vlog/gen/u32-vlog-scalar-x1.c", - "src/u32-vlog/gen/u32-vlog-scalar-x2.c", - "src/u32-vlog/gen/u32-vlog-scalar-x3.c", - "src/u32-vlog/gen/u32-vlog-scalar-x4.c", - "src/u64-u32-vsqrtshift/u64-u32-vsqrtshift-scalar-cvtu32-sqrt-cvtu32f64-u1.c", "src/x8-lut/gen/x8-lut-scalar-u1.c", "src/x8-lut/gen/x8-lut-scalar-u2.c", "src/x8-lut/gen/x8-lut-scalar-u8.c", diff --git a/scripts/generate-cs16-bfly4.sh b/scripts/generate-cs16-bfly4.sh deleted file mode 100755 index 0594754b31b..00000000000 --- a/scripts/generate-cs16-bfly4.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/sh -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -################################### SCALAR ################################### -tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=1 -o src/cs16-bfly4/gen/cs16-bfly4-scalar-x1.c & -tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=2 -o src/cs16-bfly4/gen/cs16-bfly4-scalar-x2.c & -tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=4 -o src/cs16-bfly4/gen/cs16-bfly4-scalar-x4.c & - -wait diff --git a/scripts/generate-cs16-fftr.sh b/scripts/generate-cs16-fftr.sh deleted file mode 100755 index 48dfedd3688..00000000000 --- a/scripts/generate-cs16-fftr.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/sh -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -################################### SCALAR ################################### -tools/xngen src/cs16-fftr/scalar.c.in -D SAMPLE_TILE=1 -o src/cs16-fftr/gen/cs16-fftr-scalar-x1.c & -tools/xngen src/cs16-fftr/scalar.c.in -D SAMPLE_TILE=2 -o src/cs16-fftr/gen/cs16-fftr-scalar-x2.c & -tools/xngen src/cs16-fftr/scalar.c.in -D SAMPLE_TILE=4 -o src/cs16-fftr/gen/cs16-fftr-scalar-x4.c & - -wait diff --git a/scripts/generate-cs16-vsquareabs.sh b/scripts/generate-cs16-vsquareabs.sh deleted file mode 100755 index 1af4d9e8451..00000000000 --- a/scripts/generate-cs16-vsquareabs.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/sh -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -################################### Scalar ################################### -tools/xngen src/cs16-vsquareabs/scalar.c.in -D BATCH_TILE=1 -o src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x1.c & -tools/xngen src/cs16-vsquareabs/scalar.c.in -D BATCH_TILE=2 -o src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x2.c & -tools/xngen src/cs16-vsquareabs/scalar.c.in -D BATCH_TILE=3 -o src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x3.c & -tools/xngen src/cs16-vsquareabs/scalar.c.in -D BATCH_TILE=4 -o src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x4.c & - -################################### NEON ################################### -tools/xngen src/cs16-vsquareabs/neon.c.in -D BATCH_TILE=4 -o src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x4.c & -tools/xngen src/cs16-vsquareabs/neon.c.in -D BATCH_TILE=8 -o src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x8.c & -tools/xngen src/cs16-vsquareabs/neon.c.in -D BATCH_TILE=12 -o src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x12.c & -tools/xngen src/cs16-vsquareabs/neon.c.in -D BATCH_TILE=16 -o src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x16.c & - -################################### Hexagon 
################################### -tools/xngen src/cs16-vsquareabs/hexagon.c.in -D BATCH_TILE=2 -o src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x2.c & -tools/xngen src/cs16-vsquareabs/hexagon.c.in -D BATCH_TILE=4 -o src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x4.c & -tools/xngen src/cs16-vsquareabs/hexagon.c.in -D BATCH_TILE=6 -o src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x6.c & -tools/xngen src/cs16-vsquareabs/hexagon.c.in -D BATCH_TILE=8 -o src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x8.c & -tools/xngen src/cs16-vsquareabs/hexagon.c.in -D BATCH_TILE=10 -o src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x10.c & -tools/xngen src/cs16-vsquareabs/hexagon.c.in -D BATCH_TILE=12 -o src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x12.c & - -wait diff --git a/scripts/generate-i16-vlshift.sh b/scripts/generate-i16-vlshift.sh deleted file mode 100755 index 3b0ac51cfa9..00000000000 --- a/scripts/generate-i16-vlshift.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/sh -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -################################### SCALAR ################################### -tools/xngen src/i16-vlshift/scalar.c.in -D BATCH_TILE=1 -o src/i16-vlshift/gen/i16-vlshift-scalar-u1.c & -tools/xngen src/i16-vlshift/scalar.c.in -D BATCH_TILE=2 -o src/i16-vlshift/gen/i16-vlshift-scalar-u2.c & -tools/xngen src/i16-vlshift/scalar.c.in -D BATCH_TILE=3 -o src/i16-vlshift/gen/i16-vlshift-scalar-u3.c & -tools/xngen src/i16-vlshift/scalar.c.in -D BATCH_TILE=4 -o src/i16-vlshift/gen/i16-vlshift-scalar-u4.c & - -################################### NEON ################################### -tools/xngen src/i16-vlshift/neon.c.in -D BATCH_TILE=8 -o src/i16-vlshift/gen/i16-vlshift-neon-u8.c & -tools/xngen src/i16-vlshift/neon.c.in -D BATCH_TILE=16 -o src/i16-vlshift/gen/i16-vlshift-neon-u16.c & -tools/xngen src/i16-vlshift/neon.c.in -D BATCH_TILE=24 -o src/i16-vlshift/gen/i16-vlshift-neon-u24.c & -tools/xngen src/i16-vlshift/neon.c.in -D BATCH_TILE=32 -o src/i16-vlshift/gen/i16-vlshift-neon-u32.c & - -wait diff --git a/scripts/generate-s16-rmaxabs.sh b/scripts/generate-s16-rmaxabs.sh deleted file mode 100755 index 31ec7ed1482..00000000000 --- a/scripts/generate-s16-rmaxabs.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/sh -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -################################### SCALAR ################################### -tools/xngen src/s16-rmaxabs/scalar.c.in -D BATCH_TILE=1 -o src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x1.c & -tools/xngen src/s16-rmaxabs/scalar.c.in -D BATCH_TILE=2 -o src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x2.c & -tools/xngen src/s16-rmaxabs/scalar.c.in -D BATCH_TILE=3 -o src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x3.c & -tools/xngen src/s16-rmaxabs/scalar.c.in -D BATCH_TILE=4 -o src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x4.c & - -################################### NEON ################################### -tools/xngen src/s16-rmaxabs/neon.c.in -D BATCH_TILE=8 -o src/s16-rmaxabs/gen/s16-rmaxabs-neon-x8.c & -tools/xngen src/s16-rmaxabs/neon.c.in -D BATCH_TILE=16 -o src/s16-rmaxabs/gen/s16-rmaxabs-neon-x16.c & -tools/xngen src/s16-rmaxabs/neon.c.in -D BATCH_TILE=24 -o src/s16-rmaxabs/gen/s16-rmaxabs-neon-x24.c & -tools/xngen src/s16-rmaxabs/neon.c.in -D BATCH_TILE=32 -o src/s16-rmaxabs/gen/s16-rmaxabs-neon-x32.c & - -wait diff --git a/scripts/generate-s16-window.sh b/scripts/generate-s16-window.sh deleted file mode 100755 index fc12bbaf1a3..00000000000 --- a/scripts/generate-s16-window.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/sh -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -################################### SCALAR ################################### -tools/xngen src/s16-window/scalar.c.in -D CHANNEL_TILE=1 -o src/s16-window/gen/s16-window-scalar-u1.c & -tools/xngen src/s16-window/scalar.c.in -D CHANNEL_TILE=2 -o src/s16-window/gen/s16-window-scalar-u2.c & -tools/xngen src/s16-window/scalar.c.in -D CHANNEL_TILE=3 -o src/s16-window/gen/s16-window-scalar-u3.c & -tools/xngen src/s16-window/scalar.c.in -D CHANNEL_TILE=4 -o src/s16-window/gen/s16-window-scalar-u4.c & - -################################### NEON ################################### -tools/xngen src/s16-window/neon.c.in -D CHANNEL_TILE=8 -D SHIFT=0 -o src/s16-window/gen/s16-window-neon-u8.c & -tools/xngen src/s16-window/neon.c.in -D CHANNEL_TILE=16 -D SHIFT=0 -o src/s16-window/gen/s16-window-neon-u16.c & -tools/xngen src/s16-window/neon.c.in -D CHANNEL_TILE=24 -D SHIFT=0 -o src/s16-window/gen/s16-window-neon-u24.c & -tools/xngen src/s16-window/neon.c.in -D CHANNEL_TILE=32 -D SHIFT=0 -o src/s16-window/gen/s16-window-neon-u32.c & - -tools/xngen src/s16-window/neon.c.in -D CHANNEL_TILE=8 -D SHIFT=12 -o src/s16-window/gen/s16-window-shift12-neon-u8.c & -tools/xngen src/s16-window/neon.c.in -D CHANNEL_TILE=16 -D SHIFT=12 -o src/s16-window/gen/s16-window-shift12-neon-u16.c & -tools/xngen src/s16-window/neon.c.in -D CHANNEL_TILE=24 -D SHIFT=12 -o src/s16-window/gen/s16-window-shift12-neon-u24.c & -tools/xngen src/s16-window/neon.c.in -D CHANNEL_TILE=32 -D SHIFT=12 -o src/s16-window/gen/s16-window-shift12-neon-u32.c & - -tools/xngen src/s16-window/neon.c.in -D CHANNEL_TILE=8 -D SHIFT=15 -o src/s16-window/gen/s16-window-shift15-neon-u8.c & -tools/xngen src/s16-window/neon.c.in -D CHANNEL_TILE=16 -D SHIFT=15 -o src/s16-window/gen/s16-window-shift15-neon-u16.c & -tools/xngen src/s16-window/neon.c.in -D CHANNEL_TILE=24 -D SHIFT=15 -o src/s16-window/gen/s16-window-shift15-neon-u24.c & -tools/xngen src/s16-window/neon.c.in -D CHANNEL_TILE=32 -D SHIFT=15 -o 
src/s16-window/gen/s16-window-shift15-neon-u32.c & - -wait diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh index 53c91f46a63..8e44081cac3 100755 --- a/scripts/generate-tests.sh +++ b/scripts/generate-tests.sh @@ -257,21 +257,12 @@ tools/generate-dwconv-multipass-test.py --spec test/qu8-dwconv-minmax-multipass- tools/generate-dwconv2d-chw-test.py --spec test/f16-dwconv2d-chw.yaml --output test/f16-dwconv2d-chw.cc & tools/generate-dwconv2d-chw-test.py --spec test/f32-dwconv2d-chw.yaml --output test/f32-dwconv2d-chw.cc & -### Tests for VLShift micro-kernels -tools/generate-vlshift-test.py --spec test/i16-vlshift.yaml --output test/i16-vlshift.cc & - -### Tests for VLog micro-kernels -tools/generate-vlog-test.py --spec test/u32-vlog.yaml --output test/u32-vlog.cc & - ### Tests for VHSwish micro-kernels tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f16-vhswish --output test/f16-vhswish.cc & tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vhswish --output test/f32-vhswish.cc & tools/generate-vhswish-test.py --spec test/qs8-vhswish.yaml --output test/qs8-vhswish.cc & tools/generate-vhswish-test.py --spec test/qu8-vhswish.yaml --output test/qu8-vhswish.cc & -### Tests for Window micro-kernels -tools/generate-window-test.py --spec test/s16-window.yaml --output test/s16-window.cc & - ### Tests for IBilinear micro-kernels tools/generate-ibilinear-test.py --spec test/f16-ibilinear.yaml --output test/f16-ibilinear.cc & tools/generate-ibilinear-test.py --spec test/f32-ibilinear.yaml --output test/f32-ibilinear.cc & @@ -286,12 +277,6 @@ tools/generate-ibilinear-chw-test.py --spec test/f32-ibilinear-chw.yaml --output tools/generate-prelu-test.py --spec test/f16-prelu.yaml --output test/f16-prelu.cc & tools/generate-prelu-test.py --spec test/f32-prelu.yaml --output test/f32-prelu.cc & -### Tests for FFTR micro-kernels -tools/generate-fftr-test.py --spec test/cs16-fftr.yaml --output test/cs16-fftr.cc & - -### 
Tests for BFly4 micro-kernels -tools/generate-bfly4-test.py --spec test/cs16-bfly4.yaml --output test/cs16-bfly4.cc & - ### Tests for RAddExpMinusMax micro-kernels tools/generate-raddexpminusmax-test.py --spec test/f32-raddexpminusmax.yaml --output test/f32-raddexpminusmax.cc & @@ -308,22 +293,10 @@ tools/generate-vscaleextexp-test.py --spec test/f32-vscaleextexp.yaml --output t ### Tests for VScaleExpMinusMax micro-kernels tools/generate-vscaleexpminusmax-test.py --spec test/f32-vscaleexpminusmax.yaml --output test/f32-vscaleexpminusmax.cc & -### Tests for RMaxAbs micro-kernels -tools/generate-rmaxabs-test.py --spec test/s16-rmaxabs.yaml --output test/s16-rmaxabs.cc & - ### Tests for VMulCAddC micro-kernels tools/generate-vmulcaddc-test.py --spec test/f16-vmulcaddc-minmax.yaml --output test/f16-vmulcaddc-minmax.cc & tools/generate-vmulcaddc-test.py --spec test/f32-vmulcaddc-minmax.yaml --output test/f32-vmulcaddc-minmax.cc & -### Tests for VSquareAbs micro-kernels -tools/generate-vsquareabs-test.py --spec test/cs16-vsquareabs.yaml --output test/cs16-vsquareabs.cc & - -### Tests for FilterBank accumulate micro-kernels -tools/generate-filterbank-accumulate-test.py --spec test/u32-filterbank-accumulate.yaml --output test/u32-filterbank-accumulate.cc & - -### Tests for FilterBank subtract micro-kernels -tools/generate-filterbank-subtract-test.py --spec test/u32-filterbank-subtract.yaml --output test/u32-filterbank-subtract.cc & - ### Tests for the portable SIMD wrappers. 
tools/xngen test/f32-simd.cc.in -D ARCH=scalar -D ARCH_MACRO="" -D TEST_REQUIRES="" -o test/f32-simd-scalar.cc & tools/xngen test/f32-simd.cc.in -D ARCH=sse2 -D ARCH_MACRO="XNN_ARCH_X86 || XNN_ARCH_X86_64" -D TEST_REQUIRES=TEST_REQUIRES_X86_SSE2 -o test/f32-simd-sse2.cc & diff --git a/scripts/generate-u32-filterbank-accumulate.sh b/scripts/generate-u32-filterbank-accumulate.sh deleted file mode 100755 index 81dd3afbc88..00000000000 --- a/scripts/generate-u32-filterbank-accumulate.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -################################### SCALAR ################################### -tools/xngen src/u32-filterbank-accumulate/scalar.c.in -D BATCH_TILE=1 -o src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-scalar-x1.c & - -################################### NEON ################################### -tools/xngen src/u32-filterbank-accumulate/neon.c.in -D BATCH_TILE=1 -o src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-neon-x1.c & -tools/xngen src/u32-filterbank-accumulate/neon.c.in -D BATCH_TILE=2 -o src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-neon-x2.c & - -wait diff --git a/scripts/generate-u32-vlog.sh b/scripts/generate-u32-vlog.sh deleted file mode 100755 index f7afcb534f2..00000000000 --- a/scripts/generate-u32-vlog.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/sh -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -################################### SCALAR ################################### -tools/xngen src/u32-vlog/scalar.c.in -D BATCH_TILE=1 -o src/u32-vlog/gen/u32-vlog-scalar-x1.c & -tools/xngen src/u32-vlog/scalar.c.in -D BATCH_TILE=2 -o src/u32-vlog/gen/u32-vlog-scalar-x2.c & -tools/xngen src/u32-vlog/scalar.c.in -D BATCH_TILE=3 -o src/u32-vlog/gen/u32-vlog-scalar-x3.c & -tools/xngen src/u32-vlog/scalar.c.in -D BATCH_TILE=4 -o src/u32-vlog/gen/u32-vlog-scalar-x4.c & - -wait diff --git a/src/cs16-bfly4/cs16-bfly4-neon-x1.c b/src/cs16-bfly4/cs16-bfly4-neon-x1.c deleted file mode 100644 index ea29eab1aca..00000000000 --- a/src/cs16-bfly4/cs16-bfly4-neon-x1.c +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/fft.h" - -#include - - -void xnn_cs16_bfly4_ukernel__neon_x1( - size_t batch, - size_t samples, - int16_t* data, - const int16_t* twiddle, - size_t stride) -{ - assert(batch != 0); - assert(samples != 0); - assert(samples % (sizeof(int16_t) * 2) == 0); - assert(data != NULL); - assert(stride != 0); - assert(twiddle != NULL); - - const int16x4_t vdiv4 = vdup_n_s16(8191); - const int16x4_t vnegr = vreinterpret_s16_u32(vdup_n_u32(0x0001ffff)); - - int16_t* data3 = data; - - do { - int16_t* data0 = data3; - int16_t* data1 = (int16_t*) ((uintptr_t) data0 + samples); - int16_t* data2 = (int16_t*) ((uintptr_t) data1 + samples); - data3 = (int16_t*) ((uintptr_t) data2 + samples); - - // First sample skips twiddle. 
- { - int16x4_t vout0 = vreinterpret_s16_u32(vld1_dup_u32((void*) data0)); - int16x4_t vout1 = vreinterpret_s16_u32(vld1_dup_u32((void*) data1)); - int16x4_t vout2 = vreinterpret_s16_u32(vld1_dup_u32((void*) data2)); - int16x4_t vout3 = vreinterpret_s16_u32(vld1_dup_u32((void*) data3)); - - vout1 = vqrdmulh_s16(vout1, vdiv4); - vout3 = vqrdmulh_s16(vout3, vdiv4); - vout0 = vqrdmulh_s16(vout0, vdiv4); - vout2 = vqrdmulh_s16(vout2, vdiv4); - - const int16x4_t vtmp4 = vsub_s16(vout1, vout3); - const int16x4_t vtmp3 = vadd_s16(vout1, vout3); - - int16x4_t vrev4 = vmul_s16(vtmp4, vnegr); // vrev4 = vtmp4 -r, i - const int16x4_t vtmp5 = vsub_s16(vout0, vout2); - vout0 = vadd_s16(vout0, vout2); - vrev4 = vrev32_s16(vrev4); // vrev4 = vtmp4 i, -r - - vout2 = vsub_s16(vout0, vtmp3); - vout0 = vadd_s16(vout0, vtmp3); - vout1 = vadd_s16(vtmp5, vrev4); - vout3 = vsub_s16(vtmp5, vrev4); - - vst1_lane_u32((void*) data0, vreinterpret_u32_s16(vout0), 0); data0 += 2; - vst1_lane_u32((void*) data1, vreinterpret_u32_s16(vout1), 0); data1 += 2; - vst1_lane_u32((void*) data2, vreinterpret_u32_s16(vout2), 0); data2 += 2; - vst1_lane_u32((void*) data3, vreinterpret_u32_s16(vout3), 0); data3 += 2; - } - - size_t s = samples - sizeof(int16_t) * 2; - - if XNN_LIKELY(s != 0) { - - const int16_t* tw1 = (const int16_t*) ((uintptr_t) twiddle + stride); - const int16_t* tw2 = (const int16_t*) ((uintptr_t) twiddle + stride * 2); - const int16_t* tw3 = (const int16_t*) ((uintptr_t) twiddle + stride * 3); - - do { - int16x4_t vout0 = vreinterpret_s16_u32(vld1_dup_u32((void*) data0)); - int16x4_t vout1 = vreinterpret_s16_u32(vld1_dup_u32((void*) data1)); - int16x4_t vout2 = vreinterpret_s16_u32(vld1_dup_u32((void*) data2)); - int16x4_t vout3 = vreinterpret_s16_u32(vld1_dup_u32((void*) data3)); - - const int16x4_t vtw1 = vreinterpret_s16_u32(vld1_dup_u32((const void*) tw1)); - const int16x4_t vtw2 = vreinterpret_s16_u32(vld1_dup_u32((const void*) tw2)); - const int16x4_t vtw3 = 
vreinterpret_s16_u32(vld1_dup_u32((const void*) tw3)); - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - - // Note 32767 / 4 = 8191. Should be 8192. - vout0 = vqrdmulh_s16(vout0, vdiv4); - vout1 = vqrdmulh_s16(vout1, vdiv4); - vout2 = vqrdmulh_s16(vout2, vdiv4); - vout3 = vqrdmulh_s16(vout3, vdiv4); - - int16x4_t vnegtw1 = vmul_s16(vtw1, vnegr); // vrevtw1 = vtw1 -r, i - int16x4_t vnegtw2 = vmul_s16(vtw2, vnegr); // vrevtw2 = vtw2 -r, i - int16x4_t vnegtw3 = vmul_s16(vtw3, vnegr); // vrevtw3 = vtw3 -r, i - int32x4_t vaccr1 = vmull_lane_s16(vtw1, vout1, 0); - int32x4_t vaccr2 = vmull_lane_s16(vtw2, vout2, 0); - int32x4_t vaccr3 = vmull_lane_s16(vtw3, vout3, 0); - int16x4_t vrevtw1 = vrev32_s16(vnegtw1); // vrevtw1 = vtw1 i, -r - int16x4_t vrevtw2 = vrev32_s16(vnegtw2); // vrevtw2 = vtw2 i, -r - int16x4_t vrevtw3 = vrev32_s16(vnegtw3); // vrevtw3 = vtw3 i, -r - vaccr1 = vmlsl_lane_s16(vaccr1, vrevtw1, vout1, 1); - vaccr2 = vmlsl_lane_s16(vaccr2, vrevtw2, vout2, 1); - vaccr3 = vmlsl_lane_s16(vaccr3, vrevtw3, vout3, 1); - const int16x4_t vtmp0 = vrshrn_n_s32(vaccr1, 15); - const int16x4_t vtmp1 = vrshrn_n_s32(vaccr2, 15); - const int16x4_t vtmp2 = vrshrn_n_s32(vaccr3, 15); - - const int16x4_t vtmp4 = vsub_s16(vtmp0, vtmp2); - const int16x4_t vtmp3 = vadd_s16(vtmp0, vtmp2); - - int16x4_t vrev4 = vmul_s16(vtmp4, vnegr); // vrev4 = vtmp4 -r, i - const int16x4_t vtmp5 = vsub_s16(vout0, vtmp1); - vout0 = vadd_s16(vout0, vtmp1); - vrev4 = vrev32_s16(vrev4); // vrev4 = vtmp4 i, -r - - vout2 = vsub_s16(vout0, vtmp3); - vout0 = vadd_s16(vout0, vtmp3); - vout1 = vadd_s16(vtmp5, vrev4); - vout3 = vsub_s16(vtmp5, vrev4); - - vst1_lane_u32((void*) data0, vreinterpret_u32_s16(vout0), 0); data0 += 2; - vst1_lane_u32((void*) data1, vreinterpret_u32_s16(vout1), 0); data1 += 2; - vst1_lane_u32((void*) data2, vreinterpret_u32_s16(vout2), 0); data2 += 2; - 
vst1_lane_u32((void*) data3, vreinterpret_u32_s16(vout3), 0); data3 += 2; - - s -= sizeof(int16_t) * 2; - } while (s != 0); - } - } while (--batch != 0); -} - diff --git a/src/cs16-bfly4/cs16-bfly4-neon-x4.c b/src/cs16-bfly4/cs16-bfly4-neon-x4.c deleted file mode 100644 index 7cfa64e11ad..00000000000 --- a/src/cs16-bfly4/cs16-bfly4-neon-x4.c +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/fft.h" - -#include - -void xnn_cs16_bfly4_ukernel__neon_x4( - size_t batch, - size_t samples, - int16_t* data, - const int16_t* twiddle, - size_t stride) -{ - assert(batch != 0); - assert(samples != 0); - assert(samples % (sizeof(int16_t) * 8) == 0); - assert(data != NULL); - assert(stride != 0); - assert(twiddle != NULL); - - const int16x4_t vdiv4 = vdup_n_s16(8191); - - int16_t* data3 = data; - do { - int16_t* data0 = data3; - int16_t* data1 = (int16_t*) ((uintptr_t) data0 + samples); - int16_t* data2 = (int16_t*) ((uintptr_t) data1 + samples); - data3 = (int16_t*) ((uintptr_t) data2 + samples); - - const int16_t* tw1 = twiddle; - const int16_t* tw2 = twiddle; - const int16_t* tw3 = twiddle; - - size_t s = samples; - for (; s >= sizeof(int16_t) * 8; s -= sizeof(int16_t) * 8) { - int16x4x2_t vout0 = vld2_s16(data0); - int16x4x2_t vout1 = vld2_s16(data1); - int16x4x2_t vout2 = vld2_s16(data2); - int16x4x2_t vout3 = vld2_s16(data3); - - int16x4x2_t vtw1 = vld2_dup_s16(tw1); - int16x4x2_t vtw2 = vld2_dup_s16(tw2); - int16x4x2_t vtw3 = vld2_dup_s16(tw3); - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - vtw1 = vld2_lane_s16(tw1, vtw1, 1); - vtw2 = vld2_lane_s16(tw2, vtw2, 1); - vtw3 = vld2_lane_s16(tw3, vtw3, 1); - tw1 = (const int16_t*) 
((uintptr_t) tw1 + stride); - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - vtw1 = vld2_lane_s16(tw1, vtw1, 2); - vtw2 = vld2_lane_s16(tw2, vtw2, 2); - vtw3 = vld2_lane_s16(tw3, vtw3, 2); - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - vtw1 = vld2_lane_s16(tw1, vtw1, 3); - vtw2 = vld2_lane_s16(tw2, vtw2, 3); - vtw3 = vld2_lane_s16(tw3, vtw3, 3); - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - - // Note 32767 / 4 = 8191. Should be 8192. - vout1.val[0] = vqrdmulh_s16(vout1.val[0], vdiv4); - vout1.val[1] = vqrdmulh_s16(vout1.val[1], vdiv4); - vout2.val[0] = vqrdmulh_s16(vout2.val[0], vdiv4); - vout2.val[1] = vqrdmulh_s16(vout2.val[1], vdiv4); - vout3.val[0] = vqrdmulh_s16(vout3.val[0], vdiv4); - vout3.val[1] = vqrdmulh_s16(vout3.val[1], vdiv4); - vout0.val[0] = vqrdmulh_s16(vout0.val[0], vdiv4); - vout0.val[1] = vqrdmulh_s16(vout0.val[1], vdiv4); - - int32x4_t vacc0r = vmull_s16(vout1.val[0], vtw1.val[0]); - int32x4_t vacc1r = vmull_s16(vout2.val[0], vtw2.val[0]); - int32x4_t vacc2r = vmull_s16(vout3.val[0], vtw3.val[0]); - int32x4_t vacc0i = vmull_s16(vout1.val[0], vtw1.val[1]); - int32x4_t vacc1i = vmull_s16(vout2.val[0], vtw2.val[1]); - int32x4_t vacc2i = vmull_s16(vout3.val[0], vtw3.val[1]); - vacc0r = vmlsl_s16(vacc0r, vout1.val[1], vtw1.val[1]); - vacc1r = vmlsl_s16(vacc1r, vout2.val[1], vtw2.val[1]); - vacc2r = vmlsl_s16(vacc2r, vout3.val[1], vtw3.val[1]); - vacc0i = vmlal_s16(vacc0i, vout1.val[1], vtw1.val[0]); - vacc1i = vmlal_s16(vacc1i, vout2.val[1], vtw2.val[0]); - vacc2i = vmlal_s16(vacc2i, vout3.val[1], vtw3.val[0]); - int16x4_t vtmp0r = vrshrn_n_s32(vacc0r, 15); - int16x4_t vtmp1r = vrshrn_n_s32(vacc1r, 15); - int16x4_t vtmp2r = vrshrn_n_s32(vacc2r, 
15); - int16x4_t vtmp0i = vrshrn_n_s32(vacc0i, 15); - int16x4_t vtmp1i = vrshrn_n_s32(vacc1i, 15); - int16x4_t vtmp2i = vrshrn_n_s32(vacc2i, 15); - - const int16x4_t vtmp4r = vsub_s16(vtmp0r, vtmp2r); - const int16x4_t vtmp4i = vsub_s16(vtmp0i, vtmp2i); - const int16x4_t vtmp3r = vadd_s16(vtmp0r, vtmp2r); - const int16x4_t vtmp3i = vadd_s16(vtmp0i, vtmp2i); - - const int16x4_t vtmp5r = vsub_s16(vout0.val[0], vtmp1r); - const int16x4_t vtmp5i = vsub_s16(vout0.val[1], vtmp1i); - vout0.val[0] = vadd_s16(vout0.val[0], vtmp1r); - vout0.val[1] = vadd_s16(vout0.val[1], vtmp1i); - - vout2.val[0] = vsub_s16(vout0.val[0], vtmp3r); - vout2.val[1] = vsub_s16(vout0.val[1], vtmp3i); - vout0.val[0] = vadd_s16(vout0.val[0], vtmp3r); - vout0.val[1] = vadd_s16(vout0.val[1], vtmp3i); - - vout1.val[0] = vadd_s16(vtmp5r, vtmp4i); - vout1.val[1] = vsub_s16(vtmp5i, vtmp4r); - vout3.val[0] = vsub_s16(vtmp5r, vtmp4i); - vout3.val[1] = vadd_s16(vtmp5i, vtmp4r); - - vst2_s16(data0, vout0); data0 += 8; - vst2_s16(data1, vout1); data1 += 8; - vst2_s16(data2, vout2); data2 += 8; - vst2_s16(data3, vout3); data3 += 8; - } - } while (--batch != 0); -} diff --git a/src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x1.S b/src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x1.S deleted file mode 100644 index af22d6e97fb..00000000000 --- a/src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x1.S +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "xnnpack/assembly.h" - -.syntax unified - -// void xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x1( -// size_t batch, r0 -// size_t samples, (unused) -// int16_t* data, r2 -// const int16_t* twiddle, (unused) -// size_t stride) (unused) - -// d8-d15, r12-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved. 
- -// Register usage -// vout0 r2 d0 -// vout1 d1 -// vout2 d2 -// vout3 d3 -// vtmp3 d4 -// vtmp4 d5 -// vtmp5 d6 -// vtmp0 d7 - -// vdiv4 d16 -// vnegr d17 - -BEGIN_FUNCTION xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x1 - .arm -#ifndef __APPLE__ - .arch armv7-a - .fpu neon -#endif - VMVN.U16 d16, 57344 // 8191 - VMOV.I32 d17, 0x0001ffff // vnegr - - // Remainder batch of 1 -0: - VLD4.32 {d0[0],d1[0],d2[0],d3[0]}, [r2] // input 1 batch - SUBS r0, r0, 1 // batch - - VQRDMULH.S16 d1, d1, d16 // vout1 /= 4 - VQRDMULH.S16 d3, d3, d16 // vout3 /= 4 - VQRDMULH.S16 d0, d0, d16 // vout0 /= 4 - VQRDMULH.S16 d2, d2, d16 // vout2 /= 4 - - VSUB.I16 d5, d1, d3 // vtmp4 = vout1 - vout3 - VADD.I16 d4, d1, d3 // vtmp3 = vout1 + vout3 - - VMUL.S16 d5, d5, d17 // vrev4 = vtmp4 -r, i - VADD.I16 d7, d0, d2 // vtmp0 = vout0 + vout2 - VSUB.I16 d6, d0, d2 // vtmp5 = vout0 - vout2 - - VADD.I16 d0, d7, d4 // vout0 = vtmp0 + vtmp3 - VSUB.I16 d2, d7, d4 // vout2 = vtmp0 - vtmp3 - - VREV32.16 d5, d5 // vrev4 = vtmp4 i, -r - VADD.I16 d1, d6, d5 // vout1 = vtmp5 + vrev4 - VSUB.I16 d3, d6, d5 // vout3 = vtmp5 - vrev4 - - VST4.32 {d0[0],d1[0],d2[0],d3[0]}, [r2]! // output 1 batch - BHI 0b - - BX lr - -END_FUNCTION xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x1 - -#ifdef __ELF__ -.section ".note.GNU-stack","",%progbits -#endif diff --git a/src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x2.S b/src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x2.S deleted file mode 100644 index bbc86b85ca0..00000000000 --- a/src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x2.S +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include "xnnpack/assembly.h" - -.syntax unified - -// void xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x2( -// size_t batch, r0 -// size_t samples, (unused) -// int16_t* data, r2 -// const int16_t* twiddle, (unused) -// size_t stride) (unused) - -// d8-d15, r12-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved. - -// Register usage -// vout0 r2 d0 -// vout1 d1 -// vout2 d2 -// vout3 d3 -// vtmp3 d4 -// vtmp4 d5 -// vtmp5 d6 -// vtmp0 d7 - -// vdiv4 d16 -// vnegr d17 - -BEGIN_FUNCTION xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x2 - .arm -#ifndef __APPLE__ - .arch armv7-a - .fpu neon -#endif - SUBS r0, r0, 1 // batch - 1 - VMVN.U16 d16, 57344 // 8191 - VMOV.I32 d17, 0x0001ffff // vnegr - BLS 1f - - // Batch of 2 main loop -0: - VLD4.32 {d0,d1,d2,d3}, [r2] // input 2 batches - SUBS r0, r0, 2 // batch - - VQRDMULH.S16 d1, d1, d16 // vout1 /= 4 - VQRDMULH.S16 d3, d3, d16 // vout3 /= 4 - VQRDMULH.S16 d0, d0, d16 // vout0 /= 4 - VQRDMULH.S16 d2, d2, d16 // vout2 /= 4 - - VSUB.I16 d5, d1, d3 // vtmp4 = vout1 - vout3 - VADD.I16 d4, d1, d3 // vtmp3 = vout1 + vout3 - - VMUL.S16 d5, d5, d17 // vrev4 = vtmp4 -r, i - VADD.I16 d7, d0, d2 // vtmp0 = vout0 + vout2 - VSUB.I16 d6, d0, d2 // vtmp5 = vout0 - vout2 - - VADD.I16 d0, d7, d4 // vout0 = vtmp0 + vtmp3 - VSUB.I16 d2, d7, d4 // vout2 = vtmp0 - vtmp3 - - VREV32.16 d5, d5 // vrev4 = vtmp4 i, -r - VADD.I16 d1, d6, d5 // vout1 = vtmp5 + vrev4 - VSUB.I16 d3, d6, d5 // vout3 = vtmp5 - vrev4 - - VST4.32 {d0,d1,d2,d3}, [r2]! // output 2 batches - BHI 0b - - BXLO lr // no remainder? 
early return - - // Remainder batch of 1 -1: - VLD4.32 {d0[0],d1[0],d2[0],d3[0]}, [r2] // input 1 batch - - VQRDMULH.S16 d1, d1, d16 // vout1 /= 4 - VQRDMULH.S16 d3, d3, d16 // vout3 /= 4 - VQRDMULH.S16 d0, d0, d16 // vout0 /= 4 - VQRDMULH.S16 d2, d2, d16 // vout2 /= 4 - - VSUB.I16 d5, d1, d3 // vtmp4 = vout1 - vout3 - VADD.I16 d4, d1, d3 // vtmp3 = vout1 + vout3 - - VMUL.S16 d5, d5, d17 // vrev4 = vtmp4 -r, i - VADD.I16 d7, d0, d2 // vtmp0 = vout0 + vout2 - VSUB.I16 d6, d0, d2 // vtmp5 = vout0 - vout2 - - VADD.I16 d0, d7, d4 // vout0 = vtmp0 + vtmp3 - VSUB.I16 d2, d7, d4 // vout2 = vtmp0 - vtmp3 - - VREV32.16 d5, d5 // vrev4 = vtmp4 i, -r - VADD.I16 d1, d6, d5 // vout1 = vtmp5 + vrev4 - VSUB.I16 d3, d6, d5 // vout3 = vtmp5 - vrev4 - - VST4.32 {d0[0],d1[0],d2[0],d3[0]}, [r2] // output 1 batch - BX lr - -END_FUNCTION xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x2 - -#ifdef __ELF__ -.section ".note.GNU-stack","",%progbits -#endif diff --git a/src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x4.S b/src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x4.S deleted file mode 100644 index 9f4efa7af58..00000000000 --- a/src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x4.S +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "xnnpack/assembly.h" - -.syntax unified - -// void xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x4( -// size_t batch, r0 -// size_t samples, (unused) -// int16_t* data, r2 -// const int16_t* twiddle, (unused) -// size_t stride) (unused) - -// d8-d15, r12-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved. 
- -// Register usage -// vout0 r2 q0 -// vout1 q1 -// vout2 q2 -// vout3 q3 -// vtmp3 q8 -// vtmp4 q9 -// vtmp5 q10 -// vtmp0 q11 - -// vdiv4 q12 -// vnegr q13 - -BEGIN_FUNCTION xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x4 - .arm -#ifndef __APPLE__ - .arch armv7-a - .fpu neon -#endif - SUBS r0, r0, 4 // batch - VMVN.U16 q12, 57344 // 8191 - VMOV.I32 q13, 0x0001ffff // vnegr - BLO 1f - - MOV r3, r2 // output = input for post inc - - // batch of 4 main loop -0: - VLD4.32 {d0,d2,d4,d6}, [r2]! // input first 2 batch - VLD4.32 {d1,d3,d5,d7}, [r2]! // input second 2 batch - SUBS r0, r0, 4 // batch - VQRDMULH.S16 q1, q1, q12 // vout1 /= 4 - VQRDMULH.S16 q3, q3, q12 // vout3 /= 4 - VQRDMULH.S16 q0, q0, q12 // vout0 /= 4 - VQRDMULH.S16 q2, q2, q12 // vout2 /= 4 - - VSUB.I16 q9, q1, q3 // vtmp4 = vout1 - vout3 - VADD.I16 q8, q1, q3 // vtmp3 = vout1 + vout3 - - VMUL.S16 q9, q9, q13 // vrev4 = vtmp4 -r, i - VADD.I16 q11, q0, q2 // vtmp0 = vout0 + vout2 - VSUB.I16 q10, q0, q2 // vtmp5 = vout0 - vout2 - - VADD.I16 q0, q11, q8 // vout0 = vtmp0 + vtmp3 - VSUB.I16 q2, q11, q8 // vout2 = vtmp0 - vtmp3 - - VREV32.16 q9, q9 // vrev4 = vtmp4 i, -r - VADD.I16 q1, q10, q9 // vout1 = vtmp5 + vrev4 - VSUB.I16 q3, q10, q9 // vout3 = vtmp5 - vrev4 - - VST4.32 {d0,d2,d4,d6}, [r3]! // output first 2 batch - VST4.32 {d1,d3,d5,d7}, [r3]! // output second 2 batch - - BHS 0b - -1: - ANDS r0, r0, 3 // batch remainder? 
- BXEQ lr - - // Remainder batch of 1 to 3 -2: - VLD4.32 {d0[0],d1[0],d2[0],d3[0]}, [r2] // input 1 batch - SUBS r0, r0, 1 // batch - VQRDMULH.S16 d1, d1, d24 // vout1 /= 4 - VQRDMULH.S16 d3, d3, d24 // vout3 /= 4 - VQRDMULH.S16 d0, d0, d24 // vout0 /= 4 - VQRDMULH.S16 d2, d2, d24 // vout2 /= 4 - - VSUB.I16 d5, d1, d3 // vtmp4 = vout1 - vout3 - VADD.I16 d4, d1, d3 // vtmp3 = vout1 + vout3 - VMUL.S16 d5, d5, d26 // vrev4 = vtmp4 -r, i - - VADD.I16 d7, d0, d2 // vtmp0 = vout0 + vout2 - VSUB.I16 d6, d0, d2 // vtmp5 = vout0 - vout2 - - VADD.I16 d0, d7, d4 // vout0 = vtmp0 + vtmp3 - VSUB.I16 d2, d7, d4 // vout2 = vtmp0 - vtmp3 - - VREV32.16 d5, d5 // vrev4 = vtmp4 i, -r - VADD.I16 d1, d6, d5 // vout1 = vtmp5 + vrev4 - VSUB.I16 d3, d6, d5 // vout3 = vtmp5 - vrev4 - - VST4.32 {d0[0],d1[0],d2[0],d3[0]}, [r2]! // output 1 batch - BHI 2b - - BX lr - -END_FUNCTION xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x4 - -#ifdef __ELF__ -.section ".note.GNU-stack","",%progbits -#endif diff --git a/src/cs16-bfly4/cs16-bfly4-samples1-neon.c b/src/cs16-bfly4/cs16-bfly4-samples1-neon.c deleted file mode 100644 index 622482f72d0..00000000000 --- a/src/cs16-bfly4/cs16-bfly4-samples1-neon.c +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/fft.h" - -#include - - -void xnn_cs16_bfly4_samples1_ukernel__neon( - size_t batch, - size_t samples, - int16_t* data, - const int16_t* twiddle, - size_t stride) -{ - assert(batch != 0); - assert(samples == sizeof(int16_t) * 2); - assert(data != NULL); - assert(stride != 0); - assert(twiddle != NULL); - - const int16x4_t vdiv4 = vdup_n_s16(8191); - const int16x4_t vnegr = vreinterpret_s16_u32(vdup_n_u32(0x0001ffff)); - uint32x2x4_t vout; - - do { - const uint32x2x4_t vi = (vld4_dup_u32((void*)data)); - - int16x4_t vout1 = vqrdmulh_s16(vreinterpret_s16_u32(vi.val[1]), vdiv4); - int16x4_t vout3 = vqrdmulh_s16(vreinterpret_s16_u32(vi.val[3]), vdiv4); - int16x4_t vout0 = vqrdmulh_s16(vreinterpret_s16_u32(vi.val[0]), vdiv4); - int16x4_t vout2 = vqrdmulh_s16(vreinterpret_s16_u32(vi.val[2]), vdiv4); - - const int16x4_t vtmp4 = vsub_s16(vout1, vout3); - const int16x4_t vtmp3 = vadd_s16(vout1, vout3); - - int16x4_t vrev4 = vmul_s16(vtmp4, vnegr); // vrev4 = vtmp4 -r, i - const int16x4_t vtmp5 = vsub_s16(vout0, vout2); - vout0 = vadd_s16(vout0, vout2); - vrev4 = vrev32_s16(vrev4); // vrev4 = vtmp4 i, -r - - vout.val[2] = vreinterpret_u32_s16(vsub_s16(vout0, vtmp3)); - vout.val[0] = vreinterpret_u32_s16(vadd_s16(vout0, vtmp3)); - vout.val[1] = vreinterpret_u32_s16(vadd_s16(vtmp5, vrev4)); - vout.val[3] = vreinterpret_u32_s16(vsub_s16(vtmp5, vrev4)); - - vst4_lane_u32((void*)data, vout, 0); - data += 8; - } while(--batch != 0); -} diff --git a/src/cs16-bfly4/cs16-bfly4-samples1-scalar.c b/src/cs16-bfly4/cs16-bfly4-samples1-scalar.c deleted file mode 100644 index c8321ae0d80..00000000000 --- a/src/cs16-bfly4/cs16-bfly4-samples1-scalar.c +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/fft.h" - - -void xnn_cs16_bfly4_samples1_ukernel__scalar( - size_t batch, - size_t samples, - int16_t* data, - const int16_t* twiddle, - size_t stride) -{ - assert(samples == sizeof(int16_t) * 2); - assert(data != NULL); - assert(stride != 0); - assert(twiddle != NULL); - - do { - int32_t vout0r = (int32_t) data[0]; - int32_t vout0i = (int32_t) data[1]; - int32_t vout1r = (int32_t) data[2]; - int32_t vout1i = (int32_t) data[3]; - int32_t vout2r = (int32_t) data[4]; - int32_t vout2i = (int32_t) data[5]; - int32_t vout3r = (int32_t) data[6]; - int32_t vout3i = (int32_t) data[7]; - - // Note 32767 / 4 = 8191. Should be 8192. - vout0r = math_asr_s32(vout0r * 8191 + 16384, 15); - vout0i = math_asr_s32(vout0i * 8191 + 16384, 15); - vout1r = math_asr_s32(vout1r * 8191 + 16384, 15); - vout1i = math_asr_s32(vout1i * 8191 + 16384, 15); - vout2r = math_asr_s32(vout2r * 8191 + 16384, 15); - vout2i = math_asr_s32(vout2i * 8191 + 16384, 15); - vout3r = math_asr_s32(vout3r * 8191 + 16384, 15); - vout3i = math_asr_s32(vout3i * 8191 + 16384, 15); - - const int32_t vtmp5r = vout0r - vout2r; - const int32_t vtmp5i = vout0i - vout2i; - vout0r += vout2r; - vout0i += vout2i; - const int32_t vtmp3r = vout1r + vout3r; - const int32_t vtmp3i = vout1i + vout3i; - const int32_t vtmp4r = vout1i - vout3i; - const int32_t vtmp4i = -(vout1r - vout3r); // swap r,i and neg i - vout2r = vout0r - vtmp3r; - vout2i = vout0i - vtmp3i; - - vout0r += vtmp3r; - vout0i += vtmp3i; - - vout1r = vtmp5r + vtmp4r; - vout1i = vtmp5i + vtmp4i; - vout3r = vtmp5r - vtmp4r; - vout3i = vtmp5i - vtmp4i; - - data[0] = (int16_t) vout0r; - data[1] = (int16_t) vout0i; - data[2] = (int16_t) vout1r; - data[3] = (int16_t) vout1i; - data[4] = (int16_t) vout2r; - data[5] = (int16_t) vout2i; - data[6] = (int16_t) vout3r; - data[7] = (int16_t) vout3i; - data += 8; - } while(--batch != 0); -} diff --git a/src/cs16-bfly4/cs16-bfly4-samples4-neon.c 
b/src/cs16-bfly4/cs16-bfly4-samples4-neon.c deleted file mode 100644 index fc205553d69..00000000000 --- a/src/cs16-bfly4/cs16-bfly4-samples4-neon.c +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/fft.h" - -#include - -static const int16_t xnn_table_fft256_samples4_twiddle[24] = { - 32767,0, 30273,-12539, 23170,-23170, 12539,-30273, - 32767,0, 23170,-23170, 0,-32767, -23170,-23170, - 32767,0, 12539,-30273, -23170,-23170, -30273, 12539, -}; - -void xnn_cs16_bfly4_samples4_ukernel__neon( - size_t batch, - size_t samples, - int16_t* data, - const int16_t* twiddle, - size_t stride) -{ - assert(batch != 0); - assert(samples == sizeof(int16_t) * 8); - assert(data != NULL); - assert(stride == sizeof(int16_t) * 2 * 16); - assert(twiddle != NULL); - - const int16x4_t vdiv4 = vdup_n_s16(8191); - int16x4x2_t vtw1 = vld2_s16(xnn_table_fft256_samples4_twiddle); - int16x4x2_t vtw2 = vld2_s16(xnn_table_fft256_samples4_twiddle + 8); - int16x4x2_t vtw3 = vld2_s16(xnn_table_fft256_samples4_twiddle + 16); - - int16_t* data3 = data; - do { - int16_t* data0 = data3; - int16_t* data1 = (int16_t*) ((uintptr_t) data0 + samples); - int16_t* data2 = (int16_t*) ((uintptr_t) data1 + samples); - data3 = (int16_t*) ((uintptr_t) data2 + samples); - - int16x4x2_t vout0 = vld2_s16(data0); - int16x4x2_t vout1 = vld2_s16(data1); - int16x4x2_t vout2 = vld2_s16(data2); - int16x4x2_t vout3 = vld2_s16(data3); - - // Note 32767 / 4 = 8191. Should be 8192. 
- vout0.val[0] = vqrdmulh_s16(vout0.val[0], vdiv4); - vout0.val[1] = vqrdmulh_s16(vout0.val[1], vdiv4); - vout1.val[0] = vqrdmulh_s16(vout1.val[0], vdiv4); - vout1.val[1] = vqrdmulh_s16(vout1.val[1], vdiv4); - vout2.val[0] = vqrdmulh_s16(vout2.val[0], vdiv4); - vout2.val[1] = vqrdmulh_s16(vout2.val[1], vdiv4); - vout3.val[0] = vqrdmulh_s16(vout3.val[0], vdiv4); - vout3.val[1] = vqrdmulh_s16(vout3.val[1], vdiv4); - - int32x4_t vacc0r = vmull_s16(vout1.val[0], vtw1.val[0]); - int32x4_t vacc1r = vmull_s16(vout2.val[0], vtw2.val[0]); - int32x4_t vacc2r = vmull_s16(vout3.val[0], vtw3.val[0]); - int32x4_t vacc0i = vmull_s16(vout1.val[0], vtw1.val[1]); - int32x4_t vacc1i = vmull_s16(vout2.val[0], vtw2.val[1]); - int32x4_t vacc2i = vmull_s16(vout3.val[0], vtw3.val[1]); - vacc0r = vmlsl_s16(vacc0r, vout1.val[1], vtw1.val[1]); - vacc1r = vmlsl_s16(vacc1r, vout2.val[1], vtw2.val[1]); - vacc2r = vmlsl_s16(vacc2r, vout3.val[1], vtw3.val[1]); - vacc0i = vmlal_s16(vacc0i, vout1.val[1], vtw1.val[0]); - vacc1i = vmlal_s16(vacc1i, vout2.val[1], vtw2.val[0]); - vacc2i = vmlal_s16(vacc2i, vout3.val[1], vtw3.val[0]); - int16x4_t vtmp0r = vrshrn_n_s32(vacc0r, 15); - int16x4_t vtmp1r = vrshrn_n_s32(vacc1r, 15); - int16x4_t vtmp2r = vrshrn_n_s32(vacc2r, 15); - int16x4_t vtmp0i = vrshrn_n_s32(vacc0i, 15); - int16x4_t vtmp1i = vrshrn_n_s32(vacc1i, 15); - int16x4_t vtmp2i = vrshrn_n_s32(vacc2i, 15); - - const int16x4_t vtmp4r = vsub_s16(vtmp0r, vtmp2r); - const int16x4_t vtmp4i = vsub_s16(vtmp0i, vtmp2i); - const int16x4_t vtmp3r = vadd_s16(vtmp0r, vtmp2r); - const int16x4_t vtmp3i = vadd_s16(vtmp0i, vtmp2i); - - const int16x4_t vtmp5r = vsub_s16(vout0.val[0], vtmp1r); - const int16x4_t vtmp5i = vsub_s16(vout0.val[1], vtmp1i); - vout0.val[0] = vadd_s16(vout0.val[0], vtmp1r); - vout0.val[1] = vadd_s16(vout0.val[1], vtmp1i); - - vout2.val[0] = vsub_s16(vout0.val[0], vtmp3r); - vout2.val[1] = vsub_s16(vout0.val[1], vtmp3i); - vout0.val[0] = vadd_s16(vout0.val[0], vtmp3r); - vout0.val[1] = 
vadd_s16(vout0.val[1], vtmp3i); - - vout1.val[0] = vadd_s16(vtmp5r, vtmp4i); - vout1.val[1] = vsub_s16(vtmp5i, vtmp4r); - vout3.val[0] = vsub_s16(vtmp5r, vtmp4i); - vout3.val[1] = vadd_s16(vtmp5i, vtmp4r); - - vst2_s16(data0, vout0); - vst2_s16(data1, vout1); - vst2_s16(data2, vout2); - vst2_s16(data3, vout3); data3 += 8; - - } while (--batch != 0); -} diff --git a/src/cs16-bfly4/cs16-bfly4-samples4-scalar.c b/src/cs16-bfly4/cs16-bfly4-samples4-scalar.c deleted file mode 100644 index c667aa06a68..00000000000 --- a/src/cs16-bfly4/cs16-bfly4-samples4-scalar.c +++ /dev/null @@ -1,269 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/fft.h" - - -void xnn_cs16_bfly4_samples4_ukernel__scalar( - size_t batch, - size_t samples, - int16_t* data, - const int16_t* twiddle, - size_t stride) -{ - assert(batch != 0); - assert(samples != 0); - assert(samples % (sizeof(int16_t) * 2) == 0); - assert(data != NULL); - assert(stride != 0); - assert(twiddle != NULL); - - const int16_t* tw1 = twiddle; - const int16_t* tw2 = twiddle; - const int16_t* tw3 = twiddle; - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - - const int32_t vtw1r0 = (const int32_t) tw1[0]; - const int32_t vtw1i0 = (const int32_t) tw1[1]; - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - const int32_t vtw1r1 = (const int32_t) tw1[0]; - const int32_t vtw1i1 = (const int32_t) tw1[1]; - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - const int32_t vtw1r2 = (const int32_t) tw1[0]; - const int32_t vtw1i2 = (const int32_t) tw1[1]; - const int32_t vtw2r0 = (const int32_t) tw2[0]; - const int32_t vtw2i0 = (const int32_t) tw2[1]; - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - const 
int32_t vtw2r1 = (const int32_t) tw2[0]; - const int32_t vtw2i1 = (const int32_t) tw2[1]; - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - const int32_t vtw2r2 = (const int32_t) tw2[0]; - const int32_t vtw2i2 = (const int32_t) tw2[1]; - const int32_t vtw3r0 = (const int32_t) tw3[0]; - const int32_t vtw3i0 = (const int32_t) tw3[1]; - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - const int32_t vtw3r1 = (const int32_t) tw3[0]; - const int32_t vtw3i1 = (const int32_t) tw3[1]; - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - const int32_t vtw3r2 = (const int32_t) tw3[0]; - const int32_t vtw3i2 = (const int32_t) tw3[1]; - - int16_t* data3 = data; - - do { - int16_t* data0 = data3; - int16_t* data1 = (int16_t*) ((uintptr_t) data0 + samples); - int16_t* data2 = (int16_t*) ((uintptr_t) data1 + samples); - data3 = (int16_t*) ((uintptr_t) data2 + samples); - - // First sample skips twiddle. - // Same code as samples=1 but supports stride - { - int32_t vout0r = (int32_t) data0[0]; - int32_t vout0i = (int32_t) data0[1]; - int32_t vout1r = (int32_t) data1[0]; - int32_t vout1i = (int32_t) data1[1]; - int32_t vout2r = (int32_t) data2[0]; - int32_t vout2i = (int32_t) data2[1]; - int32_t vout3r = (int32_t) data3[0]; - int32_t vout3i = (int32_t) data3[1]; - - // Note 32767 / 4 = 8191. Should be 8192. 
- vout0r = math_asr_s32(vout0r * 8191 + 16384, 15); - vout0i = math_asr_s32(vout0i * 8191 + 16384, 15); - vout1r = math_asr_s32(vout1r * 8191 + 16384, 15); - vout1i = math_asr_s32(vout1i * 8191 + 16384, 15); - vout2r = math_asr_s32(vout2r * 8191 + 16384, 15); - vout2i = math_asr_s32(vout2i * 8191 + 16384, 15); - vout3r = math_asr_s32(vout3r * 8191 + 16384, 15); - vout3i = math_asr_s32(vout3i * 8191 + 16384, 15); - - const int32_t vtmp5r = vout0r - vout2r; - const int32_t vtmp5i = vout0i - vout2i; - vout0r += vout2r; - vout0i += vout2i; - const int32_t vtmp3r = vout1r + vout3r; - const int32_t vtmp3i = vout1i + vout3i; - const int32_t vtmp4r = vout1i - vout3i; - const int32_t vtmp4i = -(vout1r - vout3r); // swap r,i and neg i - vout2r = vout0r - vtmp3r; - vout2i = vout0i - vtmp3i; - - vout0r += vtmp3r; - vout0i += vtmp3i; - - vout1r = vtmp5r + vtmp4r; - vout1i = vtmp5i + vtmp4i; - vout3r = vtmp5r - vtmp4r; - vout3i = vtmp5i - vtmp4i; - - data0[0] = (int16_t) vout0r; - data0[1] = (int16_t) vout0i; - data1[0] = (int16_t) vout1r; - data1[1] = (int16_t) vout1i; - data2[0] = (int16_t) vout2r; - data2[1] = (int16_t) vout2i; - data3[0] = (int16_t) vout3r; - data3[1] = (int16_t) vout3i; - data0 += 2; - data1 += 2; - data2 += 2; - data3 += 2; - } - - int32_t vout0r0 = (int32_t) data0[0]; - int32_t vout0i0 = (int32_t) data0[1]; - int32_t vout0r1 = (int32_t) data0[2]; - int32_t vout0i1 = (int32_t) data0[3]; - int32_t vout0r2 = (int32_t) data0[4]; - int32_t vout0i2 = (int32_t) data0[5]; - int32_t vout1r0 = (int32_t) data1[0]; - int32_t vout1i0 = (int32_t) data1[1]; - int32_t vout1r1 = (int32_t) data1[2]; - int32_t vout1i1 = (int32_t) data1[3]; - int32_t vout1r2 = (int32_t) data1[4]; - int32_t vout1i2 = (int32_t) data1[5]; - int32_t vout2r0 = (int32_t) data2[0]; - int32_t vout2i0 = (int32_t) data2[1]; - int32_t vout2r1 = (int32_t) data2[2]; - int32_t vout2i1 = (int32_t) data2[3]; - int32_t vout2r2 = (int32_t) data2[4]; - int32_t vout2i2 = (int32_t) data2[5]; - int32_t vout3r0 = 
(int32_t) data3[0]; - int32_t vout3i0 = (int32_t) data3[1]; - int32_t vout3r1 = (int32_t) data3[2]; - int32_t vout3i1 = (int32_t) data3[3]; - int32_t vout3r2 = (int32_t) data3[4]; - int32_t vout3i2 = (int32_t) data3[5]; - - // Note 32767 / 4 = 8191. Should be 8192. - vout0r0 = math_asr_s32(vout0r0 * 8191 + 16384, 15); - vout0i0 = math_asr_s32(vout0i0 * 8191 + 16384, 15); - vout0r1 = math_asr_s32(vout0r1 * 8191 + 16384, 15); - vout0i1 = math_asr_s32(vout0i1 * 8191 + 16384, 15); - vout0r2 = math_asr_s32(vout0r2 * 8191 + 16384, 15); - vout0i2 = math_asr_s32(vout0i2 * 8191 + 16384, 15); - vout1r0 = math_asr_s32(vout1r0 * 8191 + 16384, 15); - vout1i0 = math_asr_s32(vout1i0 * 8191 + 16384, 15); - vout1r1 = math_asr_s32(vout1r1 * 8191 + 16384, 15); - vout1i1 = math_asr_s32(vout1i1 * 8191 + 16384, 15); - vout1r2 = math_asr_s32(vout1r2 * 8191 + 16384, 15); - vout1i2 = math_asr_s32(vout1i2 * 8191 + 16384, 15); - vout2r0 = math_asr_s32(vout2r0 * 8191 + 16384, 15); - vout2i0 = math_asr_s32(vout2i0 * 8191 + 16384, 15); - vout2r1 = math_asr_s32(vout2r1 * 8191 + 16384, 15); - vout2i1 = math_asr_s32(vout2i1 * 8191 + 16384, 15); - vout2r2 = math_asr_s32(vout2r2 * 8191 + 16384, 15); - vout2i2 = math_asr_s32(vout2i2 * 8191 + 16384, 15); - vout3r0 = math_asr_s32(vout3r0 * 8191 + 16384, 15); - vout3i0 = math_asr_s32(vout3i0 * 8191 + 16384, 15); - vout3r1 = math_asr_s32(vout3r1 * 8191 + 16384, 15); - vout3i1 = math_asr_s32(vout3i1 * 8191 + 16384, 15); - vout3r2 = math_asr_s32(vout3r2 * 8191 + 16384, 15); - vout3i2 = math_asr_s32(vout3i2 * 8191 + 16384, 15); - - const int32_t vtmp0r0 = math_asr_s32(vout1r0 * vtw1r0 - vout1i0 * vtw1i0 + 16384, 15); - const int32_t vtmp0i0 = math_asr_s32(vout1r0 * vtw1i0 + vout1i0 * vtw1r0 + 16384, 15); - const int32_t vtmp0r1 = math_asr_s32(vout1r1 * vtw1r1 - vout1i1 * vtw1i1 + 16384, 15); - const int32_t vtmp0i1 = math_asr_s32(vout1r1 * vtw1i1 + vout1i1 * vtw1r1 + 16384, 15); - const int32_t vtmp0r2 = math_asr_s32(vout1r2 * vtw1r2 - vout1i2 * vtw1i2 + 
16384, 15); - const int32_t vtmp0i2 = math_asr_s32(vout1r2 * vtw1i2 + vout1i2 * vtw1r2 + 16384, 15); - const int32_t vtmp1r0 = math_asr_s32(vout2r0 * vtw2r0 - vout2i0 * vtw2i0 + 16384, 15); - const int32_t vtmp1i0 = math_asr_s32(vout2r0 * vtw2i0 + vout2i0 * vtw2r0 + 16384, 15); - const int32_t vtmp1r1 = math_asr_s32(vout2r1 * vtw2r1 - vout2i1 * vtw2i1 + 16384, 15); - const int32_t vtmp1i1 = math_asr_s32(vout2r1 * vtw2i1 + vout2i1 * vtw2r1 + 16384, 15); - const int32_t vtmp1r2 = math_asr_s32(vout2r2 * vtw2r2 - vout2i2 * vtw2i2 + 16384, 15); - const int32_t vtmp1i2 = math_asr_s32(vout2r2 * vtw2i2 + vout2i2 * vtw2r2 + 16384, 15); - const int32_t vtmp2r0 = math_asr_s32(vout3r0 * vtw3r0 - vout3i0 * vtw3i0 + 16384, 15); - const int32_t vtmp2i0 = math_asr_s32(vout3r0 * vtw3i0 + vout3i0 * vtw3r0 + 16384, 15); - const int32_t vtmp2r1 = math_asr_s32(vout3r1 * vtw3r1 - vout3i1 * vtw3i1 + 16384, 15); - const int32_t vtmp2i1 = math_asr_s32(vout3r1 * vtw3i1 + vout3i1 * vtw3r1 + 16384, 15); - const int32_t vtmp2r2 = math_asr_s32(vout3r2 * vtw3r2 - vout3i2 * vtw3i2 + 16384, 15); - const int32_t vtmp2i2 = math_asr_s32(vout3r2 * vtw3i2 + vout3i2 * vtw3r2 + 16384, 15); - - const int32_t vtmp5r0 = vout0r0 - vtmp1r0; - const int32_t vtmp5i0 = vout0i0 - vtmp1i0; - const int32_t vtmp5r1 = vout0r1 - vtmp1r1; - const int32_t vtmp5i1 = vout0i1 - vtmp1i1; - const int32_t vtmp5r2 = vout0r2 - vtmp1r2; - const int32_t vtmp5i2 = vout0i2 - vtmp1i2; - vout0r0 += vtmp1r0; - vout0i0 += vtmp1i0; - vout0r1 += vtmp1r1; - vout0i1 += vtmp1i1; - vout0r2 += vtmp1r2; - vout0i2 += vtmp1i2; - const int32_t vtmp3r0 = vtmp0r0 + vtmp2r0; - const int32_t vtmp3i0 = vtmp0i0 + vtmp2i0; - const int32_t vtmp3r1 = vtmp0r1 + vtmp2r1; - const int32_t vtmp3i1 = vtmp0i1 + vtmp2i1; - const int32_t vtmp3r2 = vtmp0r2 + vtmp2r2; - const int32_t vtmp3i2 = vtmp0i2 + vtmp2i2; - const int32_t vtmp4r0 = vtmp0i0 - vtmp2i0; - const int32_t vtmp4i0 = -(vtmp0r0 - vtmp2r0); // swap r,i and neg i - const int32_t vtmp4r1 = vtmp0i1 - 
vtmp2i1; - const int32_t vtmp4i1 = -(vtmp0r1 - vtmp2r1); // swap r,i and neg i - const int32_t vtmp4r2 = vtmp0i2 - vtmp2i2; - const int32_t vtmp4i2 = -(vtmp0r2 - vtmp2r2); // swap r,i and neg i - vout2r0 = vout0r0 - vtmp3r0; - vout2i0 = vout0i0 - vtmp3i0; - vout2r1 = vout0r1 - vtmp3r1; - vout2i1 = vout0i1 - vtmp3i1; - vout2r2 = vout0r2 - vtmp3r2; - vout2i2 = vout0i2 - vtmp3i2; - vout0r0 += vtmp3r0; - vout0i0 += vtmp3i0; - vout0r1 += vtmp3r1; - vout0i1 += vtmp3i1; - vout0r2 += vtmp3r2; - vout0i2 += vtmp3i2; - vout1r0 = vtmp5r0 + vtmp4r0; - vout1i0 = vtmp5i0 + vtmp4i0; - vout1r1 = vtmp5r1 + vtmp4r1; - vout1i1 = vtmp5i1 + vtmp4i1; - vout1r2 = vtmp5r2 + vtmp4r2; - vout1i2 = vtmp5i2 + vtmp4i2; - vout3r0 = vtmp5r0 - vtmp4r0; - vout3i0 = vtmp5i0 - vtmp4i0; - vout3r1 = vtmp5r1 - vtmp4r1; - vout3i1 = vtmp5i1 - vtmp4i1; - vout3r2 = vtmp5r2 - vtmp4r2; - vout3i2 = vtmp5i2 - vtmp4i2; - - data0[0] = (int16_t) vout0r0; - data0[1] = (int16_t) vout0i0; - data0[2] = (int16_t) vout0r1; - data0[3] = (int16_t) vout0i1; - data0[4] = (int16_t) vout0r2; - data0[5] = (int16_t) vout0i2; - data1[0] = (int16_t) vout1r0; - data1[1] = (int16_t) vout1i0; - data1[2] = (int16_t) vout1r1; - data1[3] = (int16_t) vout1i1; - data1[4] = (int16_t) vout1r2; - data1[5] = (int16_t) vout1i2; - data2[0] = (int16_t) vout2r0; - data2[1] = (int16_t) vout2i0; - data2[2] = (int16_t) vout2r1; - data2[3] = (int16_t) vout2i1; - data2[4] = (int16_t) vout2r2; - data2[5] = (int16_t) vout2i2; - data3[0] = (int16_t) vout3r0; - data3[1] = (int16_t) vout3i0; - data3[2] = (int16_t) vout3r1; - data3[3] = (int16_t) vout3i1; - data3[4] = (int16_t) vout3r2; - data3[5] = (int16_t) vout3i2; - data3 += 3 * 2; - - } while (--batch != 0); -} diff --git a/src/cs16-bfly4/gen/cs16-bfly4-scalar-x1.c b/src/cs16-bfly4/gen/cs16-bfly4-scalar-x1.c deleted file mode 100644 index 03b63af6ba1..00000000000 --- a/src/cs16-bfly4/gen/cs16-bfly4-scalar-x1.c +++ /dev/null @@ -1,178 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/cs16-bfly4/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/fft.h" - - -void xnn_cs16_bfly4_ukernel__scalar_x1( - size_t batch, - size_t samples, - int16_t* data, - const int16_t* twiddle, - size_t stride) -{ - assert(batch != 0); - assert(samples != 0); - assert(samples % (sizeof(int16_t) * 2) == 0); - assert(data != NULL); - assert(stride != 0); - assert(twiddle != NULL); - - int16_t* data3 = data; - - do { - int16_t* data0 = data3; - int16_t* data1 = (int16_t*) ((uintptr_t) data0 + samples); - int16_t* data2 = (int16_t*) ((uintptr_t) data1 + samples); - data3 = (int16_t*) ((uintptr_t) data2 + samples); - - const int16_t* tw1 = twiddle; - const int16_t* tw2 = twiddle; - const int16_t* tw3 = twiddle; - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - - size_t s = samples - sizeof(int16_t) * 2; - - // First sample skips twiddle. - // Same code as samples=1 but supports stride - { - int32_t vout0r = (int32_t) data0[0]; - int32_t vout0i = (int32_t) data0[1]; - int32_t vout1r = (int32_t) data1[0]; - int32_t vout1i = (int32_t) data1[1]; - int32_t vout2r = (int32_t) data2[0]; - int32_t vout2i = (int32_t) data2[1]; - int32_t vout3r = (int32_t) data3[0]; - int32_t vout3i = (int32_t) data3[1]; - - // Note 32767 / 4 = 8191. Should be 8192. 
- vout0r = math_asr_s32(vout0r * 8191 + 16384, 15); - vout0i = math_asr_s32(vout0i * 8191 + 16384, 15); - vout1r = math_asr_s32(vout1r * 8191 + 16384, 15); - vout1i = math_asr_s32(vout1i * 8191 + 16384, 15); - vout2r = math_asr_s32(vout2r * 8191 + 16384, 15); - vout2i = math_asr_s32(vout2i * 8191 + 16384, 15); - vout3r = math_asr_s32(vout3r * 8191 + 16384, 15); - vout3i = math_asr_s32(vout3i * 8191 + 16384, 15); - - const int32_t vtmp5r = vout0r - vout2r; - const int32_t vtmp5i = vout0i - vout2i; - vout0r += vout2r; - vout0i += vout2i; - const int32_t vtmp3r = vout1r + vout3r; - const int32_t vtmp3i = vout1i + vout3i; - const int32_t vtmp4r = vout1i - vout3i; - const int32_t vtmp4i = -(vout1r - vout3r); // swap r,i and neg i - vout2r = vout0r - vtmp3r; - vout2i = vout0i - vtmp3i; - - vout0r += vtmp3r; - vout0i += vtmp3i; - - vout1r = vtmp5r + vtmp4r; - vout1i = vtmp5i + vtmp4i; - vout3r = vtmp5r - vtmp4r; - vout3i = vtmp5i - vtmp4i; - - data0[0] = (int16_t) vout0r; - data0[1] = (int16_t) vout0i; - data1[0] = (int16_t) vout1r; - data1[1] = (int16_t) vout1i; - data2[0] = (int16_t) vout2r; - data2[1] = (int16_t) vout2i; - data3[0] = (int16_t) vout3r; - data3[1] = (int16_t) vout3i; - data0 += 2; - data1 += 2; - data2 += 2; - data3 += 2; - } - - if XNN_UNLIKELY(s != 0) { - do { - int32_t vout0r = (int32_t) data0[0]; - int32_t vout0i = (int32_t) data0[1]; - int32_t vout1r = (int32_t) data1[0]; - int32_t vout1i = (int32_t) data1[1]; - int32_t vout2r = (int32_t) data2[0]; - int32_t vout2i = (int32_t) data2[1]; - int32_t vout3r = (int32_t) data3[0]; - int32_t vout3i = (int32_t) data3[1]; - - const int32_t vtw1r = (const int32_t) tw1[0]; - const int32_t vtw1i = (const int32_t) tw1[1]; - const int32_t vtw2r = (const int32_t) tw2[0]; - const int32_t vtw2i = (const int32_t) tw2[1]; - const int32_t vtw3r = (const int32_t) tw3[0]; - const int32_t vtw3i = (const int32_t) tw3[1]; - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride 
* 2); - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - - // Note 32767 / 4 = 8191. Should be 8192. - vout0r = math_asr_s32(vout0r * 8191 + 16384, 15); - vout0i = math_asr_s32(vout0i * 8191 + 16384, 15); - vout1r = math_asr_s32(vout1r * 8191 + 16384, 15); - vout1i = math_asr_s32(vout1i * 8191 + 16384, 15); - vout2r = math_asr_s32(vout2r * 8191 + 16384, 15); - vout2i = math_asr_s32(vout2i * 8191 + 16384, 15); - vout3r = math_asr_s32(vout3r * 8191 + 16384, 15); - vout3i = math_asr_s32(vout3i * 8191 + 16384, 15); - - const int32_t vtmp0r = math_asr_s32(vout1r * vtw1r - vout1i * vtw1i + 16384, 15); - const int32_t vtmp0i = math_asr_s32(vout1r * vtw1i + vout1i * vtw1r + 16384, 15); - const int32_t vtmp1r = math_asr_s32(vout2r * vtw2r - vout2i * vtw2i + 16384, 15); - const int32_t vtmp1i = math_asr_s32(vout2r * vtw2i + vout2i * vtw2r + 16384, 15); - const int32_t vtmp2r = math_asr_s32(vout3r * vtw3r - vout3i * vtw3i + 16384, 15); - const int32_t vtmp2i = math_asr_s32(vout3r * vtw3i + vout3i * vtw3r + 16384, 15); - - const int32_t vtmp5r = vout0r - vtmp1r; - const int32_t vtmp5i = vout0i - vtmp1i; - vout0r += vtmp1r; - vout0i += vtmp1i; - const int32_t vtmp3r = vtmp0r + vtmp2r; - const int32_t vtmp3i = vtmp0i + vtmp2i; - const int32_t vtmp4r = vtmp0i - vtmp2i; - const int32_t vtmp4i = -(vtmp0r - vtmp2r); // swap r,i and neg i - vout2r = vout0r - vtmp3r; - vout2i = vout0i - vtmp3i; - - vout0r += vtmp3r; - vout0i += vtmp3i; - - vout1r = vtmp5r + vtmp4r; - vout1i = vtmp5i + vtmp4i; - vout3r = vtmp5r - vtmp4r; - vout3i = vtmp5i - vtmp4i; - - data0[0] = (int16_t) vout0r; - data0[1] = (int16_t) vout0i; - data1[0] = (int16_t) vout1r; - data1[1] = (int16_t) vout1i; - data2[0] = (int16_t) vout2r; - data2[1] = (int16_t) vout2i; - data3[0] = (int16_t) vout3r; - data3[1] = (int16_t) vout3i; - data0 += 2; - data1 += 2; - data2 += 2; - data3 += 2; - - s -= sizeof(int16_t) * 2; - } while (s != 0); - } - } while (--batch != 0); -} diff --git 
a/src/cs16-bfly4/gen/cs16-bfly4-scalar-x2.c b/src/cs16-bfly4/gen/cs16-bfly4-scalar-x2.c deleted file mode 100644 index 0d09baf0f1d..00000000000 --- a/src/cs16-bfly4/gen/cs16-bfly4-scalar-x2.c +++ /dev/null @@ -1,300 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/cs16-bfly4/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/fft.h" - - -void xnn_cs16_bfly4_ukernel__scalar_x2( - size_t batch, - size_t samples, - int16_t* data, - const int16_t* twiddle, - size_t stride) -{ - assert(batch != 0); - assert(samples != 0); - assert(samples % (sizeof(int16_t) * 2) == 0); - assert(data != NULL); - assert(stride != 0); - assert(twiddle != NULL); - - int16_t* data3 = data; - - do { - int16_t* data0 = data3; - int16_t* data1 = (int16_t*) ((uintptr_t) data0 + samples); - int16_t* data2 = (int16_t*) ((uintptr_t) data1 + samples); - data3 = (int16_t*) ((uintptr_t) data2 + samples); - - const int16_t* tw1 = twiddle; - const int16_t* tw2 = twiddle; - const int16_t* tw3 = twiddle; - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - - size_t s = samples - sizeof(int16_t) * 2; - - // First sample skips twiddle. - // Same code as samples=1 but supports stride - { - int32_t vout0r = (int32_t) data0[0]; - int32_t vout0i = (int32_t) data0[1]; - int32_t vout1r = (int32_t) data1[0]; - int32_t vout1i = (int32_t) data1[1]; - int32_t vout2r = (int32_t) data2[0]; - int32_t vout2i = (int32_t) data2[1]; - int32_t vout3r = (int32_t) data3[0]; - int32_t vout3i = (int32_t) data3[1]; - - // Note 32767 / 4 = 8191. Should be 8192. 
- vout0r = math_asr_s32(vout0r * 8191 + 16384, 15); - vout0i = math_asr_s32(vout0i * 8191 + 16384, 15); - vout1r = math_asr_s32(vout1r * 8191 + 16384, 15); - vout1i = math_asr_s32(vout1i * 8191 + 16384, 15); - vout2r = math_asr_s32(vout2r * 8191 + 16384, 15); - vout2i = math_asr_s32(vout2i * 8191 + 16384, 15); - vout3r = math_asr_s32(vout3r * 8191 + 16384, 15); - vout3i = math_asr_s32(vout3i * 8191 + 16384, 15); - - const int32_t vtmp5r = vout0r - vout2r; - const int32_t vtmp5i = vout0i - vout2i; - vout0r += vout2r; - vout0i += vout2i; - const int32_t vtmp3r = vout1r + vout3r; - const int32_t vtmp3i = vout1i + vout3i; - const int32_t vtmp4r = vout1i - vout3i; - const int32_t vtmp4i = -(vout1r - vout3r); // swap r,i and neg i - vout2r = vout0r - vtmp3r; - vout2i = vout0i - vtmp3i; - - vout0r += vtmp3r; - vout0i += vtmp3i; - - vout1r = vtmp5r + vtmp4r; - vout1i = vtmp5i + vtmp4i; - vout3r = vtmp5r - vtmp4r; - vout3i = vtmp5i - vtmp4i; - - data0[0] = (int16_t) vout0r; - data0[1] = (int16_t) vout0i; - data1[0] = (int16_t) vout1r; - data1[1] = (int16_t) vout1i; - data2[0] = (int16_t) vout2r; - data2[1] = (int16_t) vout2i; - data3[0] = (int16_t) vout3r; - data3[1] = (int16_t) vout3i; - data0 += 2; - data1 += 2; - data2 += 2; - data3 += 2; - } - - for (; s >= 2 * sizeof(int16_t) * 2; s -= 2 * sizeof(int16_t) * 2) { - int32_t vout0r0 = (int32_t) data0[0]; - int32_t vout0i0 = (int32_t) data0[1]; - int32_t vout0r1 = (int32_t) data0[2]; - int32_t vout0i1 = (int32_t) data0[3]; - int32_t vout1r0 = (int32_t) data1[0]; - int32_t vout1i0 = (int32_t) data1[1]; - int32_t vout1r1 = (int32_t) data1[2]; - int32_t vout1i1 = (int32_t) data1[3]; - int32_t vout2r0 = (int32_t) data2[0]; - int32_t vout2i0 = (int32_t) data2[1]; - int32_t vout2r1 = (int32_t) data2[2]; - int32_t vout2i1 = (int32_t) data2[3]; - int32_t vout3r0 = (int32_t) data3[0]; - int32_t vout3i0 = (int32_t) data3[1]; - int32_t vout3r1 = (int32_t) data3[2]; - int32_t vout3i1 = (int32_t) data3[3]; - - const int32_t vtw1r0 = 
(const int32_t) tw1[0]; - const int32_t vtw1i0 = (const int32_t) tw1[1]; - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - const int32_t vtw1r1 = (const int32_t) tw1[0]; - const int32_t vtw1i1 = (const int32_t) tw1[1]; - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - const int32_t vtw2r0 = (const int32_t) tw2[0]; - const int32_t vtw2i0 = (const int32_t) tw2[1]; - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - const int32_t vtw2r1 = (const int32_t) tw2[0]; - const int32_t vtw2i1 = (const int32_t) tw2[1]; - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - const int32_t vtw3r0 = (const int32_t) tw3[0]; - const int32_t vtw3i0 = (const int32_t) tw3[1]; - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - const int32_t vtw3r1 = (const int32_t) tw3[0]; - const int32_t vtw3i1 = (const int32_t) tw3[1]; - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - - // Note 32767 / 4 = 8191. Should be 8192. - vout0r0 = math_asr_s32(vout0r0 * 8191 + 16384, 15); - vout0i0 = math_asr_s32(vout0i0 * 8191 + 16384, 15); - vout0r1 = math_asr_s32(vout0r1 * 8191 + 16384, 15); - vout0i1 = math_asr_s32(vout0i1 * 8191 + 16384, 15); - vout1r0 = math_asr_s32(vout1r0 * 8191 + 16384, 15); - vout1i0 = math_asr_s32(vout1i0 * 8191 + 16384, 15); - vout1r1 = math_asr_s32(vout1r1 * 8191 + 16384, 15); - vout1i1 = math_asr_s32(vout1i1 * 8191 + 16384, 15); - vout2r0 = math_asr_s32(vout2r0 * 8191 + 16384, 15); - vout2i0 = math_asr_s32(vout2i0 * 8191 + 16384, 15); - vout2r1 = math_asr_s32(vout2r1 * 8191 + 16384, 15); - vout2i1 = math_asr_s32(vout2i1 * 8191 + 16384, 15); - vout3r0 = math_asr_s32(vout3r0 * 8191 + 16384, 15); - vout3i0 = math_asr_s32(vout3i0 * 8191 + 16384, 15); - vout3r1 = math_asr_s32(vout3r1 * 8191 + 16384, 15); - vout3i1 = math_asr_s32(vout3i1 * 8191 + 16384, 15); - - const int32_t vtmp0r0 = math_asr_s32(vout1r0 * vtw1r0 - vout1i0 * vtw1i0 + 16384, 15); - const int32_t vtmp0i0 = math_asr_s32(vout1r0 * vtw1i0 + vout1i0 * vtw1r0 + 16384, 15); - const 
int32_t vtmp0r1 = math_asr_s32(vout1r1 * vtw1r1 - vout1i1 * vtw1i1 + 16384, 15); - const int32_t vtmp0i1 = math_asr_s32(vout1r1 * vtw1i1 + vout1i1 * vtw1r1 + 16384, 15); - const int32_t vtmp1r0 = math_asr_s32(vout2r0 * vtw2r0 - vout2i0 * vtw2i0 + 16384, 15); - const int32_t vtmp1i0 = math_asr_s32(vout2r0 * vtw2i0 + vout2i0 * vtw2r0 + 16384, 15); - const int32_t vtmp1r1 = math_asr_s32(vout2r1 * vtw2r1 - vout2i1 * vtw2i1 + 16384, 15); - const int32_t vtmp1i1 = math_asr_s32(vout2r1 * vtw2i1 + vout2i1 * vtw2r1 + 16384, 15); - const int32_t vtmp2r0 = math_asr_s32(vout3r0 * vtw3r0 - vout3i0 * vtw3i0 + 16384, 15); - const int32_t vtmp2i0 = math_asr_s32(vout3r0 * vtw3i0 + vout3i0 * vtw3r0 + 16384, 15); - const int32_t vtmp2r1 = math_asr_s32(vout3r1 * vtw3r1 - vout3i1 * vtw3i1 + 16384, 15); - const int32_t vtmp2i1 = math_asr_s32(vout3r1 * vtw3i1 + vout3i1 * vtw3r1 + 16384, 15); - - const int32_t vtmp5r0 = vout0r0 - vtmp1r0; - const int32_t vtmp5i0 = vout0i0 - vtmp1i0; - const int32_t vtmp5r1 = vout0r1 - vtmp1r1; - const int32_t vtmp5i1 = vout0i1 - vtmp1i1; - vout0r0 += vtmp1r0; - vout0i0 += vtmp1i0; - vout0r1 += vtmp1r1; - vout0i1 += vtmp1i1; - const int32_t vtmp3r0 = vtmp0r0 + vtmp2r0; - const int32_t vtmp3i0 = vtmp0i0 + vtmp2i0; - const int32_t vtmp3r1 = vtmp0r1 + vtmp2r1; - const int32_t vtmp3i1 = vtmp0i1 + vtmp2i1; - const int32_t vtmp4r0 = vtmp0i0 - vtmp2i0; - const int32_t vtmp4i0 = -(vtmp0r0 - vtmp2r0); // swap r,i and neg i - const int32_t vtmp4r1 = vtmp0i1 - vtmp2i1; - const int32_t vtmp4i1 = -(vtmp0r1 - vtmp2r1); // swap r,i and neg i - vout2r0 = vout0r0 - vtmp3r0; - vout2i0 = vout0i0 - vtmp3i0; - vout2r1 = vout0r1 - vtmp3r1; - vout2i1 = vout0i1 - vtmp3i1; - vout0r0 += vtmp3r0; - vout0i0 += vtmp3i0; - vout0r1 += vtmp3r1; - vout0i1 += vtmp3i1; - vout1r0 = vtmp5r0 + vtmp4r0; - vout1i0 = vtmp5i0 + vtmp4i0; - vout1r1 = vtmp5r1 + vtmp4r1; - vout1i1 = vtmp5i1 + vtmp4i1; - vout3r0 = vtmp5r0 - vtmp4r0; - vout3i0 = vtmp5i0 - vtmp4i0; - vout3r1 = vtmp5r1 - vtmp4r1; - 
vout3i1 = vtmp5i1 - vtmp4i1; - - data0[0] = (int16_t) vout0r0; - data0[1] = (int16_t) vout0i0; - data0[2] = (int16_t) vout0r1; - data0[3] = (int16_t) vout0i1; - data0 += 2 * 2; - data1[0] = (int16_t) vout1r0; - data1[1] = (int16_t) vout1i0; - data1[2] = (int16_t) vout1r1; - data1[3] = (int16_t) vout1i1; - data1 += 2 * 2; - data2[0] = (int16_t) vout2r0; - data2[1] = (int16_t) vout2i0; - data2[2] = (int16_t) vout2r1; - data2[3] = (int16_t) vout2i1; - data2 += 2 * 2; - data3[0] = (int16_t) vout3r0; - data3[1] = (int16_t) vout3i0; - data3[2] = (int16_t) vout3r1; - data3[3] = (int16_t) vout3i1; - data3 += 2 * 2; - } - if XNN_UNLIKELY(s != 0) { - do { - int32_t vout0r = (int32_t) data0[0]; - int32_t vout0i = (int32_t) data0[1]; - int32_t vout1r = (int32_t) data1[0]; - int32_t vout1i = (int32_t) data1[1]; - int32_t vout2r = (int32_t) data2[0]; - int32_t vout2i = (int32_t) data2[1]; - int32_t vout3r = (int32_t) data3[0]; - int32_t vout3i = (int32_t) data3[1]; - - const int32_t vtw1r = (const int32_t) tw1[0]; - const int32_t vtw1i = (const int32_t) tw1[1]; - const int32_t vtw2r = (const int32_t) tw2[0]; - const int32_t vtw2i = (const int32_t) tw2[1]; - const int32_t vtw3r = (const int32_t) tw3[0]; - const int32_t vtw3i = (const int32_t) tw3[1]; - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - - // Note 32767 / 4 = 8191. Should be 8192. 
- vout0r = math_asr_s32(vout0r * 8191 + 16384, 15); - vout0i = math_asr_s32(vout0i * 8191 + 16384, 15); - vout1r = math_asr_s32(vout1r * 8191 + 16384, 15); - vout1i = math_asr_s32(vout1i * 8191 + 16384, 15); - vout2r = math_asr_s32(vout2r * 8191 + 16384, 15); - vout2i = math_asr_s32(vout2i * 8191 + 16384, 15); - vout3r = math_asr_s32(vout3r * 8191 + 16384, 15); - vout3i = math_asr_s32(vout3i * 8191 + 16384, 15); - - const int32_t vtmp0r = math_asr_s32(vout1r * vtw1r - vout1i * vtw1i + 16384, 15); - const int32_t vtmp0i = math_asr_s32(vout1r * vtw1i + vout1i * vtw1r + 16384, 15); - const int32_t vtmp1r = math_asr_s32(vout2r * vtw2r - vout2i * vtw2i + 16384, 15); - const int32_t vtmp1i = math_asr_s32(vout2r * vtw2i + vout2i * vtw2r + 16384, 15); - const int32_t vtmp2r = math_asr_s32(vout3r * vtw3r - vout3i * vtw3i + 16384, 15); - const int32_t vtmp2i = math_asr_s32(vout3r * vtw3i + vout3i * vtw3r + 16384, 15); - - const int32_t vtmp5r = vout0r - vtmp1r; - const int32_t vtmp5i = vout0i - vtmp1i; - vout0r += vtmp1r; - vout0i += vtmp1i; - const int32_t vtmp3r = vtmp0r + vtmp2r; - const int32_t vtmp3i = vtmp0i + vtmp2i; - const int32_t vtmp4r = vtmp0i - vtmp2i; - const int32_t vtmp4i = -(vtmp0r - vtmp2r); // swap r,i and neg i - vout2r = vout0r - vtmp3r; - vout2i = vout0i - vtmp3i; - - vout0r += vtmp3r; - vout0i += vtmp3i; - - vout1r = vtmp5r + vtmp4r; - vout1i = vtmp5i + vtmp4i; - vout3r = vtmp5r - vtmp4r; - vout3i = vtmp5i - vtmp4i; - - data0[0] = (int16_t) vout0r; - data0[1] = (int16_t) vout0i; - data1[0] = (int16_t) vout1r; - data1[1] = (int16_t) vout1i; - data2[0] = (int16_t) vout2r; - data2[1] = (int16_t) vout2i; - data3[0] = (int16_t) vout3r; - data3[1] = (int16_t) vout3i; - data0 += 2; - data1 += 2; - data2 += 2; - data3 += 2; - - s -= sizeof(int16_t) * 2; - } while (s != 0); - } - } while (--batch != 0); -} diff --git a/src/cs16-bfly4/gen/cs16-bfly4-scalar-x4.c b/src/cs16-bfly4/gen/cs16-bfly4-scalar-x4.c deleted file mode 100644 index 5ce6eea520c..00000000000 
--- a/src/cs16-bfly4/gen/cs16-bfly4-scalar-x4.c +++ /dev/null @@ -1,410 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/cs16-bfly4/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/fft.h" - - -void xnn_cs16_bfly4_ukernel__scalar_x4( - size_t batch, - size_t samples, - int16_t* data, - const int16_t* twiddle, - size_t stride) -{ - assert(batch != 0); - assert(samples != 0); - assert(samples % (sizeof(int16_t) * 2) == 0); - assert(data != NULL); - assert(stride != 0); - assert(twiddle != NULL); - - int16_t* data3 = data; - - do { - int16_t* data0 = data3; - int16_t* data1 = (int16_t*) ((uintptr_t) data0 + samples); - int16_t* data2 = (int16_t*) ((uintptr_t) data1 + samples); - data3 = (int16_t*) ((uintptr_t) data2 + samples); - - const int16_t* tw1 = twiddle; - const int16_t* tw2 = twiddle; - const int16_t* tw3 = twiddle; - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - - size_t s = samples - sizeof(int16_t) * 2; - - // First sample skips twiddle. - // Same code as samples=1 but supports stride - { - int32_t vout0r = (int32_t) data0[0]; - int32_t vout0i = (int32_t) data0[1]; - int32_t vout1r = (int32_t) data1[0]; - int32_t vout1i = (int32_t) data1[1]; - int32_t vout2r = (int32_t) data2[0]; - int32_t vout2i = (int32_t) data2[1]; - int32_t vout3r = (int32_t) data3[0]; - int32_t vout3i = (int32_t) data3[1]; - - // Note 32767 / 4 = 8191. Should be 8192. 
- vout0r = math_asr_s32(vout0r * 8191 + 16384, 15); - vout0i = math_asr_s32(vout0i * 8191 + 16384, 15); - vout1r = math_asr_s32(vout1r * 8191 + 16384, 15); - vout1i = math_asr_s32(vout1i * 8191 + 16384, 15); - vout2r = math_asr_s32(vout2r * 8191 + 16384, 15); - vout2i = math_asr_s32(vout2i * 8191 + 16384, 15); - vout3r = math_asr_s32(vout3r * 8191 + 16384, 15); - vout3i = math_asr_s32(vout3i * 8191 + 16384, 15); - - const int32_t vtmp5r = vout0r - vout2r; - const int32_t vtmp5i = vout0i - vout2i; - vout0r += vout2r; - vout0i += vout2i; - const int32_t vtmp3r = vout1r + vout3r; - const int32_t vtmp3i = vout1i + vout3i; - const int32_t vtmp4r = vout1i - vout3i; - const int32_t vtmp4i = -(vout1r - vout3r); // swap r,i and neg i - vout2r = vout0r - vtmp3r; - vout2i = vout0i - vtmp3i; - - vout0r += vtmp3r; - vout0i += vtmp3i; - - vout1r = vtmp5r + vtmp4r; - vout1i = vtmp5i + vtmp4i; - vout3r = vtmp5r - vtmp4r; - vout3i = vtmp5i - vtmp4i; - - data0[0] = (int16_t) vout0r; - data0[1] = (int16_t) vout0i; - data1[0] = (int16_t) vout1r; - data1[1] = (int16_t) vout1i; - data2[0] = (int16_t) vout2r; - data2[1] = (int16_t) vout2i; - data3[0] = (int16_t) vout3r; - data3[1] = (int16_t) vout3i; - data0 += 2; - data1 += 2; - data2 += 2; - data3 += 2; - } - - for (; s >= 4 * sizeof(int16_t) * 2; s -= 4 * sizeof(int16_t) * 2) { - int32_t vout0r0 = (int32_t) data0[0]; - int32_t vout0i0 = (int32_t) data0[1]; - int32_t vout0r1 = (int32_t) data0[2]; - int32_t vout0i1 = (int32_t) data0[3]; - int32_t vout0r2 = (int32_t) data0[4]; - int32_t vout0i2 = (int32_t) data0[5]; - int32_t vout0r3 = (int32_t) data0[6]; - int32_t vout0i3 = (int32_t) data0[7]; - int32_t vout1r0 = (int32_t) data1[0]; - int32_t vout1i0 = (int32_t) data1[1]; - int32_t vout1r1 = (int32_t) data1[2]; - int32_t vout1i1 = (int32_t) data1[3]; - int32_t vout1r2 = (int32_t) data1[4]; - int32_t vout1i2 = (int32_t) data1[5]; - int32_t vout1r3 = (int32_t) data1[6]; - int32_t vout1i3 = (int32_t) data1[7]; - int32_t vout2r0 = (int32_t) 
data2[0]; - int32_t vout2i0 = (int32_t) data2[1]; - int32_t vout2r1 = (int32_t) data2[2]; - int32_t vout2i1 = (int32_t) data2[3]; - int32_t vout2r2 = (int32_t) data2[4]; - int32_t vout2i2 = (int32_t) data2[5]; - int32_t vout2r3 = (int32_t) data2[6]; - int32_t vout2i3 = (int32_t) data2[7]; - int32_t vout3r0 = (int32_t) data3[0]; - int32_t vout3i0 = (int32_t) data3[1]; - int32_t vout3r1 = (int32_t) data3[2]; - int32_t vout3i1 = (int32_t) data3[3]; - int32_t vout3r2 = (int32_t) data3[4]; - int32_t vout3i2 = (int32_t) data3[5]; - int32_t vout3r3 = (int32_t) data3[6]; - int32_t vout3i3 = (int32_t) data3[7]; - - const int32_t vtw1r0 = (const int32_t) tw1[0]; - const int32_t vtw1i0 = (const int32_t) tw1[1]; - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - const int32_t vtw1r1 = (const int32_t) tw1[0]; - const int32_t vtw1i1 = (const int32_t) tw1[1]; - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - const int32_t vtw1r2 = (const int32_t) tw1[0]; - const int32_t vtw1i2 = (const int32_t) tw1[1]; - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - const int32_t vtw1r3 = (const int32_t) tw1[0]; - const int32_t vtw1i3 = (const int32_t) tw1[1]; - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - const int32_t vtw2r0 = (const int32_t) tw2[0]; - const int32_t vtw2i0 = (const int32_t) tw2[1]; - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - const int32_t vtw2r1 = (const int32_t) tw2[0]; - const int32_t vtw2i1 = (const int32_t) tw2[1]; - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - const int32_t vtw2r2 = (const int32_t) tw2[0]; - const int32_t vtw2i2 = (const int32_t) tw2[1]; - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - const int32_t vtw2r3 = (const int32_t) tw2[0]; - const int32_t vtw2i3 = (const int32_t) tw2[1]; - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - const int32_t vtw3r0 = (const int32_t) tw3[0]; - const int32_t vtw3i0 = (const int32_t) tw3[1]; - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - const 
int32_t vtw3r1 = (const int32_t) tw3[0]; - const int32_t vtw3i1 = (const int32_t) tw3[1]; - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - const int32_t vtw3r2 = (const int32_t) tw3[0]; - const int32_t vtw3i2 = (const int32_t) tw3[1]; - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - const int32_t vtw3r3 = (const int32_t) tw3[0]; - const int32_t vtw3i3 = (const int32_t) tw3[1]; - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - - // Note 32767 / 4 = 8191. Should be 8192. - vout0r0 = math_asr_s32(vout0r0 * 8191 + 16384, 15); - vout0i0 = math_asr_s32(vout0i0 * 8191 + 16384, 15); - vout0r1 = math_asr_s32(vout0r1 * 8191 + 16384, 15); - vout0i1 = math_asr_s32(vout0i1 * 8191 + 16384, 15); - vout0r2 = math_asr_s32(vout0r2 * 8191 + 16384, 15); - vout0i2 = math_asr_s32(vout0i2 * 8191 + 16384, 15); - vout0r3 = math_asr_s32(vout0r3 * 8191 + 16384, 15); - vout0i3 = math_asr_s32(vout0i3 * 8191 + 16384, 15); - vout1r0 = math_asr_s32(vout1r0 * 8191 + 16384, 15); - vout1i0 = math_asr_s32(vout1i0 * 8191 + 16384, 15); - vout1r1 = math_asr_s32(vout1r1 * 8191 + 16384, 15); - vout1i1 = math_asr_s32(vout1i1 * 8191 + 16384, 15); - vout1r2 = math_asr_s32(vout1r2 * 8191 + 16384, 15); - vout1i2 = math_asr_s32(vout1i2 * 8191 + 16384, 15); - vout1r3 = math_asr_s32(vout1r3 * 8191 + 16384, 15); - vout1i3 = math_asr_s32(vout1i3 * 8191 + 16384, 15); - vout2r0 = math_asr_s32(vout2r0 * 8191 + 16384, 15); - vout2i0 = math_asr_s32(vout2i0 * 8191 + 16384, 15); - vout2r1 = math_asr_s32(vout2r1 * 8191 + 16384, 15); - vout2i1 = math_asr_s32(vout2i1 * 8191 + 16384, 15); - vout2r2 = math_asr_s32(vout2r2 * 8191 + 16384, 15); - vout2i2 = math_asr_s32(vout2i2 * 8191 + 16384, 15); - vout2r3 = math_asr_s32(vout2r3 * 8191 + 16384, 15); - vout2i3 = math_asr_s32(vout2i3 * 8191 + 16384, 15); - vout3r0 = math_asr_s32(vout3r0 * 8191 + 16384, 15); - vout3i0 = math_asr_s32(vout3i0 * 8191 + 16384, 15); - vout3r1 = math_asr_s32(vout3r1 * 8191 + 16384, 15); - vout3i1 = math_asr_s32(vout3i1 * 
8191 + 16384, 15); - vout3r2 = math_asr_s32(vout3r2 * 8191 + 16384, 15); - vout3i2 = math_asr_s32(vout3i2 * 8191 + 16384, 15); - vout3r3 = math_asr_s32(vout3r3 * 8191 + 16384, 15); - vout3i3 = math_asr_s32(vout3i3 * 8191 + 16384, 15); - - const int32_t vtmp0r0 = math_asr_s32(vout1r0 * vtw1r0 - vout1i0 * vtw1i0 + 16384, 15); - const int32_t vtmp0i0 = math_asr_s32(vout1r0 * vtw1i0 + vout1i0 * vtw1r0 + 16384, 15); - const int32_t vtmp0r1 = math_asr_s32(vout1r1 * vtw1r1 - vout1i1 * vtw1i1 + 16384, 15); - const int32_t vtmp0i1 = math_asr_s32(vout1r1 * vtw1i1 + vout1i1 * vtw1r1 + 16384, 15); - const int32_t vtmp0r2 = math_asr_s32(vout1r2 * vtw1r2 - vout1i2 * vtw1i2 + 16384, 15); - const int32_t vtmp0i2 = math_asr_s32(vout1r2 * vtw1i2 + vout1i2 * vtw1r2 + 16384, 15); - const int32_t vtmp0r3 = math_asr_s32(vout1r3 * vtw1r3 - vout1i3 * vtw1i3 + 16384, 15); - const int32_t vtmp0i3 = math_asr_s32(vout1r3 * vtw1i3 + vout1i3 * vtw1r3 + 16384, 15); - const int32_t vtmp1r0 = math_asr_s32(vout2r0 * vtw2r0 - vout2i0 * vtw2i0 + 16384, 15); - const int32_t vtmp1i0 = math_asr_s32(vout2r0 * vtw2i0 + vout2i0 * vtw2r0 + 16384, 15); - const int32_t vtmp1r1 = math_asr_s32(vout2r1 * vtw2r1 - vout2i1 * vtw2i1 + 16384, 15); - const int32_t vtmp1i1 = math_asr_s32(vout2r1 * vtw2i1 + vout2i1 * vtw2r1 + 16384, 15); - const int32_t vtmp1r2 = math_asr_s32(vout2r2 * vtw2r2 - vout2i2 * vtw2i2 + 16384, 15); - const int32_t vtmp1i2 = math_asr_s32(vout2r2 * vtw2i2 + vout2i2 * vtw2r2 + 16384, 15); - const int32_t vtmp1r3 = math_asr_s32(vout2r3 * vtw2r3 - vout2i3 * vtw2i3 + 16384, 15); - const int32_t vtmp1i3 = math_asr_s32(vout2r3 * vtw2i3 + vout2i3 * vtw2r3 + 16384, 15); - const int32_t vtmp2r0 = math_asr_s32(vout3r0 * vtw3r0 - vout3i0 * vtw3i0 + 16384, 15); - const int32_t vtmp2i0 = math_asr_s32(vout3r0 * vtw3i0 + vout3i0 * vtw3r0 + 16384, 15); - const int32_t vtmp2r1 = math_asr_s32(vout3r1 * vtw3r1 - vout3i1 * vtw3i1 + 16384, 15); - const int32_t vtmp2i1 = math_asr_s32(vout3r1 * vtw3i1 + vout3i1 * 
vtw3r1 + 16384, 15); - const int32_t vtmp2r2 = math_asr_s32(vout3r2 * vtw3r2 - vout3i2 * vtw3i2 + 16384, 15); - const int32_t vtmp2i2 = math_asr_s32(vout3r2 * vtw3i2 + vout3i2 * vtw3r2 + 16384, 15); - const int32_t vtmp2r3 = math_asr_s32(vout3r3 * vtw3r3 - vout3i3 * vtw3i3 + 16384, 15); - const int32_t vtmp2i3 = math_asr_s32(vout3r3 * vtw3i3 + vout3i3 * vtw3r3 + 16384, 15); - - const int32_t vtmp5r0 = vout0r0 - vtmp1r0; - const int32_t vtmp5i0 = vout0i0 - vtmp1i0; - const int32_t vtmp5r1 = vout0r1 - vtmp1r1; - const int32_t vtmp5i1 = vout0i1 - vtmp1i1; - const int32_t vtmp5r2 = vout0r2 - vtmp1r2; - const int32_t vtmp5i2 = vout0i2 - vtmp1i2; - const int32_t vtmp5r3 = vout0r3 - vtmp1r3; - const int32_t vtmp5i3 = vout0i3 - vtmp1i3; - vout0r0 += vtmp1r0; - vout0i0 += vtmp1i0; - vout0r1 += vtmp1r1; - vout0i1 += vtmp1i1; - vout0r2 += vtmp1r2; - vout0i2 += vtmp1i2; - vout0r3 += vtmp1r3; - vout0i3 += vtmp1i3; - const int32_t vtmp3r0 = vtmp0r0 + vtmp2r0; - const int32_t vtmp3i0 = vtmp0i0 + vtmp2i0; - const int32_t vtmp3r1 = vtmp0r1 + vtmp2r1; - const int32_t vtmp3i1 = vtmp0i1 + vtmp2i1; - const int32_t vtmp3r2 = vtmp0r2 + vtmp2r2; - const int32_t vtmp3i2 = vtmp0i2 + vtmp2i2; - const int32_t vtmp3r3 = vtmp0r3 + vtmp2r3; - const int32_t vtmp3i3 = vtmp0i3 + vtmp2i3; - const int32_t vtmp4r0 = vtmp0i0 - vtmp2i0; - const int32_t vtmp4i0 = -(vtmp0r0 - vtmp2r0); // swap r,i and neg i - const int32_t vtmp4r1 = vtmp0i1 - vtmp2i1; - const int32_t vtmp4i1 = -(vtmp0r1 - vtmp2r1); // swap r,i and neg i - const int32_t vtmp4r2 = vtmp0i2 - vtmp2i2; - const int32_t vtmp4i2 = -(vtmp0r2 - vtmp2r2); // swap r,i and neg i - const int32_t vtmp4r3 = vtmp0i3 - vtmp2i3; - const int32_t vtmp4i3 = -(vtmp0r3 - vtmp2r3); // swap r,i and neg i - vout2r0 = vout0r0 - vtmp3r0; - vout2i0 = vout0i0 - vtmp3i0; - vout2r1 = vout0r1 - vtmp3r1; - vout2i1 = vout0i1 - vtmp3i1; - vout2r2 = vout0r2 - vtmp3r2; - vout2i2 = vout0i2 - vtmp3i2; - vout2r3 = vout0r3 - vtmp3r3; - vout2i3 = vout0i3 - vtmp3i3; - vout0r0 += 
vtmp3r0; - vout0i0 += vtmp3i0; - vout0r1 += vtmp3r1; - vout0i1 += vtmp3i1; - vout0r2 += vtmp3r2; - vout0i2 += vtmp3i2; - vout0r3 += vtmp3r3; - vout0i3 += vtmp3i3; - vout1r0 = vtmp5r0 + vtmp4r0; - vout1i0 = vtmp5i0 + vtmp4i0; - vout1r1 = vtmp5r1 + vtmp4r1; - vout1i1 = vtmp5i1 + vtmp4i1; - vout1r2 = vtmp5r2 + vtmp4r2; - vout1i2 = vtmp5i2 + vtmp4i2; - vout1r3 = vtmp5r3 + vtmp4r3; - vout1i3 = vtmp5i3 + vtmp4i3; - vout3r0 = vtmp5r0 - vtmp4r0; - vout3i0 = vtmp5i0 - vtmp4i0; - vout3r1 = vtmp5r1 - vtmp4r1; - vout3i1 = vtmp5i1 - vtmp4i1; - vout3r2 = vtmp5r2 - vtmp4r2; - vout3i2 = vtmp5i2 - vtmp4i2; - vout3r3 = vtmp5r3 - vtmp4r3; - vout3i3 = vtmp5i3 - vtmp4i3; - - data0[0] = (int16_t) vout0r0; - data0[1] = (int16_t) vout0i0; - data0[2] = (int16_t) vout0r1; - data0[3] = (int16_t) vout0i1; - data0[4] = (int16_t) vout0r2; - data0[5] = (int16_t) vout0i2; - data0[6] = (int16_t) vout0r3; - data0[7] = (int16_t) vout0i3; - data0 += 4 * 2; - data1[0] = (int16_t) vout1r0; - data1[1] = (int16_t) vout1i0; - data1[2] = (int16_t) vout1r1; - data1[3] = (int16_t) vout1i1; - data1[4] = (int16_t) vout1r2; - data1[5] = (int16_t) vout1i2; - data1[6] = (int16_t) vout1r3; - data1[7] = (int16_t) vout1i3; - data1 += 4 * 2; - data2[0] = (int16_t) vout2r0; - data2[1] = (int16_t) vout2i0; - data2[2] = (int16_t) vout2r1; - data2[3] = (int16_t) vout2i1; - data2[4] = (int16_t) vout2r2; - data2[5] = (int16_t) vout2i2; - data2[6] = (int16_t) vout2r3; - data2[7] = (int16_t) vout2i3; - data2 += 4 * 2; - data3[0] = (int16_t) vout3r0; - data3[1] = (int16_t) vout3i0; - data3[2] = (int16_t) vout3r1; - data3[3] = (int16_t) vout3i1; - data3[4] = (int16_t) vout3r2; - data3[5] = (int16_t) vout3i2; - data3[6] = (int16_t) vout3r3; - data3[7] = (int16_t) vout3i3; - data3 += 4 * 2; - } - if XNN_UNLIKELY(s != 0) { - do { - int32_t vout0r = (int32_t) data0[0]; - int32_t vout0i = (int32_t) data0[1]; - int32_t vout1r = (int32_t) data1[0]; - int32_t vout1i = (int32_t) data1[1]; - int32_t vout2r = (int32_t) data2[0]; - 
int32_t vout2i = (int32_t) data2[1]; - int32_t vout3r = (int32_t) data3[0]; - int32_t vout3i = (int32_t) data3[1]; - - const int32_t vtw1r = (const int32_t) tw1[0]; - const int32_t vtw1i = (const int32_t) tw1[1]; - const int32_t vtw2r = (const int32_t) tw2[0]; - const int32_t vtw2i = (const int32_t) tw2[1]; - const int32_t vtw3r = (const int32_t) tw3[0]; - const int32_t vtw3i = (const int32_t) tw3[1]; - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - - // Note 32767 / 4 = 8191. Should be 8192. - vout0r = math_asr_s32(vout0r * 8191 + 16384, 15); - vout0i = math_asr_s32(vout0i * 8191 + 16384, 15); - vout1r = math_asr_s32(vout1r * 8191 + 16384, 15); - vout1i = math_asr_s32(vout1i * 8191 + 16384, 15); - vout2r = math_asr_s32(vout2r * 8191 + 16384, 15); - vout2i = math_asr_s32(vout2i * 8191 + 16384, 15); - vout3r = math_asr_s32(vout3r * 8191 + 16384, 15); - vout3i = math_asr_s32(vout3i * 8191 + 16384, 15); - - const int32_t vtmp0r = math_asr_s32(vout1r * vtw1r - vout1i * vtw1i + 16384, 15); - const int32_t vtmp0i = math_asr_s32(vout1r * vtw1i + vout1i * vtw1r + 16384, 15); - const int32_t vtmp1r = math_asr_s32(vout2r * vtw2r - vout2i * vtw2i + 16384, 15); - const int32_t vtmp1i = math_asr_s32(vout2r * vtw2i + vout2i * vtw2r + 16384, 15); - const int32_t vtmp2r = math_asr_s32(vout3r * vtw3r - vout3i * vtw3i + 16384, 15); - const int32_t vtmp2i = math_asr_s32(vout3r * vtw3i + vout3i * vtw3r + 16384, 15); - - const int32_t vtmp5r = vout0r - vtmp1r; - const int32_t vtmp5i = vout0i - vtmp1i; - vout0r += vtmp1r; - vout0i += vtmp1i; - const int32_t vtmp3r = vtmp0r + vtmp2r; - const int32_t vtmp3i = vtmp0i + vtmp2i; - const int32_t vtmp4r = vtmp0i - vtmp2i; - const int32_t vtmp4i = -(vtmp0r - vtmp2r); // swap r,i and neg i - vout2r = vout0r - vtmp3r; - vout2i = vout0i - vtmp3i; - - vout0r += vtmp3r; - vout0i += vtmp3i; - - vout1r = vtmp5r + vtmp4r; - vout1i = 
vtmp5i + vtmp4i; - vout3r = vtmp5r - vtmp4r; - vout3i = vtmp5i - vtmp4i; - - data0[0] = (int16_t) vout0r; - data0[1] = (int16_t) vout0i; - data1[0] = (int16_t) vout1r; - data1[1] = (int16_t) vout1i; - data2[0] = (int16_t) vout2r; - data2[1] = (int16_t) vout2i; - data3[0] = (int16_t) vout3r; - data3[1] = (int16_t) vout3i; - data0 += 2; - data1 += 2; - data2 += 2; - data3 += 2; - - s -= sizeof(int16_t) * 2; - } while (s != 0); - } - } while (--batch != 0); -} diff --git a/src/cs16-bfly4/scalar.c.in b/src/cs16-bfly4/scalar.c.in deleted file mode 100644 index 742b1690bbf..00000000000 --- a/src/cs16-bfly4/scalar.c.in +++ /dev/null @@ -1,269 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert SAMPLE_TILE >= 1 -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/fft.h" - - -void xnn_cs16_bfly4_ukernel__scalar_x${SAMPLE_TILE}( - size_t batch, - size_t samples, - int16_t* data, - const int16_t* twiddle, - size_t stride) -{ - assert(batch != 0); - assert(samples != 0); - assert(samples % (sizeof(int16_t) * 2) == 0); - assert(data != NULL); - assert(stride != 0); - assert(twiddle != NULL); - - int16_t* data3 = data; - - do { - int16_t* data0 = data3; - int16_t* data1 = (int16_t*) ((uintptr_t) data0 + samples); - int16_t* data2 = (int16_t*) ((uintptr_t) data1 + samples); - data3 = (int16_t*) ((uintptr_t) data2 + samples); - - const int16_t* tw1 = twiddle; - const int16_t* tw2 = twiddle; - const int16_t* tw3 = twiddle; - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - - size_t s = samples - sizeof(int16_t) * 2; - - // First sample skips twiddle. 
- // Same code as samples=1 but supports stride - { - int32_t vout0r = (int32_t) data0[0]; - int32_t vout0i = (int32_t) data0[1]; - int32_t vout1r = (int32_t) data1[0]; - int32_t vout1i = (int32_t) data1[1]; - int32_t vout2r = (int32_t) data2[0]; - int32_t vout2i = (int32_t) data2[1]; - int32_t vout3r = (int32_t) data3[0]; - int32_t vout3i = (int32_t) data3[1]; - - // Note 32767 / 4 = 8191. Should be 8192. - vout0r = math_asr_s32(vout0r * 8191 + 16384, 15); - vout0i = math_asr_s32(vout0i * 8191 + 16384, 15); - vout1r = math_asr_s32(vout1r * 8191 + 16384, 15); - vout1i = math_asr_s32(vout1i * 8191 + 16384, 15); - vout2r = math_asr_s32(vout2r * 8191 + 16384, 15); - vout2i = math_asr_s32(vout2i * 8191 + 16384, 15); - vout3r = math_asr_s32(vout3r * 8191 + 16384, 15); - vout3i = math_asr_s32(vout3i * 8191 + 16384, 15); - - const int32_t vtmp5r = vout0r - vout2r; - const int32_t vtmp5i = vout0i - vout2i; - vout0r += vout2r; - vout0i += vout2i; - const int32_t vtmp3r = vout1r + vout3r; - const int32_t vtmp3i = vout1i + vout3i; - const int32_t vtmp4r = vout1i - vout3i; - const int32_t vtmp4i = -(vout1r - vout3r); // swap r,i and neg i - vout2r = vout0r - vtmp3r; - vout2i = vout0i - vtmp3i; - - vout0r += vtmp3r; - vout0i += vtmp3i; - - vout1r = vtmp5r + vtmp4r; - vout1i = vtmp5i + vtmp4i; - vout3r = vtmp5r - vtmp4r; - vout3i = vtmp5i - vtmp4i; - - data0[0] = (int16_t) vout0r; - data0[1] = (int16_t) vout0i; - data1[0] = (int16_t) vout1r; - data1[1] = (int16_t) vout1i; - data2[0] = (int16_t) vout2r; - data2[1] = (int16_t) vout2i; - data3[0] = (int16_t) vout3r; - data3[1] = (int16_t) vout3i; - data0 += 2; - data1 += 2; - data2 += 2; - data3 += 2; - } - - $if SAMPLE_TILE > 1: - for (; s >= ${SAMPLE_TILE} * sizeof(int16_t) * 2; s -= ${SAMPLE_TILE} * sizeof(int16_t) * 2) { - $for C in range(SAMPLE_TILE): - int32_t vout0r${C} = (int32_t) data0[${C * 2 + 0}]; - int32_t vout0i${C} = (int32_t) data0[${C * 2 + 1}]; - $for C in range(SAMPLE_TILE): - int32_t vout1r${C} = (int32_t) 
data1[${C * 2 + 0}]; - int32_t vout1i${C} = (int32_t) data1[${C * 2 + 1}]; - $for C in range(SAMPLE_TILE): - int32_t vout2r${C} = (int32_t) data2[${C * 2 + 0}]; - int32_t vout2i${C} = (int32_t) data2[${C * 2 + 1}]; - $for C in range(SAMPLE_TILE): - int32_t vout3r${C} = (int32_t) data3[${C * 2 + 0}]; - int32_t vout3i${C} = (int32_t) data3[${C * 2 + 1}]; - - $for C in range(SAMPLE_TILE): - const int32_t vtw1r${C} = (const int32_t) tw1[0]; - const int32_t vtw1i${C} = (const int32_t) tw1[1]; - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - $for C in range(SAMPLE_TILE): - const int32_t vtw2r${C} = (const int32_t) tw2[0]; - const int32_t vtw2i${C} = (const int32_t) tw2[1]; - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - $for C in range(SAMPLE_TILE): - const int32_t vtw3r${C} = (const int32_t) tw3[0]; - const int32_t vtw3i${C} = (const int32_t) tw3[1]; - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - - // Note 32767 / 4 = 8191. Should be 8192. - $for C in range(SAMPLE_TILE): - vout0r${C} = math_asr_s32(vout0r${C} * 8191 + 16384, 15); - vout0i${C} = math_asr_s32(vout0i${C} * 8191 + 16384, 15); - $for C in range(SAMPLE_TILE): - vout1r${C} = math_asr_s32(vout1r${C} * 8191 + 16384, 15); - vout1i${C} = math_asr_s32(vout1i${C} * 8191 + 16384, 15); - $for C in range(SAMPLE_TILE): - vout2r${C} = math_asr_s32(vout2r${C} * 8191 + 16384, 15); - vout2i${C} = math_asr_s32(vout2i${C} * 8191 + 16384, 15); - $for C in range(SAMPLE_TILE): - vout3r${C} = math_asr_s32(vout3r${C} * 8191 + 16384, 15); - vout3i${C} = math_asr_s32(vout3i${C} * 8191 + 16384, 15); - - $for C in range(SAMPLE_TILE): - const int32_t vtmp0r${C} = math_asr_s32(vout1r${C} * vtw1r${C} - vout1i${C} * vtw1i${C} + 16384, 15); - const int32_t vtmp0i${C} = math_asr_s32(vout1r${C} * vtw1i${C} + vout1i${C} * vtw1r${C} + 16384, 15); - $for C in range(SAMPLE_TILE): - const int32_t vtmp1r${C} = math_asr_s32(vout2r${C} * vtw2r${C} - vout2i${C} * vtw2i${C} + 16384, 15); - const int32_t vtmp1i${C} = 
math_asr_s32(vout2r${C} * vtw2i${C} + vout2i${C} * vtw2r${C} + 16384, 15); - $for C in range(SAMPLE_TILE): - const int32_t vtmp2r${C} = math_asr_s32(vout3r${C} * vtw3r${C} - vout3i${C} * vtw3i${C} + 16384, 15); - const int32_t vtmp2i${C} = math_asr_s32(vout3r${C} * vtw3i${C} + vout3i${C} * vtw3r${C} + 16384, 15); - - $for C in range(SAMPLE_TILE): - const int32_t vtmp5r${C} = vout0r${C} - vtmp1r${C}; - const int32_t vtmp5i${C} = vout0i${C} - vtmp1i${C}; - $for C in range(SAMPLE_TILE): - vout0r${C} += vtmp1r${C}; - vout0i${C} += vtmp1i${C}; - $for C in range(SAMPLE_TILE): - const int32_t vtmp3r${C} = vtmp0r${C} + vtmp2r${C}; - const int32_t vtmp3i${C} = vtmp0i${C} + vtmp2i${C}; - $for C in range(SAMPLE_TILE): - const int32_t vtmp4r${C} = vtmp0i${C} - vtmp2i${C}; - const int32_t vtmp4i${C} = -(vtmp0r${C} - vtmp2r${C}); // swap r,i and neg i - $for C in range(SAMPLE_TILE): - vout2r${C} = vout0r${C} - vtmp3r${C}; - vout2i${C} = vout0i${C} - vtmp3i${C}; - $for C in range(SAMPLE_TILE): - vout0r${C} += vtmp3r${C}; - vout0i${C} += vtmp3i${C}; - $for C in range(SAMPLE_TILE): - vout1r${C} = vtmp5r${C} + vtmp4r${C}; - vout1i${C} = vtmp5i${C} + vtmp4i${C}; - $for C in range(SAMPLE_TILE): - vout3r${C} = vtmp5r${C} - vtmp4r${C}; - vout3i${C} = vtmp5i${C} - vtmp4i${C}; - - $for C in range(SAMPLE_TILE): - data0[${C * 2 + 0}] = (int16_t) vout0r${C}; - data0[${C * 2 + 1}] = (int16_t) vout0i${C}; - data0 += ${SAMPLE_TILE} * 2; - $for C in range(SAMPLE_TILE): - data1[${C * 2 + 0}] = (int16_t) vout1r${C}; - data1[${C * 2 + 1}] = (int16_t) vout1i${C}; - data1 += ${SAMPLE_TILE} * 2; - $for C in range(SAMPLE_TILE): - data2[${C * 2 + 0}] = (int16_t) vout2r${C}; - data2[${C * 2 + 1}] = (int16_t) vout2i${C}; - data2 += ${SAMPLE_TILE} * 2; - $for C in range(SAMPLE_TILE): - data3[${C * 2 + 0}] = (int16_t) vout3r${C}; - data3[${C * 2 + 1}] = (int16_t) vout3i${C}; - data3 += ${SAMPLE_TILE} * 2; - } - if XNN_UNLIKELY(s != 0) { - do { - int32_t vout0r = (int32_t) data0[0]; - int32_t vout0i = 
(int32_t) data0[1]; - int32_t vout1r = (int32_t) data1[0]; - int32_t vout1i = (int32_t) data1[1]; - int32_t vout2r = (int32_t) data2[0]; - int32_t vout2i = (int32_t) data2[1]; - int32_t vout3r = (int32_t) data3[0]; - int32_t vout3i = (int32_t) data3[1]; - - const int32_t vtw1r = (const int32_t) tw1[0]; - const int32_t vtw1i = (const int32_t) tw1[1]; - const int32_t vtw2r = (const int32_t) tw2[0]; - const int32_t vtw2i = (const int32_t) tw2[1]; - const int32_t vtw3r = (const int32_t) tw3[0]; - const int32_t vtw3i = (const int32_t) tw3[1]; - tw1 = (const int16_t*) ((uintptr_t) tw1 + stride); - tw2 = (const int16_t*) ((uintptr_t) tw2 + stride * 2); - tw3 = (const int16_t*) ((uintptr_t) tw3 + stride * 3); - - // Note 32767 / 4 = 8191. Should be 8192. - vout0r = math_asr_s32(vout0r * 8191 + 16384, 15); - vout0i = math_asr_s32(vout0i * 8191 + 16384, 15); - vout1r = math_asr_s32(vout1r * 8191 + 16384, 15); - vout1i = math_asr_s32(vout1i * 8191 + 16384, 15); - vout2r = math_asr_s32(vout2r * 8191 + 16384, 15); - vout2i = math_asr_s32(vout2i * 8191 + 16384, 15); - vout3r = math_asr_s32(vout3r * 8191 + 16384, 15); - vout3i = math_asr_s32(vout3i * 8191 + 16384, 15); - - const int32_t vtmp0r = math_asr_s32(vout1r * vtw1r - vout1i * vtw1i + 16384, 15); - const int32_t vtmp0i = math_asr_s32(vout1r * vtw1i + vout1i * vtw1r + 16384, 15); - const int32_t vtmp1r = math_asr_s32(vout2r * vtw2r - vout2i * vtw2i + 16384, 15); - const int32_t vtmp1i = math_asr_s32(vout2r * vtw2i + vout2i * vtw2r + 16384, 15); - const int32_t vtmp2r = math_asr_s32(vout3r * vtw3r - vout3i * vtw3i + 16384, 15); - const int32_t vtmp2i = math_asr_s32(vout3r * vtw3i + vout3i * vtw3r + 16384, 15); - - const int32_t vtmp5r = vout0r - vtmp1r; - const int32_t vtmp5i = vout0i - vtmp1i; - vout0r += vtmp1r; - vout0i += vtmp1i; - const int32_t vtmp3r = vtmp0r + vtmp2r; - const int32_t vtmp3i = vtmp0i + vtmp2i; - const int32_t vtmp4r = vtmp0i - vtmp2i; - const int32_t vtmp4i = -(vtmp0r - vtmp2r); // swap r,i and neg i - 
vout2r = vout0r - vtmp3r; - vout2i = vout0i - vtmp3i; - - vout0r += vtmp3r; - vout0i += vtmp3i; - - vout1r = vtmp5r + vtmp4r; - vout1i = vtmp5i + vtmp4i; - vout3r = vtmp5r - vtmp4r; - vout3i = vtmp5i - vtmp4i; - - data0[0] = (int16_t) vout0r; - data0[1] = (int16_t) vout0i; - data1[0] = (int16_t) vout1r; - data1[1] = (int16_t) vout1i; - data2[0] = (int16_t) vout2r; - data2[1] = (int16_t) vout2i; - data3[0] = (int16_t) vout3r; - data3[1] = (int16_t) vout3i; - data0 += 2; - data1 += 2; - data2 += 2; - data3 += 2; - - s -= sizeof(int16_t) * 2; - } while (s != 0); - } - } while (--batch != 0); -} diff --git a/src/cs16-fftr/cs16-fftr-asm-aarch32-neon-x1.S b/src/cs16-fftr/cs16-fftr-asm-aarch32-neon-x1.S deleted file mode 100644 index 6b68d502b01..00000000000 --- a/src/cs16-fftr/cs16-fftr-asm-aarch32-neon-x1.S +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "xnnpack/assembly.h" - -.syntax unified - -// void xnn_cs16_fftr_ukernel__asm_aarch32_neon_x1( -// size_t samples, r0 (256) -// int16_t* data, r1 -// const int16_t* twiddle) r2 - -// d8-d15, r12-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved. 
- -// Register usage -// vilr r1 d0 -// vili d1 -// virr r3 d2 -// viri d3 -// vdiv2 d4 -// vtwr r2 d6 -// vtwi d7 - -// vacc1r q8 = vilr + virr; -// vacc1i q9 = vili + viri; -// vacc2r d0 = vilr - virr; -// vacc2i d1 = vili - viri; - -// vaccr q10 (d20/d21) -// vacci q11 (d22/d23) -// voutlr q12 (vaccr + vacc1r) / 2 -// voutli q13 (vacci + vacc1i) / 2 -// voutrr q14 (vacc1r - vaccr) / 2 -// voutri q15 (vacci - vacc1i) / 2 -// unused d5, d8-d15 - -BEGIN_FUNCTION xnn_cs16_fftr_ukernel__asm_aarch32_neon_x1 - .arm -#ifndef __APPLE__ - .arch armv7-a - .fpu neon -#endif - ADD r3, r1, r0, lsl #2 // dr = data + samples * 4 - VMOV.U16 q0, 0 - VMVN.U16 d4, 49152 // 16383 - VLD2.16 {d0[0],d1[0]}, [r1] // first value - VQRDMULH.S16 q0, q0, d4[0] // vilr /= 2 - VADD.I16 d16, d0, d1 // dl[0] = vicr + vici; - VSUB.I16 d18, d0, d1 // dr[0] = vicr - vici; - VST1.32 {d16[0]}, [r1]! - VST1.32 {d18[0]}, [r3] - - // Main loop of 1 cs16 value at a time -0: - SUB r3, r3, 4 // dr -= 4 - VLD2.16 {d0[0],d1[0]}, [r1] // load left r and i - VLD2.16 {d2[0],d3[0]}, [r3] // load right r and i - VLD2.16 {d6[0],d7[0]}, [r2]! 
// load twiddle values vtwr, vtwi - - VQRDMULH.S16 q0, q0, d4[0] // vilr /= 2 - VQRDMULH.S16 q1, q1, d4[0] // virr /= 2 - VADDL.S16 q8, d0, d2 // vacc1r = vilr + virr; - VSUBL.S16 q9, d1, d3 // vacc1i = vili - viri; - - VSUB.I16 d0, d0, d2 // vacc2r = vilr - virr; - VADD.I16 d1, d1, d3 // vacc2i = vili + viri; - - VMULL.S16 q10, d0, d6 // vaccr = vacc2r * vtwr - VMULL.S16 q11, d0, d7 // vacci = vacc2r * vtwi - VMLSL.S16 q10, d1, d7 // vaccr -= vacc2i * vtwi - VMLAL.S16 q11, d1, d6 // vacci += vacc2i * vtwr - VRSHR.S32 q10, q10, 15 // (vaccr + 16384) >> 15 - VRSHR.S32 q11, q11, 15 // (vacci + 16384) >> 15 - - VHADD.S32 q12, q10, q8 // (vaccr + vacc1r) / 2 - VHADD.S32 q13, q11, q9 // (vacci + vacc1i) / 2 - VHSUB.S32 q14, q8, q10 // (vacc1r - vaccr) / 2 - VHSUB.S32 q15, q11, q9 // (vacci - vacc1i) / 2 - - SUBS r0, r0, 2 // 2 samples (left and right) per loop - VST2.16 {d24[0],d26[0]}, [r1]! // store left r and i - VST2.16 {d28[0],d30[0]}, [r3] // store right r and i - BHI 0b - - BX lr - -END_FUNCTION xnn_cs16_fftr_ukernel__asm_aarch32_neon_x1 - -#ifdef __ELF__ -.section ".note.GNU-stack","",%progbits -#endif diff --git a/src/cs16-fftr/cs16-fftr-asm-aarch32-neon-x4.S b/src/cs16-fftr/cs16-fftr-asm-aarch32-neon-x4.S deleted file mode 100644 index a7afd637ade..00000000000 --- a/src/cs16-fftr/cs16-fftr-asm-aarch32-neon-x4.S +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "xnnpack/assembly.h" - -.syntax unified - -// void xnn_cs16_fftr_ukernel__asm_aarch32_neon_x4( -// size_t samples, r0 (256) -// int16_t* data, r1 -// const int16_t* twiddle) r2 - -// d8-d15, r12-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved. 
- -// Register usage -// vilr r1 d0 -// vili d1 -// virr r3 d2 -// viri d3 -// vdiv2 d4 -// vtwr r2 d6 -// vtwi d7 - -// vacc1r q8 = vilr + virr; -// vacc1i q9 = vili + viri; -// vacc2r d0 = vilr - virr; -// vacc2i d1 = vili - viri; - -// vaccr q10 (d20/d21) -// vacci q11 (d22/d23) -// voutlr q12 (vaccr + vacc1r) / 2 -// voutli q13 (vacci + vacc1i) / 2 -// voutrr q14 (vacc1r - vaccr) / 2 -// voutri q15 (vacci - vacc1i) / 2 -// unused d5, d8-d15 - -BEGIN_FUNCTION xnn_cs16_fftr_ukernel__asm_aarch32_neon_x4 - .arm -#ifndef __APPLE__ - .arch armv7-a - .fpu neon -#endif - ADD r3, r1, r0, lsl #2 // dr = data + samples * 4 - VMOV.U16 q0, 0 - VMVN.U16 d4, 49152 // 16383 - VLD2.16 {d0[0],d1[0]}, [r1] // first value - VQRDMULH.S16 q0, q0, d4[0] // vilr /= 2 - VADD.I16 d16, d0, d1 // dl[0] = vicr + vici; - VSUB.I16 d18, d0, d1 // dr[0] = vicr - vici; - VST1.32 {d16[0]}, [r1]! - VST1.32 {d18[0]}, [r3] - - // Main loop of 4 cs16 value at a time -0: - SUB r3, r3, 16 // dr -= 16 - VLD2.16 {d0,d1}, [r1] // load left r and i - VLD2.16 {d2,d3}, [r3] // load right r and i - VLD2.16 {d6,d7}, [r2]! 
// load twiddle values vtwr, vtwi - VREV64.16 q1, q1 // reverse right side - - VQRDMULH.S16 q0, q0, d4[0] // vilr /= 2 - VQRDMULH.S16 q1, q1, d4[0] // virr /= 2 - VADDL.S16 q8, d0, d2 // vacc1r = vilr + virr; - VSUBL.S16 q9, d1, d3 // vacc1i = vili - viri; - - VSUB.I16 d0, d0, d2 // vacc2r = vilr - virr; - VADD.I16 d1, d1, d3 // vacc2i = vili + viri; - - VMULL.S16 q10, d0, d6 // vaccr = vacc2r * vtwr - VMULL.S16 q11, d0, d7 // vacci = vacc2r * vtwi - VMLSL.S16 q10, d1, d7 // vaccr -= vacc2i * vtwi - VMLAL.S16 q11, d1, d6 // vacci += vacc2i * vtwr - VRSHR.S32 q10, q10, 15 // (vaccr + 16384) >> 15 - VRSHR.S32 q11, q11, 15 // (vacci + 16384) >> 15 - - VHADD.S32 q12, q10, q8 // (vaccr + vacc1r) / 2 - VHADD.S32 q13, q11, q9 // (vacci + vacc1i) / 2 - VHSUB.S32 q14, q8, q10 // (vacc1r - vaccr) / 2 - VHSUB.S32 q15, q11, q9 // (vacci - vacc1i) / 2 - - VMOVN.S32 d0, q12 - VMOVN.S32 d1, q13 - VMOVN.S32 d2, q14 - VMOVN.S32 d3, q15 - - SUBS r0, r0, 8 // 8 samples (left and right) per loop - VREV64.16 q1, q1 // reverse right side - - VST2.16 {d0,d1}, [r1]! // store left r and i - VST2.16 {d2,d3}, [r3] // store right r and i - BHI 0b - - BX lr - -END_FUNCTION xnn_cs16_fftr_ukernel__asm_aarch32_neon_x4 - -#ifdef __ELF__ -.section ".note.GNU-stack","",%progbits -#endif diff --git a/src/cs16-fftr/cs16-fftr-neon-x4.c b/src/cs16-fftr/cs16-fftr-neon-x4.c deleted file mode 100644 index bc10c7e3b60..00000000000 --- a/src/cs16-fftr/cs16-fftr-neon-x4.c +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/fft.h" - -#include - -void xnn_cs16_fftr_ukernel__neon_x4( - size_t samples, - int16_t* data, - const int16_t* twiddle) -{ - assert(samples != 0); - assert(samples % 8 == 0); - assert(data != NULL); - assert(twiddle != NULL); - - int16_t* dl = data; - int16_t* dr = data + samples * 2; - int32_t vdcr = (int32_t) dl[0]; - int32_t vdci = (int32_t) dl[1]; - - vdcr = math_asr_s32(vdcr * 16383 + 16384, 15); - vdci = math_asr_s32(vdci * 16383 + 16384, 15); - - dl[0] = vdcr + vdci; - dl[1] = 0; - dl += 2; - dr[0] = vdcr - vdci; - dr[1] = 0; - - const int16x4_t vdiv2 = vdup_n_s16(16383); - - do { - dr -= 8; - const int16x4x2_t vil = vld2_s16(dl); - const int16x4x2_t vir = vld2_s16(dr); - const int16x4x2_t vtw = vld2_s16(twiddle); twiddle += 8; - - int16x4_t virr = vrev64_s16(vir.val[0]); - int16x4_t viri = vrev64_s16(vir.val[1]); - - const int16x4_t vilr = vqrdmulh_s16(vil.val[0], vdiv2); - const int16x4_t vili = vqrdmulh_s16(vil.val[1], vdiv2); - virr = vqrdmulh_s16(virr, vdiv2); - viri = vqrdmulh_s16(viri, vdiv2); - - const int32x4_t vacc1r = vaddl_s16(vilr, virr); - const int32x4_t vacc1i = vsubl_s16(vili, viri); - const int16x4_t vacc2r = vsub_s16(vilr, virr); - const int16x4_t vacc2i = vadd_s16(vili, viri); - - int32x4_t vaccr = vmull_s16(vacc2r, vtw.val[0]); - int32x4_t vacci = vmull_s16(vacc2r, vtw.val[1]); - vaccr = vmlsl_s16(vaccr, vacc2i, vtw.val[1]); - vacci = vmlal_s16(vacci, vacc2i, vtw.val[0]); - vaccr = vrshrq_n_s32(vaccr, 15); - vacci = vrshrq_n_s32(vacci, 15); - - const int32x4_t vacclr = vhaddq_s32(vacc1r, vaccr); - const int32x4_t vaccli = vhaddq_s32(vacc1i, vacci); - const int32x4_t vaccrr = vhsubq_s32(vacc1r, vaccr); - const int32x4_t vaccri = vhsubq_s32(vacci, vacc1i); - - int16x4x2_t voutl; - int16x4x2_t voutr; - voutl.val[0] = vmovn_s32(vacclr); - voutl.val[1] = vmovn_s32(vaccli); - voutr.val[0] = vrev64_s16(vmovn_s32(vaccrr)); - voutr.val[1] = 
vrev64_s16(vmovn_s32(vaccri)); - - vst2_s16(dl, voutl); - vst2_s16(dr, voutr); - dl += 8; - - samples -= 8; - } while(samples != 0); -} diff --git a/src/cs16-fftr/gen/cs16-fftr-scalar-x1.c b/src/cs16-fftr/gen/cs16-fftr-scalar-x1.c deleted file mode 100644 index 631fc594054..00000000000 --- a/src/cs16-fftr/gen/cs16-fftr-scalar-x1.c +++ /dev/null @@ -1,75 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/cs16-fftr/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/fft.h" - - -void xnn_cs16_fftr_ukernel__scalar_x1( - size_t samples, - int16_t* data, - const int16_t* twiddle) -{ - assert(samples != 0); - assert(samples % 2 == 0); - assert(data != NULL); - assert(twiddle != NULL); - - int16_t* dl = data; - int16_t* dr = data + samples * 2; - int32_t vdcr = (int32_t) dl[0]; - int32_t vdci = (int32_t) dl[1]; - - vdcr = math_asr_s32(vdcr * 16383 + 16384, 15); - vdci = math_asr_s32(vdci * 16383 + 16384, 15); - - dl[0] = vdcr + vdci; - dl[1] = 0; - dl += 2; - dr[0] = vdcr - vdci; - dr[1] = 0; - - samples >>= 1; - - - if XNN_UNLIKELY(samples != 0) { - do { - dr -= 2; - int32_t vilr = (int32_t) dl[0]; - int32_t vili = (int32_t) dl[1]; - int32_t virr = (int32_t) dr[0]; - int32_t viri = (int32_t) dr[1]; - const int32_t vtwr = twiddle[0]; - const int32_t vtwi = twiddle[1]; - twiddle += 2; - - vilr = math_asr_s32(vilr * 16383 + 16384, 15); - vili = math_asr_s32(vili * 16383 + 16384, 15); - virr = math_asr_s32(virr * 16383 + 16384, 15); - viri = math_asr_s32(viri * 16383 + 16384, 15); - const int32_t vacc1r = vilr + virr; - const int32_t vacc1i = vili - viri; - const int32_t vacc2r = vilr - virr; - const int32_t vacc2i = vili + viri; - - const int32_t vaccr = math_asr_s32(vacc2r * vtwr - vacc2i * vtwi + 16384, 15); - const int32_t 
vacci = math_asr_s32(vacc2r * vtwi + vacc2i * vtwr + 16384, 15); - - dl[0] = math_asr_s32(vacc1r + vaccr, 1); - dl[1] = math_asr_s32(vacc1i + vacci, 1); - dr[0] = math_asr_s32(vacc1r - vaccr, 1); - dr[1] = math_asr_s32(vacci - vacc1i, 1); - dl += 2; - } while (--samples != 0); - } -} diff --git a/src/cs16-fftr/gen/cs16-fftr-scalar-x2.c b/src/cs16-fftr/gen/cs16-fftr-scalar-x2.c deleted file mode 100644 index eee4d4069d3..00000000000 --- a/src/cs16-fftr/gen/cs16-fftr-scalar-x2.c +++ /dev/null @@ -1,123 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/cs16-fftr/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/fft.h" - - -void xnn_cs16_fftr_ukernel__scalar_x2( - size_t samples, - int16_t* data, - const int16_t* twiddle) -{ - assert(samples != 0); - assert(samples % 2 == 0); - assert(data != NULL); - assert(twiddle != NULL); - - int16_t* dl = data; - int16_t* dr = data + samples * 2; - int32_t vdcr = (int32_t) dl[0]; - int32_t vdci = (int32_t) dl[1]; - - vdcr = math_asr_s32(vdcr * 16383 + 16384, 15); - vdci = math_asr_s32(vdci * 16383 + 16384, 15); - - dl[0] = vdcr + vdci; - dl[1] = 0; - dl += 2; - dr[0] = vdcr - vdci; - dr[1] = 0; - - samples >>= 1; - - for (; samples >= 2; samples -= 2) { - dr -= 2 * 2; - int32_t vilr0 = (int32_t) dl[0]; - int32_t vili0 = (int32_t) dl[1]; - int32_t vilr1 = (int32_t) dl[2]; - int32_t vili1 = (int32_t) dl[3]; - int32_t virr0 = (int32_t) dr[2]; - int32_t viri0 = (int32_t) dr[3]; - int32_t virr1 = (int32_t) dr[0]; - int32_t viri1 = (int32_t) dr[1]; - const int32_t vtwr0 = twiddle[0]; - const int32_t vtwi0 = twiddle[1]; - const int32_t vtwr1 = twiddle[2]; - const int32_t vtwi1 = twiddle[3]; - twiddle += 2 * 2; - - vilr0 = math_asr_s32(vilr0 * 16383 + 16384, 15); - vili0 = 
math_asr_s32(vili0 * 16383 + 16384, 15); - virr0 = math_asr_s32(virr0 * 16383 + 16384, 15); - viri0 = math_asr_s32(viri0 * 16383 + 16384, 15); - vilr1 = math_asr_s32(vilr1 * 16383 + 16384, 15); - vili1 = math_asr_s32(vili1 * 16383 + 16384, 15); - virr1 = math_asr_s32(virr1 * 16383 + 16384, 15); - viri1 = math_asr_s32(viri1 * 16383 + 16384, 15); - const int32_t vacc1r0 = vilr0 + virr0; - const int32_t vacc1i0 = vili0 - viri0; - const int32_t vacc2r0 = vilr0 - virr0; - const int32_t vacc2i0 = vili0 + viri0; - const int32_t vacc1r1 = vilr1 + virr1; - const int32_t vacc1i1 = vili1 - viri1; - const int32_t vacc2r1 = vilr1 - virr1; - const int32_t vacc2i1 = vili1 + viri1; - - const int32_t vaccr0 = math_asr_s32(vacc2r0 * vtwr0 - vacc2i0 * vtwi0 + 16384, 15); - const int32_t vacci0 = math_asr_s32(vacc2r0 * vtwi0 + vacc2i0 * vtwr0 + 16384, 15); - const int32_t vaccr1 = math_asr_s32(vacc2r1 * vtwr1 - vacc2i1 * vtwi1 + 16384, 15); - const int32_t vacci1 = math_asr_s32(vacc2r1 * vtwi1 + vacc2i1 * vtwr1 + 16384, 15); - - dl[0] = math_asr_s32(vacc1r0 + vaccr0, 1); - dl[1] = math_asr_s32(vacc1i0 + vacci0, 1); - dl[2] = math_asr_s32(vacc1r1 + vaccr1, 1); - dl[3] = math_asr_s32(vacc1i1 + vacci1, 1); - dr[2] = math_asr_s32(vacc1r0 - vaccr0, 1); - dr[3] = math_asr_s32(vacci0 - vacc1i0, 1); - dr[0] = math_asr_s32(vacc1r1 - vaccr1, 1); - dr[1] = math_asr_s32(vacci1 - vacc1i1, 1); - dl += 2 * 2; - } - - if XNN_UNLIKELY(samples != 0) { - do { - dr -= 2; - int32_t vilr = (int32_t) dl[0]; - int32_t vili = (int32_t) dl[1]; - int32_t virr = (int32_t) dr[0]; - int32_t viri = (int32_t) dr[1]; - const int32_t vtwr = twiddle[0]; - const int32_t vtwi = twiddle[1]; - twiddle += 2; - - vilr = math_asr_s32(vilr * 16383 + 16384, 15); - vili = math_asr_s32(vili * 16383 + 16384, 15); - virr = math_asr_s32(virr * 16383 + 16384, 15); - viri = math_asr_s32(viri * 16383 + 16384, 15); - const int32_t vacc1r = vilr + virr; - const int32_t vacc1i = vili - viri; - const int32_t vacc2r = vilr - virr; - const 
int32_t vacc2i = vili + viri; - - const int32_t vaccr = math_asr_s32(vacc2r * vtwr - vacc2i * vtwi + 16384, 15); - const int32_t vacci = math_asr_s32(vacc2r * vtwi + vacc2i * vtwr + 16384, 15); - - dl[0] = math_asr_s32(vacc1r + vaccr, 1); - dl[1] = math_asr_s32(vacc1i + vacci, 1); - dr[0] = math_asr_s32(vacc1r - vaccr, 1); - dr[1] = math_asr_s32(vacci - vacc1i, 1); - dl += 2; - } while (--samples != 0); - } -} diff --git a/src/cs16-fftr/gen/cs16-fftr-scalar-x4.c b/src/cs16-fftr/gen/cs16-fftr-scalar-x4.c deleted file mode 100644 index 5e0b4a8a3a2..00000000000 --- a/src/cs16-fftr/gen/cs16-fftr-scalar-x4.c +++ /dev/null @@ -1,163 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/cs16-fftr/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/fft.h" - - -void xnn_cs16_fftr_ukernel__scalar_x4( - size_t samples, - int16_t* data, - const int16_t* twiddle) -{ - assert(samples != 0); - assert(samples % 2 == 0); - assert(data != NULL); - assert(twiddle != NULL); - - int16_t* dl = data; - int16_t* dr = data + samples * 2; - int32_t vdcr = (int32_t) dl[0]; - int32_t vdci = (int32_t) dl[1]; - - vdcr = math_asr_s32(vdcr * 16383 + 16384, 15); - vdci = math_asr_s32(vdci * 16383 + 16384, 15); - - dl[0] = vdcr + vdci; - dl[1] = 0; - dl += 2; - dr[0] = vdcr - vdci; - dr[1] = 0; - - samples >>= 1; - - for (; samples >= 4; samples -= 4) { - dr -= 4 * 2; - int32_t vilr0 = (int32_t) dl[0]; - int32_t vili0 = (int32_t) dl[1]; - int32_t vilr1 = (int32_t) dl[2]; - int32_t vili1 = (int32_t) dl[3]; - int32_t vilr2 = (int32_t) dl[4]; - int32_t vili2 = (int32_t) dl[5]; - int32_t vilr3 = (int32_t) dl[6]; - int32_t vili3 = (int32_t) dl[7]; - int32_t virr0 = (int32_t) dr[6]; - int32_t viri0 = (int32_t) dr[7]; - int32_t virr1 = (int32_t) dr[4]; - 
int32_t viri1 = (int32_t) dr[5]; - int32_t virr2 = (int32_t) dr[2]; - int32_t viri2 = (int32_t) dr[3]; - int32_t virr3 = (int32_t) dr[0]; - int32_t viri3 = (int32_t) dr[1]; - const int32_t vtwr0 = twiddle[0]; - const int32_t vtwi0 = twiddle[1]; - const int32_t vtwr1 = twiddle[2]; - const int32_t vtwi1 = twiddle[3]; - const int32_t vtwr2 = twiddle[4]; - const int32_t vtwi2 = twiddle[5]; - const int32_t vtwr3 = twiddle[6]; - const int32_t vtwi3 = twiddle[7]; - twiddle += 4 * 2; - - vilr0 = math_asr_s32(vilr0 * 16383 + 16384, 15); - vili0 = math_asr_s32(vili0 * 16383 + 16384, 15); - virr0 = math_asr_s32(virr0 * 16383 + 16384, 15); - viri0 = math_asr_s32(viri0 * 16383 + 16384, 15); - vilr1 = math_asr_s32(vilr1 * 16383 + 16384, 15); - vili1 = math_asr_s32(vili1 * 16383 + 16384, 15); - virr1 = math_asr_s32(virr1 * 16383 + 16384, 15); - viri1 = math_asr_s32(viri1 * 16383 + 16384, 15); - vilr2 = math_asr_s32(vilr2 * 16383 + 16384, 15); - vili2 = math_asr_s32(vili2 * 16383 + 16384, 15); - virr2 = math_asr_s32(virr2 * 16383 + 16384, 15); - viri2 = math_asr_s32(viri2 * 16383 + 16384, 15); - vilr3 = math_asr_s32(vilr3 * 16383 + 16384, 15); - vili3 = math_asr_s32(vili3 * 16383 + 16384, 15); - virr3 = math_asr_s32(virr3 * 16383 + 16384, 15); - viri3 = math_asr_s32(viri3 * 16383 + 16384, 15); - const int32_t vacc1r0 = vilr0 + virr0; - const int32_t vacc1i0 = vili0 - viri0; - const int32_t vacc2r0 = vilr0 - virr0; - const int32_t vacc2i0 = vili0 + viri0; - const int32_t vacc1r1 = vilr1 + virr1; - const int32_t vacc1i1 = vili1 - viri1; - const int32_t vacc2r1 = vilr1 - virr1; - const int32_t vacc2i1 = vili1 + viri1; - const int32_t vacc1r2 = vilr2 + virr2; - const int32_t vacc1i2 = vili2 - viri2; - const int32_t vacc2r2 = vilr2 - virr2; - const int32_t vacc2i2 = vili2 + viri2; - const int32_t vacc1r3 = vilr3 + virr3; - const int32_t vacc1i3 = vili3 - viri3; - const int32_t vacc2r3 = vilr3 - virr3; - const int32_t vacc2i3 = vili3 + viri3; - - const int32_t vaccr0 = 
math_asr_s32(vacc2r0 * vtwr0 - vacc2i0 * vtwi0 + 16384, 15); - const int32_t vacci0 = math_asr_s32(vacc2r0 * vtwi0 + vacc2i0 * vtwr0 + 16384, 15); - const int32_t vaccr1 = math_asr_s32(vacc2r1 * vtwr1 - vacc2i1 * vtwi1 + 16384, 15); - const int32_t vacci1 = math_asr_s32(vacc2r1 * vtwi1 + vacc2i1 * vtwr1 + 16384, 15); - const int32_t vaccr2 = math_asr_s32(vacc2r2 * vtwr2 - vacc2i2 * vtwi2 + 16384, 15); - const int32_t vacci2 = math_asr_s32(vacc2r2 * vtwi2 + vacc2i2 * vtwr2 + 16384, 15); - const int32_t vaccr3 = math_asr_s32(vacc2r3 * vtwr3 - vacc2i3 * vtwi3 + 16384, 15); - const int32_t vacci3 = math_asr_s32(vacc2r3 * vtwi3 + vacc2i3 * vtwr3 + 16384, 15); - - dl[0] = math_asr_s32(vacc1r0 + vaccr0, 1); - dl[1] = math_asr_s32(vacc1i0 + vacci0, 1); - dl[2] = math_asr_s32(vacc1r1 + vaccr1, 1); - dl[3] = math_asr_s32(vacc1i1 + vacci1, 1); - dl[4] = math_asr_s32(vacc1r2 + vaccr2, 1); - dl[5] = math_asr_s32(vacc1i2 + vacci2, 1); - dl[6] = math_asr_s32(vacc1r3 + vaccr3, 1); - dl[7] = math_asr_s32(vacc1i3 + vacci3, 1); - dr[6] = math_asr_s32(vacc1r0 - vaccr0, 1); - dr[7] = math_asr_s32(vacci0 - vacc1i0, 1); - dr[4] = math_asr_s32(vacc1r1 - vaccr1, 1); - dr[5] = math_asr_s32(vacci1 - vacc1i1, 1); - dr[2] = math_asr_s32(vacc1r2 - vaccr2, 1); - dr[3] = math_asr_s32(vacci2 - vacc1i2, 1); - dr[0] = math_asr_s32(vacc1r3 - vaccr3, 1); - dr[1] = math_asr_s32(vacci3 - vacc1i3, 1); - dl += 4 * 2; - } - - if XNN_UNLIKELY(samples != 0) { - do { - dr -= 2; - int32_t vilr = (int32_t) dl[0]; - int32_t vili = (int32_t) dl[1]; - int32_t virr = (int32_t) dr[0]; - int32_t viri = (int32_t) dr[1]; - const int32_t vtwr = twiddle[0]; - const int32_t vtwi = twiddle[1]; - twiddle += 2; - - vilr = math_asr_s32(vilr * 16383 + 16384, 15); - vili = math_asr_s32(vili * 16383 + 16384, 15); - virr = math_asr_s32(virr * 16383 + 16384, 15); - viri = math_asr_s32(viri * 16383 + 16384, 15); - const int32_t vacc1r = vilr + virr; - const int32_t vacc1i = vili - viri; - const int32_t vacc2r = vilr - virr; - const 
int32_t vacc2i = vili + viri; - - const int32_t vaccr = math_asr_s32(vacc2r * vtwr - vacc2i * vtwi + 16384, 15); - const int32_t vacci = math_asr_s32(vacc2r * vtwi + vacc2i * vtwr + 16384, 15); - - dl[0] = math_asr_s32(vacc1r + vaccr, 1); - dl[1] = math_asr_s32(vacc1i + vacci, 1); - dr[0] = math_asr_s32(vacc1r - vaccr, 1); - dr[1] = math_asr_s32(vacci - vacc1i, 1); - dl += 2; - } while (--samples != 0); - } -} diff --git a/src/cs16-fftr/scalar.c.in b/src/cs16-fftr/scalar.c.in deleted file mode 100644 index 905bfc4e968..00000000000 --- a/src/cs16-fftr/scalar.c.in +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert SAMPLE_TILE >= 1 -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/fft.h" - - -void xnn_cs16_fftr_ukernel__scalar_x${SAMPLE_TILE}( - size_t samples, - int16_t* data, - const int16_t* twiddle) -{ - assert(samples != 0); - assert(samples % 2 == 0); - assert(data != NULL); - assert(twiddle != NULL); - - int16_t* dl = data; - int16_t* dr = data + samples * 2; - int32_t vdcr = (int32_t) dl[0]; - int32_t vdci = (int32_t) dl[1]; - - vdcr = math_asr_s32(vdcr * 16383 + 16384, 15); - vdci = math_asr_s32(vdci * 16383 + 16384, 15); - - dl[0] = vdcr + vdci; - dl[1] = 0; - dl += 2; - dr[0] = vdcr - vdci; - dr[1] = 0; - - samples >>= 1; - - $if SAMPLE_TILE > 1: - for (; samples >= ${SAMPLE_TILE}; samples -= ${SAMPLE_TILE}) { - dr -= ${SAMPLE_TILE} * 2; - $for C in range(SAMPLE_TILE): - int32_t vilr${C} = (int32_t) dl[${C * 2 + 0}]; - int32_t vili${C} = (int32_t) dl[${C * 2 + 1}]; - $for C in range(SAMPLE_TILE): - int32_t virr${C} = (int32_t) dr[${(SAMPLE_TILE - 1 - C) * 2 + 0}]; - int32_t viri${C} = (int32_t) dr[${(SAMPLE_TILE - 1 - C) * 2 + 1}]; - $for C in range(SAMPLE_TILE): - const int32_t vtwr${C} = twiddle[${C * 2 + 0}]; - const int32_t vtwi${C} = twiddle[${C * 2 + 1}]; - twiddle 
+= ${SAMPLE_TILE} * 2; - - $for C in range(SAMPLE_TILE): - vilr${C} = math_asr_s32(vilr${C} * 16383 + 16384, 15); - vili${C} = math_asr_s32(vili${C} * 16383 + 16384, 15); - virr${C} = math_asr_s32(virr${C} * 16383 + 16384, 15); - viri${C} = math_asr_s32(viri${C} * 16383 + 16384, 15); - $for C in range(SAMPLE_TILE): - const int32_t vacc1r${C} = vilr${C} + virr${C}; - const int32_t vacc1i${C} = vili${C} - viri${C}; - const int32_t vacc2r${C} = vilr${C} - virr${C}; - const int32_t vacc2i${C} = vili${C} + viri${C}; - - $for C in range(SAMPLE_TILE): - const int32_t vaccr${C} = math_asr_s32(vacc2r${C} * vtwr${C} - vacc2i${C} * vtwi${C} + 16384, 15); - const int32_t vacci${C} = math_asr_s32(vacc2r${C} * vtwi${C} + vacc2i${C} * vtwr${C} + 16384, 15); - - $for C in range(SAMPLE_TILE): - dl[${C * 2 + 0}] = math_asr_s32(vacc1r${C} + vaccr${C}, 1); - dl[${C * 2 + 1}] = math_asr_s32(vacc1i${C} + vacci${C}, 1); - $for C in range(SAMPLE_TILE): - dr[${(SAMPLE_TILE - 1 - C) * 2 + 0}] = math_asr_s32(vacc1r${C} - vaccr${C}, 1); - dr[${(SAMPLE_TILE - 1 - C) * 2 + 1}] = math_asr_s32(vacci${C} - vacc1i${C}, 1); - dl += ${SAMPLE_TILE} * 2; - } - - if XNN_UNLIKELY(samples != 0) { - do { - dr -= 2; - int32_t vilr = (int32_t) dl[0]; - int32_t vili = (int32_t) dl[1]; - int32_t virr = (int32_t) dr[0]; - int32_t viri = (int32_t) dr[1]; - const int32_t vtwr = twiddle[0]; - const int32_t vtwi = twiddle[1]; - twiddle += 2; - - vilr = math_asr_s32(vilr * 16383 + 16384, 15); - vili = math_asr_s32(vili * 16383 + 16384, 15); - virr = math_asr_s32(virr * 16383 + 16384, 15); - viri = math_asr_s32(viri * 16383 + 16384, 15); - const int32_t vacc1r = vilr + virr; - const int32_t vacc1i = vili - viri; - const int32_t vacc2r = vilr - virr; - const int32_t vacc2i = vili + viri; - - const int32_t vaccr = math_asr_s32(vacc2r * vtwr - vacc2i * vtwi + 16384, 15); - const int32_t vacci = math_asr_s32(vacc2r * vtwi + vacc2i * vtwr + 16384, 15); - - dl[0] = math_asr_s32(vacc1r + vaccr, 1); - dl[1] = 
math_asr_s32(vacc1i + vacci, 1); - dr[0] = math_asr_s32(vacc1r - vaccr, 1); - dr[1] = math_asr_s32(vacci - vacc1i, 1); - dl += 2; - } while (--samples != 0); - } -} diff --git a/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x10.c b/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x10.c deleted file mode 100644 index b96e4820b21..00000000000 --- a/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x10.c +++ /dev/null @@ -1,64 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/cs16-vsquareabs/hexagon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include -#include - -#include "xnnpack/vsquareabs.h" - - -void xnn_cs16_vsquareabs_ukernel__hexagon_x10( - size_t batch, - const int16_t* input, - uint32_t* output) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % (sizeof(int16_t) * 2) == 0); - assert(input != NULL); - assert(output != NULL); - - const HEXAGON_Vect64* i = (const HEXAGON_Vect64*) input; - HEXAGON_Vect64* o = (HEXAGON_Vect64*) output; - for (; batch >= 20 * sizeof(int16_t); batch -= 20 * sizeof(int16_t)) { - HEXAGON_Vect64 vacc0 = *i++; - HEXAGON_Vect64 vacc1 = *i++; - HEXAGON_Vect64 vacc2 = *i++; - HEXAGON_Vect64 vacc3 = *i++; - HEXAGON_Vect64 vacc4 = *i++; - - vacc0 = Q6_P_vdmpy_PP_sat(vacc0, vacc0); - vacc1 = Q6_P_vdmpy_PP_sat(vacc1, vacc1); - vacc2 = Q6_P_vdmpy_PP_sat(vacc2, vacc2); - vacc3 = Q6_P_vdmpy_PP_sat(vacc3, vacc3); - vacc4 = Q6_P_vdmpy_PP_sat(vacc4, vacc4); - - *o++ = vacc0; - *o++ = vacc1; - *o++ = vacc2; - *o++ = vacc3; - *o++ = vacc4; - } - for (; batch >= 4 * sizeof(int16_t); batch -= 4 * sizeof(int16_t)) { - HEXAGON_Vect64 vacc = *i++; - vacc = Q6_P_vdmpy_PP_sat(vacc, vacc); - *o++ = vacc; - } - if XNN_LIKELY(batch != 0) { - assert(batch == 2 * sizeof(int16_t)); - - const HEXAGON_Vect32 vi = *((const HEXAGON_Vect32*) i); - 
HEXAGON_Vect32 vacc = Q6_R_mpy_RlRl(vi, vi); - vacc = Q6_R_mpyacc_RhRh_sat(vacc, vi, vi); - *((HEXAGON_Vect32*) o) = vacc; - } -} diff --git a/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x12.c b/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x12.c deleted file mode 100644 index 92d9574e451..00000000000 --- a/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x12.c +++ /dev/null @@ -1,67 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/cs16-vsquareabs/hexagon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include -#include - -#include "xnnpack/vsquareabs.h" - - -void xnn_cs16_vsquareabs_ukernel__hexagon_x12( - size_t batch, - const int16_t* input, - uint32_t* output) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % (sizeof(int16_t) * 2) == 0); - assert(input != NULL); - assert(output != NULL); - - const HEXAGON_Vect64* i = (const HEXAGON_Vect64*) input; - HEXAGON_Vect64* o = (HEXAGON_Vect64*) output; - for (; batch >= 24 * sizeof(int16_t); batch -= 24 * sizeof(int16_t)) { - HEXAGON_Vect64 vacc0 = *i++; - HEXAGON_Vect64 vacc1 = *i++; - HEXAGON_Vect64 vacc2 = *i++; - HEXAGON_Vect64 vacc3 = *i++; - HEXAGON_Vect64 vacc4 = *i++; - HEXAGON_Vect64 vacc5 = *i++; - - vacc0 = Q6_P_vdmpy_PP_sat(vacc0, vacc0); - vacc1 = Q6_P_vdmpy_PP_sat(vacc1, vacc1); - vacc2 = Q6_P_vdmpy_PP_sat(vacc2, vacc2); - vacc3 = Q6_P_vdmpy_PP_sat(vacc3, vacc3); - vacc4 = Q6_P_vdmpy_PP_sat(vacc4, vacc4); - vacc5 = Q6_P_vdmpy_PP_sat(vacc5, vacc5); - - *o++ = vacc0; - *o++ = vacc1; - *o++ = vacc2; - *o++ = vacc3; - *o++ = vacc4; - *o++ = vacc5; - } - for (; batch >= 4 * sizeof(int16_t); batch -= 4 * sizeof(int16_t)) { - HEXAGON_Vect64 vacc = *i++; - vacc = Q6_P_vdmpy_PP_sat(vacc, vacc); - *o++ = vacc; - } - if XNN_LIKELY(batch != 0) { - assert(batch == 2 * sizeof(int16_t)); - - const 
HEXAGON_Vect32 vi = *((const HEXAGON_Vect32*) i); - HEXAGON_Vect32 vacc = Q6_R_mpy_RlRl(vi, vi); - vacc = Q6_R_mpyacc_RhRh_sat(vacc, vi, vi); - *((HEXAGON_Vect32*) o) = vacc; - } -} diff --git a/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x2.c b/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x2.c deleted file mode 100644 index 67b79798cc9..00000000000 --- a/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x2.c +++ /dev/null @@ -1,45 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/cs16-vsquareabs/hexagon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include -#include - -#include "xnnpack/vsquareabs.h" - - -void xnn_cs16_vsquareabs_ukernel__hexagon_x2( - size_t batch, - const int16_t* input, - uint32_t* output) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % (sizeof(int16_t) * 2) == 0); - assert(input != NULL); - assert(output != NULL); - - const HEXAGON_Vect64* i = (const HEXAGON_Vect64*) input; - HEXAGON_Vect64* o = (HEXAGON_Vect64*) output; - for (; batch >= 4 * sizeof(int16_t); batch -= 4 * sizeof(int16_t)) { - HEXAGON_Vect64 vacc = *i++; - vacc = Q6_P_vdmpy_PP_sat(vacc, vacc); - *o++ = vacc; - } - if XNN_LIKELY(batch != 0) { - assert(batch == 2 * sizeof(int16_t)); - - const HEXAGON_Vect32 vi = *((const HEXAGON_Vect32*) i); - HEXAGON_Vect32 vacc = Q6_R_mpy_RlRl(vi, vi); - vacc = Q6_R_mpyacc_RhRh_sat(vacc, vi, vi); - *((HEXAGON_Vect32*) o) = vacc; - } -} diff --git a/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x4.c b/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x4.c deleted file mode 100644 index 007d01efd6e..00000000000 --- a/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x4.c +++ /dev/null @@ -1,55 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/cs16-vsquareabs/hexagon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include -#include - -#include "xnnpack/vsquareabs.h" - - -void xnn_cs16_vsquareabs_ukernel__hexagon_x4( - size_t batch, - const int16_t* input, - uint32_t* output) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % (sizeof(int16_t) * 2) == 0); - assert(input != NULL); - assert(output != NULL); - - const HEXAGON_Vect64* i = (const HEXAGON_Vect64*) input; - HEXAGON_Vect64* o = (HEXAGON_Vect64*) output; - for (; batch >= 8 * sizeof(int16_t); batch -= 8 * sizeof(int16_t)) { - HEXAGON_Vect64 vacc0 = *i++; - HEXAGON_Vect64 vacc1 = *i++; - - vacc0 = Q6_P_vdmpy_PP_sat(vacc0, vacc0); - vacc1 = Q6_P_vdmpy_PP_sat(vacc1, vacc1); - - *o++ = vacc0; - *o++ = vacc1; - } - for (; batch >= 4 * sizeof(int16_t); batch -= 4 * sizeof(int16_t)) { - HEXAGON_Vect64 vacc = *i++; - vacc = Q6_P_vdmpy_PP_sat(vacc, vacc); - *o++ = vacc; - } - if XNN_LIKELY(batch != 0) { - assert(batch == 2 * sizeof(int16_t)); - - const HEXAGON_Vect32 vi = *((const HEXAGON_Vect32*) i); - HEXAGON_Vect32 vacc = Q6_R_mpy_RlRl(vi, vi); - vacc = Q6_R_mpyacc_RhRh_sat(vacc, vi, vi); - *((HEXAGON_Vect32*) o) = vacc; - } -} diff --git a/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x6.c b/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x6.c deleted file mode 100644 index d9f1617578e..00000000000 --- a/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x6.c +++ /dev/null @@ -1,58 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/cs16-vsquareabs/hexagon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include -#include - -#include "xnnpack/vsquareabs.h" - - -void xnn_cs16_vsquareabs_ukernel__hexagon_x6( - size_t batch, - const int16_t* input, - uint32_t* output) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % (sizeof(int16_t) * 2) == 0); - assert(input != NULL); - assert(output != NULL); - - const HEXAGON_Vect64* i = (const HEXAGON_Vect64*) input; - HEXAGON_Vect64* o = (HEXAGON_Vect64*) output; - for (; batch >= 12 * sizeof(int16_t); batch -= 12 * sizeof(int16_t)) { - HEXAGON_Vect64 vacc0 = *i++; - HEXAGON_Vect64 vacc1 = *i++; - HEXAGON_Vect64 vacc2 = *i++; - - vacc0 = Q6_P_vdmpy_PP_sat(vacc0, vacc0); - vacc1 = Q6_P_vdmpy_PP_sat(vacc1, vacc1); - vacc2 = Q6_P_vdmpy_PP_sat(vacc2, vacc2); - - *o++ = vacc0; - *o++ = vacc1; - *o++ = vacc2; - } - for (; batch >= 4 * sizeof(int16_t); batch -= 4 * sizeof(int16_t)) { - HEXAGON_Vect64 vacc = *i++; - vacc = Q6_P_vdmpy_PP_sat(vacc, vacc); - *o++ = vacc; - } - if XNN_LIKELY(batch != 0) { - assert(batch == 2 * sizeof(int16_t)); - - const HEXAGON_Vect32 vi = *((const HEXAGON_Vect32*) i); - HEXAGON_Vect32 vacc = Q6_R_mpy_RlRl(vi, vi); - vacc = Q6_R_mpyacc_RhRh_sat(vacc, vi, vi); - *((HEXAGON_Vect32*) o) = vacc; - } -} diff --git a/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x8.c b/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x8.c deleted file mode 100644 index b5d1345db3e..00000000000 --- a/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x8.c +++ /dev/null @@ -1,61 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/cs16-vsquareabs/hexagon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include -#include - -#include "xnnpack/vsquareabs.h" - - -void xnn_cs16_vsquareabs_ukernel__hexagon_x8( - size_t batch, - const int16_t* input, - uint32_t* output) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % (sizeof(int16_t) * 2) == 0); - assert(input != NULL); - assert(output != NULL); - - const HEXAGON_Vect64* i = (const HEXAGON_Vect64*) input; - HEXAGON_Vect64* o = (HEXAGON_Vect64*) output; - for (; batch >= 16 * sizeof(int16_t); batch -= 16 * sizeof(int16_t)) { - HEXAGON_Vect64 vacc0 = *i++; - HEXAGON_Vect64 vacc1 = *i++; - HEXAGON_Vect64 vacc2 = *i++; - HEXAGON_Vect64 vacc3 = *i++; - - vacc0 = Q6_P_vdmpy_PP_sat(vacc0, vacc0); - vacc1 = Q6_P_vdmpy_PP_sat(vacc1, vacc1); - vacc2 = Q6_P_vdmpy_PP_sat(vacc2, vacc2); - vacc3 = Q6_P_vdmpy_PP_sat(vacc3, vacc3); - - *o++ = vacc0; - *o++ = vacc1; - *o++ = vacc2; - *o++ = vacc3; - } - for (; batch >= 4 * sizeof(int16_t); batch -= 4 * sizeof(int16_t)) { - HEXAGON_Vect64 vacc = *i++; - vacc = Q6_P_vdmpy_PP_sat(vacc, vacc); - *o++ = vacc; - } - if XNN_LIKELY(batch != 0) { - assert(batch == 2 * sizeof(int16_t)); - - const HEXAGON_Vect32 vi = *((const HEXAGON_Vect32*) i); - HEXAGON_Vect32 vacc = Q6_R_mpy_RlRl(vi, vi); - vacc = Q6_R_mpyacc_RhRh_sat(vacc, vi, vi); - *((HEXAGON_Vect32*) o) = vacc; - } -} diff --git a/src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x12.c b/src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x12.c deleted file mode 100644 index e281c750c9e..00000000000 --- a/src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x12.c +++ /dev/null @@ -1,65 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/cs16-vsquareabs/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include - -#include "xnnpack/vsquareabs.h" - - -void xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x12( - size_t batch, - const int16_t* input, - uint32_t* output) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % (sizeof(int16_t) * 2) == 0); - assert(input != NULL); - assert(output != NULL); - - for (; batch >= 24 * sizeof(int16_t); batch -= 24 * sizeof(int16_t)) { - const int16x4x2_t vi0 = vld2_s16(input); input += 8; - const int16x4x2_t vi1 = vld2_s16(input); input += 8; - const int16x4x2_t vi2 = vld2_s16(input); input += 8; - - int32x4_t vacc0 = vmull_s16(vi0.val[0], vi0.val[0]); - int32x4_t vacc1 = vmull_s16(vi1.val[0], vi1.val[0]); - int32x4_t vacc2 = vmull_s16(vi2.val[0], vi2.val[0]); - - vacc0 = vmlal_s16(vacc0, vi0.val[1], vi0.val[1]); - vacc1 = vmlal_s16(vacc1, vi1.val[1], vi1.val[1]); - vacc2 = vmlal_s16(vacc2, vi2.val[1], vi2.val[1]); - - vst1q_u32(output, vreinterpretq_u32_s32(vacc0)); output += 4; - vst1q_u32(output, vreinterpretq_u32_s32(vacc1)); output += 4; - vst1q_u32(output, vreinterpretq_u32_s32(vacc2)); output += 4; - } - for (; batch >= 8 * sizeof(int16_t); batch -= 8 * sizeof(int16_t)) { - const int16x4x2_t vi = vld2_s16(input); input += 8; - int32x4_t vacc = vmull_s16(vi.val[0], vi.val[0]); - vacc = vmlal_s16(vacc, vi.val[1], vi.val[1]); - vst1q_u32(output, vreinterpretq_u32_s32(vacc)); output += 4; - } - if XNN_LIKELY(batch != 0) { - const int16x4x2_t vi = vld2_s16(input); - int32x4_t vacc = vmull_s16(vi.val[0], vi.val[0]); - vacc = vmlal_s16(vacc, vi.val[1], vi.val[1]); - uint32x2_t vacc_lo = vreinterpret_u32_s32(vget_low_s32(vacc)); - if (batch & (4 * sizeof(int16_t))) { - vst1_u32(output, vacc_lo); output += 2; - vacc_lo = vreinterpret_u32_s32(vget_high_s32(vacc)); - } - if (batch & (2 * sizeof(int16_t))) { - vst1_lane_u32(output, vacc_lo, 0); - } - } -} diff --git a/src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x16.c b/src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x16.c 
deleted file mode 100644 index 353ce0fb822..00000000000 --- a/src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x16.c +++ /dev/null @@ -1,69 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/cs16-vsquareabs/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include - -#include "xnnpack/vsquareabs.h" - - -void xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x16( - size_t batch, - const int16_t* input, - uint32_t* output) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % (sizeof(int16_t) * 2) == 0); - assert(input != NULL); - assert(output != NULL); - - for (; batch >= 32 * sizeof(int16_t); batch -= 32 * sizeof(int16_t)) { - const int16x4x2_t vi0 = vld2_s16(input); input += 8; - const int16x4x2_t vi1 = vld2_s16(input); input += 8; - const int16x4x2_t vi2 = vld2_s16(input); input += 8; - const int16x4x2_t vi3 = vld2_s16(input); input += 8; - - int32x4_t vacc0 = vmull_s16(vi0.val[0], vi0.val[0]); - int32x4_t vacc1 = vmull_s16(vi1.val[0], vi1.val[0]); - int32x4_t vacc2 = vmull_s16(vi2.val[0], vi2.val[0]); - int32x4_t vacc3 = vmull_s16(vi3.val[0], vi3.val[0]); - - vacc0 = vmlal_s16(vacc0, vi0.val[1], vi0.val[1]); - vacc1 = vmlal_s16(vacc1, vi1.val[1], vi1.val[1]); - vacc2 = vmlal_s16(vacc2, vi2.val[1], vi2.val[1]); - vacc3 = vmlal_s16(vacc3, vi3.val[1], vi3.val[1]); - - vst1q_u32(output, vreinterpretq_u32_s32(vacc0)); output += 4; - vst1q_u32(output, vreinterpretq_u32_s32(vacc1)); output += 4; - vst1q_u32(output, vreinterpretq_u32_s32(vacc2)); output += 4; - vst1q_u32(output, vreinterpretq_u32_s32(vacc3)); output += 4; - } - for (; batch >= 8 * sizeof(int16_t); batch -= 8 * sizeof(int16_t)) { - const int16x4x2_t vi = vld2_s16(input); input += 8; - int32x4_t vacc = vmull_s16(vi.val[0], vi.val[0]); - vacc = vmlal_s16(vacc, vi.val[1], vi.val[1]); - 
vst1q_u32(output, vreinterpretq_u32_s32(vacc)); output += 4; - } - if XNN_LIKELY(batch != 0) { - const int16x4x2_t vi = vld2_s16(input); - int32x4_t vacc = vmull_s16(vi.val[0], vi.val[0]); - vacc = vmlal_s16(vacc, vi.val[1], vi.val[1]); - uint32x2_t vacc_lo = vreinterpret_u32_s32(vget_low_s32(vacc)); - if (batch & (4 * sizeof(int16_t))) { - vst1_u32(output, vacc_lo); output += 2; - vacc_lo = vreinterpret_u32_s32(vget_high_s32(vacc)); - } - if (batch & (2 * sizeof(int16_t))) { - vst1_lane_u32(output, vacc_lo, 0); - } - } -} diff --git a/src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x4.c b/src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x4.c deleted file mode 100644 index 930cfab7d0b..00000000000 --- a/src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x4.c +++ /dev/null @@ -1,48 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/cs16-vsquareabs/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include - -#include "xnnpack/vsquareabs.h" - - -void xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x4( - size_t batch, - const int16_t* input, - uint32_t* output) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % (sizeof(int16_t) * 2) == 0); - assert(input != NULL); - assert(output != NULL); - - for (; batch >= 8 * sizeof(int16_t); batch -= 8 * sizeof(int16_t)) { - const int16x4x2_t vi = vld2_s16(input); input += 8; - int32x4_t vacc = vmull_s16(vi.val[0], vi.val[0]); - vacc = vmlal_s16(vacc, vi.val[1], vi.val[1]); - vst1q_u32(output, vreinterpretq_u32_s32(vacc)); output += 4; - } - if XNN_LIKELY(batch != 0) { - const int16x4x2_t vi = vld2_s16(input); - int32x4_t vacc = vmull_s16(vi.val[0], vi.val[0]); - vacc = vmlal_s16(vacc, vi.val[1], vi.val[1]); - uint32x2_t vacc_lo = vreinterpret_u32_s32(vget_low_s32(vacc)); - if (batch & (4 * sizeof(int16_t))) { - vst1_u32(output, vacc_lo); output += 2; - vacc_lo = vreinterpret_u32_s32(vget_high_s32(vacc)); - } - if (batch & (2 * sizeof(int16_t))) { - vst1_lane_u32(output, vacc_lo, 0); - } - } -} diff --git a/src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x8.c b/src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x8.c deleted file mode 100644 index 507b6e4dada..00000000000 --- a/src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x8.c +++ /dev/null @@ -1,61 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/cs16-vsquareabs/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include - -#include "xnnpack/vsquareabs.h" - - -void xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x8( - size_t batch, - const int16_t* input, - uint32_t* output) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % (sizeof(int16_t) * 2) == 0); - assert(input != NULL); - assert(output != NULL); - - for (; batch >= 16 * sizeof(int16_t); batch -= 16 * sizeof(int16_t)) { - const int16x4x2_t vi0 = vld2_s16(input); input += 8; - const int16x4x2_t vi1 = vld2_s16(input); input += 8; - - int32x4_t vacc0 = vmull_s16(vi0.val[0], vi0.val[0]); - int32x4_t vacc1 = vmull_s16(vi1.val[0], vi1.val[0]); - - vacc0 = vmlal_s16(vacc0, vi0.val[1], vi0.val[1]); - vacc1 = vmlal_s16(vacc1, vi1.val[1], vi1.val[1]); - - vst1q_u32(output, vreinterpretq_u32_s32(vacc0)); output += 4; - vst1q_u32(output, vreinterpretq_u32_s32(vacc1)); output += 4; - } - for (; batch >= 8 * sizeof(int16_t); batch -= 8 * sizeof(int16_t)) { - const int16x4x2_t vi = vld2_s16(input); input += 8; - int32x4_t vacc = vmull_s16(vi.val[0], vi.val[0]); - vacc = vmlal_s16(vacc, vi.val[1], vi.val[1]); - vst1q_u32(output, vreinterpretq_u32_s32(vacc)); output += 4; - } - if XNN_LIKELY(batch != 0) { - const int16x4x2_t vi = vld2_s16(input); - int32x4_t vacc = vmull_s16(vi.val[0], vi.val[0]); - vacc = vmlal_s16(vacc, vi.val[1], vi.val[1]); - uint32x2_t vacc_lo = vreinterpret_u32_s32(vget_low_s32(vacc)); - if (batch & (4 * sizeof(int16_t))) { - vst1_u32(output, vacc_lo); output += 2; - vacc_lo = vreinterpret_u32_s32(vget_high_s32(vacc)); - } - if (batch & (2 * sizeof(int16_t))) { - vst1_lane_u32(output, vacc_lo, 0); - } - } -} diff --git a/src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x1.c b/src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x1.c deleted file mode 100644 index 246849d9ba4..00000000000 --- a/src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x1.c +++ /dev/null @@ -1,39 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/cs16-vsquareabs/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/vsquareabs.h" - - -void xnn_cs16_vsquareabs_ukernel__scalar_x1( - size_t batch, - const int16_t* input, - uint32_t* output) -{ - assert(batch != 0); - assert(batch % (sizeof(int16_t) * 2) == 0); - assert(input != NULL); - assert(output != NULL); - - do { - const int32_t vr = (int32_t) input[0]; - const int32_t vi = (int32_t) input[1]; - input += 2; - - uint32_t vacc = (uint32_t) (vr * vr); - vacc += (uint32_t) (vi * vi); - - *output++ = vacc; - batch -= sizeof(int16_t) * 2; - } while (batch != 0); -} diff --git a/src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x2.c b/src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x2.c deleted file mode 100644 index 66969c8c285..00000000000 --- a/src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x2.c +++ /dev/null @@ -1,56 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/cs16-vsquareabs/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/vsquareabs.h" - - -void xnn_cs16_vsquareabs_ukernel__scalar_x2( - size_t batch, - const int16_t* input, - uint32_t* output) -{ - assert(batch != 0); - assert(batch % (sizeof(int16_t) * 2) == 0); - assert(input != NULL); - assert(output != NULL); - - for (; batch >= 4 * sizeof(int16_t); batch -= 4 * sizeof(int16_t)) { - const int32_t vr0 = (int32_t) input[0]; - const int32_t vi0 = (int32_t) input[1]; - const int32_t vr1 = (int32_t) input[2]; - const int32_t vi1 = (int32_t) input[3]; - input += 2 * 2; - - uint32_t vacc0 = (uint32_t) (vr0 * vr0); - uint32_t vacc1 = (uint32_t) (vr1 * vr1); - - vacc0 += (uint32_t) (vi0 * vi0); - vacc1 += (uint32_t) (vi1 * vi1); - - output[0] = vacc0; - output[1] = vacc1; - output += 2; - } - if XNN_LIKELY(batch != 0) { - assert(batch == 2 * sizeof(int16_t)); - - const int32_t vr = (int32_t) input[0]; - const int32_t vi = (int32_t) input[1]; - - uint32_t vacc = (uint32_t) (vr * vr); - vacc += (uint32_t) (vi * vi); - - *output = vacc; - } -} diff --git a/src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x3.c b/src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x3.c deleted file mode 100644 index 22535d0faa1..00000000000 --- a/src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x3.c +++ /dev/null @@ -1,63 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/cs16-vsquareabs/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/vsquareabs.h" - - -void xnn_cs16_vsquareabs_ukernel__scalar_x3( - size_t batch, - const int16_t* input, - uint32_t* output) -{ - assert(batch != 0); - assert(batch % (sizeof(int16_t) * 2) == 0); - assert(input != NULL); - assert(output != NULL); - - for (; batch >= 6 * sizeof(int16_t); batch -= 6 * sizeof(int16_t)) { - const int32_t vr0 = (int32_t) input[0]; - const int32_t vi0 = (int32_t) input[1]; - const int32_t vr1 = (int32_t) input[2]; - const int32_t vi1 = (int32_t) input[3]; - const int32_t vr2 = (int32_t) input[4]; - const int32_t vi2 = (int32_t) input[5]; - input += 3 * 2; - - uint32_t vacc0 = (uint32_t) (vr0 * vr0); - uint32_t vacc1 = (uint32_t) (vr1 * vr1); - uint32_t vacc2 = (uint32_t) (vr2 * vr2); - - vacc0 += (uint32_t) (vi0 * vi0); - vacc1 += (uint32_t) (vi1 * vi1); - vacc2 += (uint32_t) (vi2 * vi2); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output += 3; - } - if XNN_LIKELY(batch != 0) { - do { - const int32_t vr = (int32_t) input[0]; - const int32_t vi = (int32_t) input[1]; - input += 2; - - uint32_t vacc = (uint32_t) (vr * vr); - vacc += (uint32_t) (vi * vi); - - *output++ = vacc; - batch -= sizeof(int16_t) * 2; - } while (batch != 0); - } -} diff --git a/src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x4.c b/src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x4.c deleted file mode 100644 index 8561b14e2d6..00000000000 --- a/src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x4.c +++ /dev/null @@ -1,68 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/cs16-vsquareabs/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/vsquareabs.h" - - -void xnn_cs16_vsquareabs_ukernel__scalar_x4( - size_t batch, - const int16_t* input, - uint32_t* output) -{ - assert(batch != 0); - assert(batch % (sizeof(int16_t) * 2) == 0); - assert(input != NULL); - assert(output != NULL); - - for (; batch >= 8 * sizeof(int16_t); batch -= 8 * sizeof(int16_t)) { - const int32_t vr0 = (int32_t) input[0]; - const int32_t vi0 = (int32_t) input[1]; - const int32_t vr1 = (int32_t) input[2]; - const int32_t vi1 = (int32_t) input[3]; - const int32_t vr2 = (int32_t) input[4]; - const int32_t vi2 = (int32_t) input[5]; - const int32_t vr3 = (int32_t) input[6]; - const int32_t vi3 = (int32_t) input[7]; - input += 4 * 2; - - uint32_t vacc0 = (uint32_t) (vr0 * vr0); - uint32_t vacc1 = (uint32_t) (vr1 * vr1); - uint32_t vacc2 = (uint32_t) (vr2 * vr2); - uint32_t vacc3 = (uint32_t) (vr3 * vr3); - - vacc0 += (uint32_t) (vi0 * vi0); - vacc1 += (uint32_t) (vi1 * vi1); - vacc2 += (uint32_t) (vi2 * vi2); - vacc3 += (uint32_t) (vi3 * vi3); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output += 4; - } - if XNN_LIKELY(batch != 0) { - do { - const int32_t vr = (int32_t) input[0]; - const int32_t vi = (int32_t) input[1]; - input += 2; - - uint32_t vacc = (uint32_t) (vr * vr); - vacc += (uint32_t) (vi * vi); - - *output++ = vacc; - batch -= sizeof(int16_t) * 2; - } while (batch != 0); - } -} diff --git a/src/cs16-vsquareabs/hexagon.c.in b/src/cs16-vsquareabs/hexagon.c.in deleted file mode 100644 index c921c34671d..00000000000 --- a/src/cs16-vsquareabs/hexagon.c.in +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -$assert BATCH_TILE % 2 == 0 -$assert BATCH_TILE >= 2 -$SIMD_TILE = BATCH_TILE // 2 -#include -#include -#include - -#include -#include - -#include "xnnpack/vsquareabs.h" - - -void xnn_cs16_vsquareabs_ukernel__hexagon_x${BATCH_TILE}( - size_t batch, - const int16_t* input, - uint32_t* output) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % (sizeof(int16_t) * 2) == 0); - assert(input != NULL); - assert(output != NULL); - - const HEXAGON_Vect64* i = (const HEXAGON_Vect64*) input; - HEXAGON_Vect64* o = (HEXAGON_Vect64*) output; - $if BATCH_TILE > 2: - for (; batch >= ${BATCH_TILE * 2} * sizeof(int16_t); batch -= ${BATCH_TILE * 2} * sizeof(int16_t)) { - $for N in range(SIMD_TILE): - HEXAGON_Vect64 vacc${N} = *i++; - - $for N in range(SIMD_TILE): - vacc${N} = Q6_P_vdmpy_PP_sat(vacc${N}, vacc${N}); - - $for N in range(SIMD_TILE): - *o++ = vacc${N}; - } - for (; batch >= 4 * sizeof(int16_t); batch -= 4 * sizeof(int16_t)) { - HEXAGON_Vect64 vacc = *i++; - vacc = Q6_P_vdmpy_PP_sat(vacc, vacc); - *o++ = vacc; - } - if XNN_LIKELY(batch != 0) { - assert(batch == 2 * sizeof(int16_t)); - - const HEXAGON_Vect32 vi = *((const HEXAGON_Vect32*) i); - HEXAGON_Vect32 vacc = Q6_R_mpy_RlRl(vi, vi); - vacc = Q6_R_mpyacc_RhRh_sat(vacc, vi, vi); - *((HEXAGON_Vect32*) o) = vacc; - } -} diff --git a/src/cs16-vsquareabs/neon.c.in b/src/cs16-vsquareabs/neon.c.in deleted file mode 100644 index 6d6f003d3f3..00000000000 --- a/src/cs16-vsquareabs/neon.c.in +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -$assert BATCH_TILE % 4 == 0 -$assert BATCH_TILE >= 4 -$SIMD_TILE = BATCH_TILE // 4 -#include -#include -#include - -#include - -#include "xnnpack/vsquareabs.h" - - -void xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x${BATCH_TILE}( - size_t batch, - const int16_t* input, - uint32_t* output) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % (sizeof(int16_t) * 2) == 0); - assert(input != NULL); - assert(output != NULL); - - $if BATCH_TILE > 4: - for (; batch >= ${BATCH_TILE * 2} * sizeof(int16_t); batch -= ${BATCH_TILE * 2} * sizeof(int16_t)) { - $for N in range(SIMD_TILE): - const int16x4x2_t vi${N} = vld2_s16(input); input += 8; - - $for N in range(SIMD_TILE): - int32x4_t vacc${N} = vmull_s16(vi${N}.val[0], vi${N}.val[0]); - - $for N in range(SIMD_TILE): - vacc${N} = vmlal_s16(vacc${N}, vi${N}.val[1], vi${N}.val[1]); - - $for N in range(SIMD_TILE): - vst1q_u32(output, vreinterpretq_u32_s32(vacc${N})); output += 4; - } - for (; batch >= 8 * sizeof(int16_t); batch -= 8 * sizeof(int16_t)) { - const int16x4x2_t vi = vld2_s16(input); input += 8; - int32x4_t vacc = vmull_s16(vi.val[0], vi.val[0]); - vacc = vmlal_s16(vacc, vi.val[1], vi.val[1]); - vst1q_u32(output, vreinterpretq_u32_s32(vacc)); output += 4; - } - if XNN_LIKELY(batch != 0) { - const int16x4x2_t vi = vld2_s16(input); - int32x4_t vacc = vmull_s16(vi.val[0], vi.val[0]); - vacc = vmlal_s16(vacc, vi.val[1], vi.val[1]); - uint32x2_t vacc_lo = vreinterpret_u32_s32(vget_low_s32(vacc)); - if (batch & (4 * sizeof(int16_t))) { - vst1_u32(output, vacc_lo); output += 2; - vacc_lo = vreinterpret_u32_s32(vget_high_s32(vacc)); - } - if (batch & (2 * sizeof(int16_t))) { - vst1_lane_u32(output, vacc_lo, 0); - } - } -} diff --git a/src/cs16-vsquareabs/scalar.c.in b/src/cs16-vsquareabs/scalar.c.in deleted file mode 100644 index 40734708ce7..00000000000 --- a/src/cs16-vsquareabs/scalar.c.in +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license 
found in the -// LICENSE file in the root directory of this source tree. - -$assert BATCH_TILE >= 1 -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/vsquareabs.h" - - -void xnn_cs16_vsquareabs_ukernel__scalar_x${BATCH_TILE}( - size_t batch, - const int16_t* input, - uint32_t* output) -{ - assert(batch != 0); - assert(batch % (sizeof(int16_t) * 2) == 0); - assert(input != NULL); - assert(output != NULL); - - $if BATCH_TILE == 1: - do { - const int32_t vr = (int32_t) input[0]; - const int32_t vi = (int32_t) input[1]; - input += 2; - - uint32_t vacc = (uint32_t) (vr * vr); - vacc += (uint32_t) (vi * vi); - - *output++ = vacc; - batch -= sizeof(int16_t) * 2; - } while (batch != 0); - $else: - for (; batch >= ${BATCH_TILE * 2} * sizeof(int16_t); batch -= ${BATCH_TILE * 2} * sizeof(int16_t)) { - $for C in range(BATCH_TILE): - const int32_t vr${C} = (int32_t) input[${C * 2}]; - const int32_t vi${C} = (int32_t) input[${C * 2 + 1}]; - input += ${BATCH_TILE} * 2; - - $for C in range(BATCH_TILE): - uint32_t vacc${C} = (uint32_t) (vr${C} * vr${C}); - - $for C in range(BATCH_TILE): - vacc${C} += (uint32_t) (vi${C} * vi${C}); - - $for C in range(BATCH_TILE): - output[${C}] = vacc${C}; - output += ${BATCH_TILE}; - } - $if BATCH_TILE == 2: - if XNN_LIKELY(batch != 0) { - assert(batch == 2 * sizeof(int16_t)); - - const int32_t vr = (int32_t) input[0]; - const int32_t vi = (int32_t) input[1]; - - uint32_t vacc = (uint32_t) (vr * vr); - vacc += (uint32_t) (vi * vi); - - *output = vacc; - } - $else: - if XNN_LIKELY(batch != 0) { - do { - const int32_t vr = (int32_t) input[0]; - const int32_t vi = (int32_t) input[1]; - input += 2; - - uint32_t vacc = (uint32_t) (vr * vr); - vacc += (uint32_t) (vi * vi); - - *output++ = vacc; - batch -= sizeof(int16_t) * 2; - } while (batch != 0); - } -} diff --git a/src/i16-vlshift/gen/i16-vlshift-neon-u16.c b/src/i16-vlshift/gen/i16-vlshift-neon-u16.c deleted file mode 100644 index 485e00813e6..00000000000 --- 
a/src/i16-vlshift/gen/i16-vlshift-neon-u16.c +++ /dev/null @@ -1,69 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/i16-vlshift/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/vlshift.h" - - -void xnn_i16_vlshift_ukernel__neon_u16( - size_t batch, - const uint16_t* input, - uint16_t* output, - uint32_t shift) -{ - assert(batch != 0); - assert(input != NULL); - assert(output != NULL); - assert(shift < 16); - - const int16x8_t vshift = vdupq_n_s16((int16_t) shift); - for (; batch >= 16; batch -= 16) { - const uint16x8_t vi0 = vld1q_u16(input); input += 8; - const uint16x8_t vi1 = vld1q_u16(input); input += 8; - - const uint16x8_t vout0 = vshlq_u16(vi0, vshift); - const uint16x8_t vout1 = vshlq_u16(vi1, vshift); - - vst1q_u16(output, vout0); output += 8; - vst1q_u16(output, vout1); output += 8; - } - - // Remainder of full vectors - for (; batch >= 8; batch -= 8) { - const uint16x8_t vi = vld1q_u16(input); input += 8; - const uint16x8_t vout = vshlq_u16(vi, vshift); - vst1q_u16(output, vout); output += 8; - } - - // Remainder of 1 to 7 batch - if XNN_UNLIKELY(batch != 0) { - const uint16x8_t vi = vld1q_u16(input); - - const uint16x8_t vout = vshlq_u16(vi, vshift); - uint16x4_t vout_lo = vget_low_u16(vout); - - if (batch & 4) { - vst1_u16(output, vout_lo); output += 4; - vout_lo = vget_high_u16(vout); - } - if (batch & 2) { - vst1_lane_u32((void*) output, vreinterpret_u32_u16(vout_lo), 0); output += 2; - vout_lo = vext_u16(vout_lo, vout_lo, 2); - } - if (batch & 1){ - vst1_lane_u16(output, vout_lo, 0); - } - } -} diff --git a/src/i16-vlshift/gen/i16-vlshift-neon-u24.c b/src/i16-vlshift/gen/i16-vlshift-neon-u24.c deleted file mode 100644 index e5661cd345d..00000000000 --- 
a/src/i16-vlshift/gen/i16-vlshift-neon-u24.c +++ /dev/null @@ -1,72 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/i16-vlshift/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/vlshift.h" - - -void xnn_i16_vlshift_ukernel__neon_u24( - size_t batch, - const uint16_t* input, - uint16_t* output, - uint32_t shift) -{ - assert(batch != 0); - assert(input != NULL); - assert(output != NULL); - assert(shift < 16); - - const int16x8_t vshift = vdupq_n_s16((int16_t) shift); - for (; batch >= 24; batch -= 24) { - const uint16x8_t vi0 = vld1q_u16(input); input += 8; - const uint16x8_t vi1 = vld1q_u16(input); input += 8; - const uint16x8_t vi2 = vld1q_u16(input); input += 8; - - const uint16x8_t vout0 = vshlq_u16(vi0, vshift); - const uint16x8_t vout1 = vshlq_u16(vi1, vshift); - const uint16x8_t vout2 = vshlq_u16(vi2, vshift); - - vst1q_u16(output, vout0); output += 8; - vst1q_u16(output, vout1); output += 8; - vst1q_u16(output, vout2); output += 8; - } - - // Remainder of full vectors - for (; batch >= 8; batch -= 8) { - const uint16x8_t vi = vld1q_u16(input); input += 8; - const uint16x8_t vout = vshlq_u16(vi, vshift); - vst1q_u16(output, vout); output += 8; - } - - // Remainder of 1 to 7 batch - if XNN_UNLIKELY(batch != 0) { - const uint16x8_t vi = vld1q_u16(input); - - const uint16x8_t vout = vshlq_u16(vi, vshift); - uint16x4_t vout_lo = vget_low_u16(vout); - - if (batch & 4) { - vst1_u16(output, vout_lo); output += 4; - vout_lo = vget_high_u16(vout); - } - if (batch & 2) { - vst1_lane_u32((void*) output, vreinterpret_u32_u16(vout_lo), 0); output += 2; - vout_lo = vext_u16(vout_lo, vout_lo, 2); - } - if (batch & 1){ - vst1_lane_u16(output, vout_lo, 0); - } - } -} diff --git 
a/src/i16-vlshift/gen/i16-vlshift-neon-u32.c b/src/i16-vlshift/gen/i16-vlshift-neon-u32.c deleted file mode 100644 index 93602fbb1b9..00000000000 --- a/src/i16-vlshift/gen/i16-vlshift-neon-u32.c +++ /dev/null @@ -1,75 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/i16-vlshift/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/vlshift.h" - - -void xnn_i16_vlshift_ukernel__neon_u32( - size_t batch, - const uint16_t* input, - uint16_t* output, - uint32_t shift) -{ - assert(batch != 0); - assert(input != NULL); - assert(output != NULL); - assert(shift < 16); - - const int16x8_t vshift = vdupq_n_s16((int16_t) shift); - for (; batch >= 32; batch -= 32) { - const uint16x8_t vi0 = vld1q_u16(input); input += 8; - const uint16x8_t vi1 = vld1q_u16(input); input += 8; - const uint16x8_t vi2 = vld1q_u16(input); input += 8; - const uint16x8_t vi3 = vld1q_u16(input); input += 8; - - const uint16x8_t vout0 = vshlq_u16(vi0, vshift); - const uint16x8_t vout1 = vshlq_u16(vi1, vshift); - const uint16x8_t vout2 = vshlq_u16(vi2, vshift); - const uint16x8_t vout3 = vshlq_u16(vi3, vshift); - - vst1q_u16(output, vout0); output += 8; - vst1q_u16(output, vout1); output += 8; - vst1q_u16(output, vout2); output += 8; - vst1q_u16(output, vout3); output += 8; - } - - // Remainder of full vectors - for (; batch >= 8; batch -= 8) { - const uint16x8_t vi = vld1q_u16(input); input += 8; - const uint16x8_t vout = vshlq_u16(vi, vshift); - vst1q_u16(output, vout); output += 8; - } - - // Remainder of 1 to 7 batch - if XNN_UNLIKELY(batch != 0) { - const uint16x8_t vi = vld1q_u16(input); - - const uint16x8_t vout = vshlq_u16(vi, vshift); - uint16x4_t vout_lo = vget_low_u16(vout); - - if (batch & 4) { - vst1_u16(output, vout_lo); output += 4; 
- vout_lo = vget_high_u16(vout); - } - if (batch & 2) { - vst1_lane_u32((void*) output, vreinterpret_u32_u16(vout_lo), 0); output += 2; - vout_lo = vext_u16(vout_lo, vout_lo, 2); - } - if (batch & 1){ - vst1_lane_u16(output, vout_lo, 0); - } - } -} diff --git a/src/i16-vlshift/gen/i16-vlshift-neon-u8.c b/src/i16-vlshift/gen/i16-vlshift-neon-u8.c deleted file mode 100644 index b7dc097e867..00000000000 --- a/src/i16-vlshift/gen/i16-vlshift-neon-u8.c +++ /dev/null @@ -1,59 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/i16-vlshift/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/vlshift.h" - - -void xnn_i16_vlshift_ukernel__neon_u8( - size_t batch, - const uint16_t* input, - uint16_t* output, - uint32_t shift) -{ - assert(batch != 0); - assert(input != NULL); - assert(output != NULL); - assert(shift < 16); - - const int16x8_t vshift = vdupq_n_s16((int16_t) shift); - - // Remainder of full vectors - for (; batch >= 8; batch -= 8) { - const uint16x8_t vi = vld1q_u16(input); input += 8; - const uint16x8_t vout = vshlq_u16(vi, vshift); - vst1q_u16(output, vout); output += 8; - } - - // Remainder of 1 to 7 batch - if XNN_UNLIKELY(batch != 0) { - const uint16x8_t vi = vld1q_u16(input); - - const uint16x8_t vout = vshlq_u16(vi, vshift); - uint16x4_t vout_lo = vget_low_u16(vout); - - if (batch & 4) { - vst1_u16(output, vout_lo); output += 4; - vout_lo = vget_high_u16(vout); - } - if (batch & 2) { - vst1_lane_u32((void*) output, vreinterpret_u32_u16(vout_lo), 0); output += 2; - vout_lo = vext_u16(vout_lo, vout_lo, 2); - } - if (batch & 1){ - vst1_lane_u16(output, vout_lo, 0); - } - } -} diff --git a/src/i16-vlshift/gen/i16-vlshift-scalar-u1.c b/src/i16-vlshift/gen/i16-vlshift-scalar-u1.c deleted file mode 100644 
index 579268e0991..00000000000 --- a/src/i16-vlshift/gen/i16-vlshift-scalar-u1.c +++ /dev/null @@ -1,36 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/i16-vlshift/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/vlshift.h" - - -void xnn_i16_vlshift_ukernel__scalar_u1( - size_t batch, - const uint16_t* input, - uint16_t* output, - uint32_t shift) -{ - assert(batch != 0); - assert(input != NULL); - assert(output != NULL); - assert(shift < 16); - - if XNN_UNLIKELY(batch != 0) { - do { - const uint16_t vi = *input++; - const uint16_t vout = vi << shift; - *output++ = vout; - } while (--batch != 0); - } -} diff --git a/src/i16-vlshift/gen/i16-vlshift-scalar-u2.c b/src/i16-vlshift/gen/i16-vlshift-scalar-u2.c deleted file mode 100644 index f667ed95cf9..00000000000 --- a/src/i16-vlshift/gen/i16-vlshift-scalar-u2.c +++ /dev/null @@ -1,48 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/i16-vlshift/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/vlshift.h" - - -void xnn_i16_vlshift_ukernel__scalar_u2( - size_t batch, - const uint16_t* input, - uint16_t* output, - uint32_t shift) -{ - assert(batch != 0); - assert(input != NULL); - assert(output != NULL); - assert(shift < 16); - - for (; batch >= 2; batch -= 2) { - const uint16_t vi0 = input[0]; - const uint16_t vi1 = input[1]; - input += 2; - - const uint16_t vout0 = vi0 << shift; - const uint16_t vout1 = vi1 << shift; - - output[0] = vout0; - output[1] = vout1; - output += 2; - } - if XNN_UNLIKELY(batch != 0) { - do { - const uint16_t vi = *input++; - const uint16_t vout = vi << shift; - *output++ = vout; - } while (--batch != 0); - } -} diff --git a/src/i16-vlshift/gen/i16-vlshift-scalar-u3.c b/src/i16-vlshift/gen/i16-vlshift-scalar-u3.c deleted file mode 100644 index 8830936f3cb..00000000000 --- a/src/i16-vlshift/gen/i16-vlshift-scalar-u3.c +++ /dev/null @@ -1,51 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/i16-vlshift/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/vlshift.h" - - -void xnn_i16_vlshift_ukernel__scalar_u3( - size_t batch, - const uint16_t* input, - uint16_t* output, - uint32_t shift) -{ - assert(batch != 0); - assert(input != NULL); - assert(output != NULL); - assert(shift < 16); - - for (; batch >= 3; batch -= 3) { - const uint16_t vi0 = input[0]; - const uint16_t vi1 = input[1]; - const uint16_t vi2 = input[2]; - input += 3; - - const uint16_t vout0 = vi0 << shift; - const uint16_t vout1 = vi1 << shift; - const uint16_t vout2 = vi2 << shift; - - output[0] = vout0; - output[1] = vout1; - output[2] = vout2; - output += 3; - } - if XNN_UNLIKELY(batch != 0) { - do { - const uint16_t vi = *input++; - const uint16_t vout = vi << shift; - *output++ = vout; - } while (--batch != 0); - } -} diff --git a/src/i16-vlshift/gen/i16-vlshift-scalar-u4.c b/src/i16-vlshift/gen/i16-vlshift-scalar-u4.c deleted file mode 100644 index d6a7cb4c2c1..00000000000 --- a/src/i16-vlshift/gen/i16-vlshift-scalar-u4.c +++ /dev/null @@ -1,54 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/i16-vlshift/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/vlshift.h" - - -void xnn_i16_vlshift_ukernel__scalar_u4( - size_t batch, - const uint16_t* input, - uint16_t* output, - uint32_t shift) -{ - assert(batch != 0); - assert(input != NULL); - assert(output != NULL); - assert(shift < 16); - - for (; batch >= 4; batch -= 4) { - const uint16_t vi0 = input[0]; - const uint16_t vi1 = input[1]; - const uint16_t vi2 = input[2]; - const uint16_t vi3 = input[3]; - input += 4; - - const uint16_t vout0 = vi0 << shift; - const uint16_t vout1 = vi1 << shift; - const uint16_t vout2 = vi2 << shift; - const uint16_t vout3 = vi3 << shift; - - output[0] = vout0; - output[1] = vout1; - output[2] = vout2; - output[3] = vout3; - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - do { - const uint16_t vi = *input++; - const uint16_t vout = vi << shift; - *output++ = vout; - } while (--batch != 0); - } -} diff --git a/src/i16-vlshift/neon.c.in b/src/i16-vlshift/neon.c.in deleted file mode 100644 index c370922ebc9..00000000000 --- a/src/i16-vlshift/neon.c.in +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -$assert BATCH_TILE % 8 == 0 -$assert BATCH_TILE >= 8 -$SIMD_TILE = BATCH_TILE // 8 -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/vlshift.h" - - -void xnn_i16_vlshift_ukernel__neon_u${BATCH_TILE}( - size_t batch, - const uint16_t* input, - uint16_t* output, - uint32_t shift) -{ - assert(batch != 0); - assert(input != NULL); - assert(output != NULL); - assert(shift < 16); - - const int16x8_t vshift = vdupq_n_s16((int16_t) shift); - $if BATCH_TILE > 8: - for (; batch >= ${BATCH_TILE}; batch -= ${BATCH_TILE}) { - $for N in range(SIMD_TILE): - const uint16x8_t vi${N} = vld1q_u16(input); input += 8; - - $for N in range(SIMD_TILE): - const uint16x8_t vout${N} = vshlq_u16(vi${N}, vshift); - - $for N in range(SIMD_TILE): - vst1q_u16(output, vout${N}); output += 8; - } - - // Remainder of full vectors - for (; batch >= 8; batch -= 8) { - const uint16x8_t vi = vld1q_u16(input); input += 8; - const uint16x8_t vout = vshlq_u16(vi, vshift); - vst1q_u16(output, vout); output += 8; - } - - // Remainder of 1 to 7 batch - if XNN_UNLIKELY(batch != 0) { - const uint16x8_t vi = vld1q_u16(input); - - const uint16x8_t vout = vshlq_u16(vi, vshift); - uint16x4_t vout_lo = vget_low_u16(vout); - - if (batch & 4) { - vst1_u16(output, vout_lo); output += 4; - vout_lo = vget_high_u16(vout); - } - if (batch & 2) { - vst1_lane_u32((void*) output, vreinterpret_u32_u16(vout_lo), 0); output += 2; - vout_lo = vext_u16(vout_lo, vout_lo, 2); - } - if (batch & 1){ - vst1_lane_u16(output, vout_lo, 0); - } - } -} diff --git a/src/i16-vlshift/scalar.c.in b/src/i16-vlshift/scalar.c.in deleted file mode 100644 index 40e26d2cad4..00000000000 --- a/src/i16-vlshift/scalar.c.in +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -$assert BATCH_TILE >= 1 -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/vlshift.h" - - -void xnn_i16_vlshift_ukernel__scalar_u${BATCH_TILE}( - size_t batch, - const uint16_t* input, - uint16_t* output, - uint32_t shift) -{ - assert(batch != 0); - assert(input != NULL); - assert(output != NULL); - assert(shift < 16); - - $if BATCH_TILE > 1: - for (; batch >= ${BATCH_TILE}; batch -= ${BATCH_TILE}) { - $for C in range(BATCH_TILE): - const uint16_t vi${C} = input[${C}]; - input += ${BATCH_TILE}; - - $for C in range(BATCH_TILE): - const uint16_t vout${C} = vi${C} << shift; - - $for C in range(BATCH_TILE): - output[${C}] = vout${C}; - output += ${BATCH_TILE}; - } - if XNN_UNLIKELY(batch != 0) { - do { - const uint16_t vi = *input++; - const uint16_t vout = vi << shift; - *output++ = vout; - } while (--batch != 0); - } -} diff --git a/src/s16-rmaxabs/gen/s16-rmaxabs-neon-x16.c b/src/s16-rmaxabs/gen/s16-rmaxabs-neon-x16.c deleted file mode 100644 index 9e55f46537b..00000000000 --- a/src/s16-rmaxabs/gen/s16-rmaxabs-neon-x16.c +++ /dev/null @@ -1,66 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-rmaxabs/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/rmaxabs.h" - - -void xnn_s16_rmaxabs_ukernel__neon_x16( - size_t batch, - const int16_t* input, - uint16_t* output) -{ - assert(batch != 0); - assert(batch % sizeof(int16_t) == 0); - assert(input != NULL); - assert(output != NULL); - - uint16x8_t vmax0 = vdupq_n_u16(0); - uint16x8_t vmax1 = vdupq_n_u16(0); - for (; batch >= 16 * sizeof(int16_t); batch -= 16 * sizeof(int16_t)) { - const int16x8_t vi0 = vld1q_s16(input); input += 8; - const int16x8_t vi1 = vld1q_s16(input); input += 8; - - const uint16x8_t vabs0 = vreinterpretq_u16_s16(vabsq_s16(vi0)); - const uint16x8_t vabs1 = vreinterpretq_u16_s16(vabsq_s16(vi1)); - - vmax0 = vmaxq_u16(vmax0, vabs0); - vmax1 = vmaxq_u16(vmax1, vabs1); - } - - vmax0 = vmaxq_u16(vmax0, vmax1); - for (; batch >= 8 * sizeof(int16_t); batch -= 8 * sizeof(int16_t)) { - const int16x8_t vi = vld1q_s16(input); input += 8; - const uint16x8_t vabs = vreinterpretq_u16_s16(vabsq_s16(vi)); - vmax0 = vmaxq_u16(vmax0, vabs); - } - if (batch != 0) { - do { - const int16x8_t vi = vld1q_dup_s16(input); input += 1; - const uint16x8_t vabs = vreinterpretq_u16_s16(vabsq_s16(vi)); - vmax0 = vmaxq_u16(vmax0, vabs); - batch -= sizeof(int16_t); - } while (batch != 0); - } - - #if XNN_ARCH_ARM64 - *output = vmaxvq_u16(vmax0); - #else - uint16x4_t vmax_lo = vmax_u16(vget_low_u16(vmax0), vget_high_u16(vmax0)); - vmax_lo = vpmax_u16(vmax_lo, vmax_lo); - vmax_lo = vpmax_u16(vmax_lo, vmax_lo); - vst1_lane_u16(output, vmax_lo, 0); - #endif -} diff --git a/src/s16-rmaxabs/gen/s16-rmaxabs-neon-x24.c b/src/s16-rmaxabs/gen/s16-rmaxabs-neon-x24.c deleted file mode 100644 index 680709fe4bc..00000000000 --- a/src/s16-rmaxabs/gen/s16-rmaxabs-neon-x24.c +++ /dev/null @@ -1,71 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/s16-rmaxabs/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/rmaxabs.h" - - -void xnn_s16_rmaxabs_ukernel__neon_x24( - size_t batch, - const int16_t* input, - uint16_t* output) -{ - assert(batch != 0); - assert(batch % sizeof(int16_t) == 0); - assert(input != NULL); - assert(output != NULL); - - uint16x8_t vmax0 = vdupq_n_u16(0); - uint16x8_t vmax1 = vdupq_n_u16(0); - uint16x8_t vmax2 = vdupq_n_u16(0); - for (; batch >= 24 * sizeof(int16_t); batch -= 24 * sizeof(int16_t)) { - const int16x8_t vi0 = vld1q_s16(input); input += 8; - const int16x8_t vi1 = vld1q_s16(input); input += 8; - const int16x8_t vi2 = vld1q_s16(input); input += 8; - - const uint16x8_t vabs0 = vreinterpretq_u16_s16(vabsq_s16(vi0)); - const uint16x8_t vabs1 = vreinterpretq_u16_s16(vabsq_s16(vi1)); - const uint16x8_t vabs2 = vreinterpretq_u16_s16(vabsq_s16(vi2)); - - vmax0 = vmaxq_u16(vmax0, vabs0); - vmax1 = vmaxq_u16(vmax1, vabs1); - vmax2 = vmaxq_u16(vmax2, vabs2); - } - - vmax0 = vmaxq_u16(vmax0, vmax1); - vmax0 = vmaxq_u16(vmax0, vmax2); - for (; batch >= 8 * sizeof(int16_t); batch -= 8 * sizeof(int16_t)) { - const int16x8_t vi = vld1q_s16(input); input += 8; - const uint16x8_t vabs = vreinterpretq_u16_s16(vabsq_s16(vi)); - vmax0 = vmaxq_u16(vmax0, vabs); - } - if (batch != 0) { - do { - const int16x8_t vi = vld1q_dup_s16(input); input += 1; - const uint16x8_t vabs = vreinterpretq_u16_s16(vabsq_s16(vi)); - vmax0 = vmaxq_u16(vmax0, vabs); - batch -= sizeof(int16_t); - } while (batch != 0); - } - - #if XNN_ARCH_ARM64 - *output = vmaxvq_u16(vmax0); - #else - uint16x4_t vmax_lo = vmax_u16(vget_low_u16(vmax0), vget_high_u16(vmax0)); - vmax_lo = vpmax_u16(vmax_lo, vmax_lo); - vmax_lo = vpmax_u16(vmax_lo, vmax_lo); - 
vst1_lane_u16(output, vmax_lo, 0); - #endif -} diff --git a/src/s16-rmaxabs/gen/s16-rmaxabs-neon-x32.c b/src/s16-rmaxabs/gen/s16-rmaxabs-neon-x32.c deleted file mode 100644 index 61b86f760df..00000000000 --- a/src/s16-rmaxabs/gen/s16-rmaxabs-neon-x32.c +++ /dev/null @@ -1,76 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-rmaxabs/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/rmaxabs.h" - - -void xnn_s16_rmaxabs_ukernel__neon_x32( - size_t batch, - const int16_t* input, - uint16_t* output) -{ - assert(batch != 0); - assert(batch % sizeof(int16_t) == 0); - assert(input != NULL); - assert(output != NULL); - - uint16x8_t vmax0 = vdupq_n_u16(0); - uint16x8_t vmax1 = vdupq_n_u16(0); - uint16x8_t vmax2 = vdupq_n_u16(0); - uint16x8_t vmax3 = vdupq_n_u16(0); - for (; batch >= 32 * sizeof(int16_t); batch -= 32 * sizeof(int16_t)) { - const int16x8_t vi0 = vld1q_s16(input); input += 8; - const int16x8_t vi1 = vld1q_s16(input); input += 8; - const int16x8_t vi2 = vld1q_s16(input); input += 8; - const int16x8_t vi3 = vld1q_s16(input); input += 8; - - const uint16x8_t vabs0 = vreinterpretq_u16_s16(vabsq_s16(vi0)); - const uint16x8_t vabs1 = vreinterpretq_u16_s16(vabsq_s16(vi1)); - const uint16x8_t vabs2 = vreinterpretq_u16_s16(vabsq_s16(vi2)); - const uint16x8_t vabs3 = vreinterpretq_u16_s16(vabsq_s16(vi3)); - - vmax0 = vmaxq_u16(vmax0, vabs0); - vmax1 = vmaxq_u16(vmax1, vabs1); - vmax2 = vmaxq_u16(vmax2, vabs2); - vmax3 = vmaxq_u16(vmax3, vabs3); - } - - vmax0 = vmaxq_u16(vmax0, vmax1); - vmax2 = vmaxq_u16(vmax2, vmax3); - vmax0 = vmaxq_u16(vmax0, vmax2); - for (; batch >= 8 * sizeof(int16_t); batch -= 8 * sizeof(int16_t)) { - const int16x8_t vi = vld1q_s16(input); input += 8; - const uint16x8_t vabs = 
vreinterpretq_u16_s16(vabsq_s16(vi)); - vmax0 = vmaxq_u16(vmax0, vabs); - } - if (batch != 0) { - do { - const int16x8_t vi = vld1q_dup_s16(input); input += 1; - const uint16x8_t vabs = vreinterpretq_u16_s16(vabsq_s16(vi)); - vmax0 = vmaxq_u16(vmax0, vabs); - batch -= sizeof(int16_t); - } while (batch != 0); - } - - #if XNN_ARCH_ARM64 - *output = vmaxvq_u16(vmax0); - #else - uint16x4_t vmax_lo = vmax_u16(vget_low_u16(vmax0), vget_high_u16(vmax0)); - vmax_lo = vpmax_u16(vmax_lo, vmax_lo); - vmax_lo = vpmax_u16(vmax_lo, vmax_lo); - vst1_lane_u16(output, vmax_lo, 0); - #endif -} diff --git a/src/s16-rmaxabs/gen/s16-rmaxabs-neon-x8.c b/src/s16-rmaxabs/gen/s16-rmaxabs-neon-x8.c deleted file mode 100644 index c6d18676496..00000000000 --- a/src/s16-rmaxabs/gen/s16-rmaxabs-neon-x8.c +++ /dev/null @@ -1,53 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-rmaxabs/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/rmaxabs.h" - - -void xnn_s16_rmaxabs_ukernel__neon_x8( - size_t batch, - const int16_t* input, - uint16_t* output) -{ - assert(batch != 0); - assert(batch % sizeof(int16_t) == 0); - assert(input != NULL); - assert(output != NULL); - - uint16x8_t vmax0 = vdupq_n_u16(0); - for (; batch >= 8 * sizeof(int16_t); batch -= 8 * sizeof(int16_t)) { - const int16x8_t vi = vld1q_s16(input); input += 8; - const uint16x8_t vabs = vreinterpretq_u16_s16(vabsq_s16(vi)); - vmax0 = vmaxq_u16(vmax0, vabs); - } - if (batch != 0) { - do { - const int16x8_t vi = vld1q_dup_s16(input); input += 1; - const uint16x8_t vabs = vreinterpretq_u16_s16(vabsq_s16(vi)); - vmax0 = vmaxq_u16(vmax0, vabs); - batch -= sizeof(int16_t); - } while (batch != 0); - } - - #if XNN_ARCH_ARM64 - *output = vmaxvq_u16(vmax0); - #else - uint16x4_t vmax_lo = vmax_u16(vget_low_u16(vmax0), vget_high_u16(vmax0)); - vmax_lo = vpmax_u16(vmax_lo, vmax_lo); - vmax_lo = vpmax_u16(vmax_lo, vmax_lo); - vst1_lane_u16(output, vmax_lo, 0); - #endif -} diff --git a/src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x1.c b/src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x1.c deleted file mode 100644 index 8edb079f0d3..00000000000 --- a/src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x1.c +++ /dev/null @@ -1,37 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-rmaxabs/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/rmaxabs.h" - - -void xnn_s16_rmaxabs_ukernel__scalar_x1( - size_t batch, - const int16_t* input, - uint16_t* output) -{ - assert(batch != 0); - assert(batch % sizeof(int16_t) == 0); - assert(input != NULL); - assert(output != NULL); - - uint32_t vmax0 = 0; - - do { - const int32_t vi = (int32_t) *input++; - const uint32_t vabs = math_abs_s32(vi); - vmax0 = math_max_u32(vmax0, vabs); - batch -= sizeof(int16_t); - } while (batch != 0); - *output = (uint16_t) vmax0; -} diff --git a/src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x2.c b/src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x2.c deleted file mode 100644 index 280963ec04f..00000000000 --- a/src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x2.c +++ /dev/null @@ -1,52 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-rmaxabs/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/rmaxabs.h" - - -void xnn_s16_rmaxabs_ukernel__scalar_x2( - size_t batch, - const int16_t* input, - uint16_t* output) -{ - assert(batch != 0); - assert(batch % sizeof(int16_t) == 0); - assert(input != NULL); - assert(output != NULL); - - uint32_t vmax0 = 0; - uint32_t vmax1 = 0; - - for (; batch >= 2 * sizeof(int16_t); batch -= 2 * sizeof(int16_t)) { - const int32_t vi0 = (int32_t) input[0]; - const int32_t vi1 = (int32_t) input[1]; - input += 2; - - const uint32_t vabs0 = math_abs_s32(vi0); - const uint32_t vabs1 = math_abs_s32(vi1); - - vmax0 = math_max_u32(vmax0, vabs0); - vmax1 = math_max_u32(vmax1, vabs1); - } - - vmax0 = math_max_u32(vmax0, vmax1); - - if (batch != 0) { - assert(batch == sizeof(int16_t)); - const int32_t vi = (int32_t) *input; - const uint32_t vabs = math_abs_s32(vi); - vmax0 = math_max_u32(vmax0, vabs); - } - *output = (uint16_t) vmax0; -} diff --git a/src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x3.c b/src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x3.c deleted file mode 100644 index 9578447435b..00000000000 --- a/src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x3.c +++ /dev/null @@ -1,59 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-rmaxabs/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/rmaxabs.h" - - -void xnn_s16_rmaxabs_ukernel__scalar_x3( - size_t batch, - const int16_t* input, - uint16_t* output) -{ - assert(batch != 0); - assert(batch % sizeof(int16_t) == 0); - assert(input != NULL); - assert(output != NULL); - - uint32_t vmax0 = 0; - uint32_t vmax1 = 0; - uint32_t vmax2 = 0; - - for (; batch >= 3 * sizeof(int16_t); batch -= 3 * sizeof(int16_t)) { - const int32_t vi0 = (int32_t) input[0]; - const int32_t vi1 = (int32_t) input[1]; - const int32_t vi2 = (int32_t) input[2]; - input += 3; - - const uint32_t vabs0 = math_abs_s32(vi0); - const uint32_t vabs1 = math_abs_s32(vi1); - const uint32_t vabs2 = math_abs_s32(vi2); - - vmax0 = math_max_u32(vmax0, vabs0); - vmax1 = math_max_u32(vmax1, vabs1); - vmax2 = math_max_u32(vmax2, vabs2); - } - - vmax0 = math_max_u32(vmax0, vmax1); - vmax0 = math_max_u32(vmax0, vmax2); - - if (batch != 0) { - do { - const int32_t vi = (int32_t) *input++; - const uint32_t vabs = math_abs_s32(vi); - vmax0 = math_max_u32(vmax0, vabs); - batch -= sizeof(int16_t); - } while (batch != 0); - } - *output = (uint16_t) vmax0; -} diff --git a/src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x4.c b/src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x4.c deleted file mode 100644 index 97237f0cf54..00000000000 --- a/src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x4.c +++ /dev/null @@ -1,64 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-rmaxabs/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/rmaxabs.h" - - -void xnn_s16_rmaxabs_ukernel__scalar_x4( - size_t batch, - const int16_t* input, - uint16_t* output) -{ - assert(batch != 0); - assert(batch % sizeof(int16_t) == 0); - assert(input != NULL); - assert(output != NULL); - - uint32_t vmax0 = 0; - uint32_t vmax1 = 0; - uint32_t vmax2 = 0; - uint32_t vmax3 = 0; - - for (; batch >= 4 * sizeof(int16_t); batch -= 4 * sizeof(int16_t)) { - const int32_t vi0 = (int32_t) input[0]; - const int32_t vi1 = (int32_t) input[1]; - const int32_t vi2 = (int32_t) input[2]; - const int32_t vi3 = (int32_t) input[3]; - input += 4; - - const uint32_t vabs0 = math_abs_s32(vi0); - const uint32_t vabs1 = math_abs_s32(vi1); - const uint32_t vabs2 = math_abs_s32(vi2); - const uint32_t vabs3 = math_abs_s32(vi3); - - vmax0 = math_max_u32(vmax0, vabs0); - vmax1 = math_max_u32(vmax1, vabs1); - vmax2 = math_max_u32(vmax2, vabs2); - vmax3 = math_max_u32(vmax3, vabs3); - } - - vmax0 = math_max_u32(vmax0, vmax1); - vmax2 = math_max_u32(vmax2, vmax3); - vmax0 = math_max_u32(vmax0, vmax2); - - if (batch != 0) { - do { - const int32_t vi = (int32_t) *input++; - const uint32_t vabs = math_abs_s32(vi); - vmax0 = math_max_u32(vmax0, vabs); - batch -= sizeof(int16_t); - } while (batch != 0); - } - *output = (uint16_t) vmax0; -} diff --git a/src/s16-rmaxabs/neon.c.in b/src/s16-rmaxabs/neon.c.in deleted file mode 100644 index 5c4506e7c07..00000000000 --- a/src/s16-rmaxabs/neon.c.in +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -$assert BATCH_TILE % 8 == 0 -$assert BATCH_TILE >= 8 -$SIMD_TILE = BATCH_TILE // 8 -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/rmaxabs.h" - - -void xnn_s16_rmaxabs_ukernel__neon_x${BATCH_TILE}( - size_t batch, - const int16_t* input, - uint16_t* output) -{ - assert(batch != 0); - assert(batch % sizeof(int16_t) == 0); - assert(input != NULL); - assert(output != NULL); - - $for N in range(SIMD_TILE): - uint16x8_t vmax${N} = vdupq_n_u16(0); - $if BATCH_TILE > 8: - for (; batch >= ${BATCH_TILE} * sizeof(int16_t); batch -= ${BATCH_TILE} * sizeof(int16_t)) { - $for N in range(SIMD_TILE): - const int16x8_t vi${N} = vld1q_s16(input); input += 8; - - $for N in range(SIMD_TILE): - const uint16x8_t vabs${N} = vreinterpretq_u16_s16(vabsq_s16(vi${N})); - - $for N in range(SIMD_TILE): - vmax${N} = vmaxq_u16(vmax${N}, vabs${N}); - } - - $SIMD_SLICE = 1 - $while SIMD_SLICE < SIMD_TILE: - $for S in range(0, SIMD_TILE, SIMD_SLICE * 2): - $if S + SIMD_SLICE < SIMD_TILE: - vmax${S} = vmaxq_u16(vmax${S}, vmax${S + SIMD_SLICE}); - $SIMD_SLICE *= 2 - for (; batch >= 8 * sizeof(int16_t); batch -= 8 * sizeof(int16_t)) { - const int16x8_t vi = vld1q_s16(input); input += 8; - const uint16x8_t vabs = vreinterpretq_u16_s16(vabsq_s16(vi)); - vmax0 = vmaxq_u16(vmax0, vabs); - } - if (batch != 0) { - do { - const int16x8_t vi = vld1q_dup_s16(input); input += 1; - const uint16x8_t vabs = vreinterpretq_u16_s16(vabsq_s16(vi)); - vmax0 = vmaxq_u16(vmax0, vabs); - batch -= sizeof(int16_t); - } while (batch != 0); - } - - #if XNN_ARCH_ARM64 - *output = vmaxvq_u16(vmax0); - #else - uint16x4_t vmax_lo = vmax_u16(vget_low_u16(vmax0), vget_high_u16(vmax0)); - vmax_lo = vpmax_u16(vmax_lo, vmax_lo); - vmax_lo = vpmax_u16(vmax_lo, vmax_lo); - vst1_lane_u16(output, vmax_lo, 0); - #endif -} diff --git a/src/s16-rmaxabs/scalar.c.in b/src/s16-rmaxabs/scalar.c.in deleted file mode 100644 index cc7d49da266..00000000000 --- a/src/s16-rmaxabs/scalar.c.in +++ /dev/null 
@@ -1,72 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert BATCH_TILE >= 1 -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/rmaxabs.h" - - -void xnn_s16_rmaxabs_ukernel__scalar_x${BATCH_TILE}( - size_t batch, - const int16_t* input, - uint16_t* output) -{ - assert(batch != 0); - assert(batch % sizeof(int16_t) == 0); - assert(input != NULL); - assert(output != NULL); - - $for N in range(BATCH_TILE): - uint32_t vmax${N} = 0; - - $if BATCH_TILE == 1: - do { - const int32_t vi = (int32_t) *input++; - const uint32_t vabs = math_abs_s32(vi); - vmax0 = math_max_u32(vmax0, vabs); - batch -= sizeof(int16_t); - } while (batch != 0); - $else: - for (; batch >= ${BATCH_TILE} * sizeof(int16_t); batch -= ${BATCH_TILE} * sizeof(int16_t)) { - $for N in range(BATCH_TILE): - const int32_t vi${N} = (int32_t) input[${N}]; - input += ${BATCH_TILE}; - - $for N in range(BATCH_TILE): - const uint32_t vabs${N} = math_abs_s32(vi${N}); - - $for N in range(BATCH_TILE): - vmax${N} = math_max_u32(vmax${N}, vabs${N}); - } - - $BATCH_SLICE = 1 - $while BATCH_SLICE < BATCH_TILE: - $for S in range(0, BATCH_TILE, BATCH_SLICE * 2): - $if S + BATCH_SLICE < BATCH_TILE: - vmax${S} = math_max_u32(vmax${S}, vmax${S + BATCH_SLICE}); - $BATCH_SLICE *= 2 - - $if BATCH_TILE == 2: - if (batch != 0) { - assert(batch == sizeof(int16_t)); - const int32_t vi = (int32_t) *input; - const uint32_t vabs = math_abs_s32(vi); - vmax0 = math_max_u32(vmax0, vabs); - } - $else: - if (batch != 0) { - do { - const int32_t vi = (int32_t) *input++; - const uint32_t vabs = math_abs_s32(vi); - vmax0 = math_max_u32(vmax0, vabs); - batch -= sizeof(int16_t); - } while (batch != 0); - } - *output = (uint16_t) vmax0; -} diff --git a/src/s16-window/gen/s16-window-neon-u16.c b/src/s16-window/gen/s16-window-neon-u16.c deleted file mode 100644 index 5137eca496a..00000000000 
--- a/src/s16-window/gen/s16-window-neon-u16.c +++ /dev/null @@ -1,100 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-window/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/window.h" - - -void xnn_s16_window_ukernel__neon_u16( - size_t rows, - size_t channels, - const int16_t* input, - const int16_t* weights, - int16_t* output, - uint32_t shift) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(input != NULL); - assert(weights != NULL); - assert(output != NULL); - assert(shift < 32); - - const int32x4_t vshift = vdupq_n_s32(-(int32_t)shift); // negative to shift right. - - do { - const int16_t* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(int16_t); c -= 16 * sizeof(int16_t)) { - const int16x8_t vi0 = vld1q_s16(input); input += 8; - const int16x8_t vi1 = vld1q_s16(input); input += 8; - - const int16x8_t vw0 = vld1q_s16(w); w += 8; - const int16x8_t vw1 = vld1q_s16(w); w += 8; - - int32x4_t vacc0_lo = vmull_s16(vget_low_s16(vi0), vget_low_s16(vw0)); - int32x4_t vacc0_hi = vmull_s16(vget_high_s16(vi0), vget_high_s16(vw0)); - int32x4_t vacc1_lo = vmull_s16(vget_low_s16(vi1), vget_low_s16(vw1)); - int32x4_t vacc1_hi = vmull_s16(vget_high_s16(vi1), vget_high_s16(vw1)); - - vacc0_lo = vshlq_s32(vacc0_lo, vshift); - vacc0_hi = vshlq_s32(vacc0_hi, vshift); - vacc1_lo = vshlq_s32(vacc1_lo, vshift); - vacc1_hi = vshlq_s32(vacc1_hi, vshift); - - const int16x8_t vout0 = vcombine_s16(vqmovn_s32(vacc0_lo), vqmovn_s32(vacc0_hi)); - const int16x8_t vout1 = vcombine_s16(vqmovn_s32(vacc1_lo), vqmovn_s32(vacc1_hi)); - - vst1q_s16(output, vout0); output += 8; - vst1q_s16(output, vout1); output += 8; - } - - // Remainder of full vectors - for (; c >= 8 * sizeof(int16_t); c -= 8 * 
sizeof(int16_t)) { - const int16x8_t vi = vld1q_s16(input); input += 8; - const int16x8_t vw = vld1q_s16(w); w += 8; - int32x4_t vacc_lo = vmull_s16(vget_low_s16(vi), vget_low_s16(vw)); - int32x4_t vacc_hi = vmull_s16(vget_high_s16(vi), vget_high_s16(vw)); - vacc_lo = vshlq_s32(vacc_lo, vshift); - vacc_hi = vshlq_s32(vacc_hi, vshift); - const int16x8_t vout = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); - vst1q_s16(output, vout); output += 8; - } - - assert(c % 2 == 0); - // Remainder of 1 to 7 channels - if XNN_UNLIKELY(c != 0) { - const int16x8_t vi = vld1q_s16(input); input = (const int16_t*) ((uintptr_t) input + c); - const int16x8_t vw = vld1q_s16(w); - int32x4_t vacc = vmull_s16(vget_low_s16(vi), vget_low_s16(vw)); - vacc = vshlq_s32(vacc, vshift); - int16x4_t vout = vqmovn_s32(vacc); - if (c & (4 * sizeof(int16_t))) { - vst1_s16(output, vout); output += 4; - vacc = vmull_s16(vget_high_s16(vi), vget_high_s16(vw)); - vacc = vshlq_s32(vacc, vshift); - vout = vqmovn_s32(vacc); - } - if (c & (2 * sizeof(int16_t))) { - vst1_lane_u32((void*) output, vreinterpret_u32_s16(vout), 0); output += 2; - vout = vext_s16(vout, vout, 2); - } - if (c & (1 * sizeof(int16_t))) { - vst1_lane_s16(output, vout, 0); output += 1; - } - } - - } while (--rows != 0); -} diff --git a/src/s16-window/gen/s16-window-neon-u24.c b/src/s16-window/gen/s16-window-neon-u24.c deleted file mode 100644 index 8a8e4e68336..00000000000 --- a/src/s16-window/gen/s16-window-neon-u24.c +++ /dev/null @@ -1,108 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-window/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/window.h" - - -void xnn_s16_window_ukernel__neon_u24( - size_t rows, - size_t channels, - const int16_t* input, - const int16_t* weights, - int16_t* output, - uint32_t shift) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(input != NULL); - assert(weights != NULL); - assert(output != NULL); - assert(shift < 32); - - const int32x4_t vshift = vdupq_n_s32(-(int32_t)shift); // negative to shift right. - - do { - const int16_t* w = weights; - size_t c = channels; - for (; c >= 24 * sizeof(int16_t); c -= 24 * sizeof(int16_t)) { - const int16x8_t vi0 = vld1q_s16(input); input += 8; - const int16x8_t vi1 = vld1q_s16(input); input += 8; - const int16x8_t vi2 = vld1q_s16(input); input += 8; - - const int16x8_t vw0 = vld1q_s16(w); w += 8; - const int16x8_t vw1 = vld1q_s16(w); w += 8; - const int16x8_t vw2 = vld1q_s16(w); w += 8; - - int32x4_t vacc0_lo = vmull_s16(vget_low_s16(vi0), vget_low_s16(vw0)); - int32x4_t vacc0_hi = vmull_s16(vget_high_s16(vi0), vget_high_s16(vw0)); - int32x4_t vacc1_lo = vmull_s16(vget_low_s16(vi1), vget_low_s16(vw1)); - int32x4_t vacc1_hi = vmull_s16(vget_high_s16(vi1), vget_high_s16(vw1)); - int32x4_t vacc2_lo = vmull_s16(vget_low_s16(vi2), vget_low_s16(vw2)); - int32x4_t vacc2_hi = vmull_s16(vget_high_s16(vi2), vget_high_s16(vw2)); - - vacc0_lo = vshlq_s32(vacc0_lo, vshift); - vacc0_hi = vshlq_s32(vacc0_hi, vshift); - vacc1_lo = vshlq_s32(vacc1_lo, vshift); - vacc1_hi = vshlq_s32(vacc1_hi, vshift); - vacc2_lo = vshlq_s32(vacc2_lo, vshift); - vacc2_hi = vshlq_s32(vacc2_hi, vshift); - - const int16x8_t vout0 = vcombine_s16(vqmovn_s32(vacc0_lo), vqmovn_s32(vacc0_hi)); - const int16x8_t vout1 = vcombine_s16(vqmovn_s32(vacc1_lo), vqmovn_s32(vacc1_hi)); - const int16x8_t vout2 = vcombine_s16(vqmovn_s32(vacc2_lo), vqmovn_s32(vacc2_hi)); - - vst1q_s16(output, vout0); output += 8; - vst1q_s16(output, vout1); output += 8; - vst1q_s16(output, 
vout2); output += 8; - } - - // Remainder of full vectors - for (; c >= 8 * sizeof(int16_t); c -= 8 * sizeof(int16_t)) { - const int16x8_t vi = vld1q_s16(input); input += 8; - const int16x8_t vw = vld1q_s16(w); w += 8; - int32x4_t vacc_lo = vmull_s16(vget_low_s16(vi), vget_low_s16(vw)); - int32x4_t vacc_hi = vmull_s16(vget_high_s16(vi), vget_high_s16(vw)); - vacc_lo = vshlq_s32(vacc_lo, vshift); - vacc_hi = vshlq_s32(vacc_hi, vshift); - const int16x8_t vout = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); - vst1q_s16(output, vout); output += 8; - } - - assert(c % 2 == 0); - // Remainder of 1 to 7 channels - if XNN_UNLIKELY(c != 0) { - const int16x8_t vi = vld1q_s16(input); input = (const int16_t*) ((uintptr_t) input + c); - const int16x8_t vw = vld1q_s16(w); - int32x4_t vacc = vmull_s16(vget_low_s16(vi), vget_low_s16(vw)); - vacc = vshlq_s32(vacc, vshift); - int16x4_t vout = vqmovn_s32(vacc); - if (c & (4 * sizeof(int16_t))) { - vst1_s16(output, vout); output += 4; - vacc = vmull_s16(vget_high_s16(vi), vget_high_s16(vw)); - vacc = vshlq_s32(vacc, vshift); - vout = vqmovn_s32(vacc); - } - if (c & (2 * sizeof(int16_t))) { - vst1_lane_u32((void*) output, vreinterpret_u32_s16(vout), 0); output += 2; - vout = vext_s16(vout, vout, 2); - } - if (c & (1 * sizeof(int16_t))) { - vst1_lane_s16(output, vout, 0); output += 1; - } - } - - } while (--rows != 0); -} diff --git a/src/s16-window/gen/s16-window-neon-u32.c b/src/s16-window/gen/s16-window-neon-u32.c deleted file mode 100644 index 31d35d1c368..00000000000 --- a/src/s16-window/gen/s16-window-neon-u32.c +++ /dev/null @@ -1,116 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-window/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/window.h" - - -void xnn_s16_window_ukernel__neon_u32( - size_t rows, - size_t channels, - const int16_t* input, - const int16_t* weights, - int16_t* output, - uint32_t shift) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(input != NULL); - assert(weights != NULL); - assert(output != NULL); - assert(shift < 32); - - const int32x4_t vshift = vdupq_n_s32(-(int32_t)shift); // negative to shift right. - - do { - const int16_t* w = weights; - size_t c = channels; - for (; c >= 32 * sizeof(int16_t); c -= 32 * sizeof(int16_t)) { - const int16x8_t vi0 = vld1q_s16(input); input += 8; - const int16x8_t vi1 = vld1q_s16(input); input += 8; - const int16x8_t vi2 = vld1q_s16(input); input += 8; - const int16x8_t vi3 = vld1q_s16(input); input += 8; - - const int16x8_t vw0 = vld1q_s16(w); w += 8; - const int16x8_t vw1 = vld1q_s16(w); w += 8; - const int16x8_t vw2 = vld1q_s16(w); w += 8; - const int16x8_t vw3 = vld1q_s16(w); w += 8; - - int32x4_t vacc0_lo = vmull_s16(vget_low_s16(vi0), vget_low_s16(vw0)); - int32x4_t vacc0_hi = vmull_s16(vget_high_s16(vi0), vget_high_s16(vw0)); - int32x4_t vacc1_lo = vmull_s16(vget_low_s16(vi1), vget_low_s16(vw1)); - int32x4_t vacc1_hi = vmull_s16(vget_high_s16(vi1), vget_high_s16(vw1)); - int32x4_t vacc2_lo = vmull_s16(vget_low_s16(vi2), vget_low_s16(vw2)); - int32x4_t vacc2_hi = vmull_s16(vget_high_s16(vi2), vget_high_s16(vw2)); - int32x4_t vacc3_lo = vmull_s16(vget_low_s16(vi3), vget_low_s16(vw3)); - int32x4_t vacc3_hi = vmull_s16(vget_high_s16(vi3), vget_high_s16(vw3)); - - vacc0_lo = vshlq_s32(vacc0_lo, vshift); - vacc0_hi = vshlq_s32(vacc0_hi, vshift); - vacc1_lo = vshlq_s32(vacc1_lo, vshift); - vacc1_hi = vshlq_s32(vacc1_hi, vshift); - vacc2_lo = vshlq_s32(vacc2_lo, vshift); - vacc2_hi = vshlq_s32(vacc2_hi, vshift); - vacc3_lo = vshlq_s32(vacc3_lo, vshift); - vacc3_hi = vshlq_s32(vacc3_hi, vshift); - - const int16x8_t vout0 = 
vcombine_s16(vqmovn_s32(vacc0_lo), vqmovn_s32(vacc0_hi)); - const int16x8_t vout1 = vcombine_s16(vqmovn_s32(vacc1_lo), vqmovn_s32(vacc1_hi)); - const int16x8_t vout2 = vcombine_s16(vqmovn_s32(vacc2_lo), vqmovn_s32(vacc2_hi)); - const int16x8_t vout3 = vcombine_s16(vqmovn_s32(vacc3_lo), vqmovn_s32(vacc3_hi)); - - vst1q_s16(output, vout0); output += 8; - vst1q_s16(output, vout1); output += 8; - vst1q_s16(output, vout2); output += 8; - vst1q_s16(output, vout3); output += 8; - } - - // Remainder of full vectors - for (; c >= 8 * sizeof(int16_t); c -= 8 * sizeof(int16_t)) { - const int16x8_t vi = vld1q_s16(input); input += 8; - const int16x8_t vw = vld1q_s16(w); w += 8; - int32x4_t vacc_lo = vmull_s16(vget_low_s16(vi), vget_low_s16(vw)); - int32x4_t vacc_hi = vmull_s16(vget_high_s16(vi), vget_high_s16(vw)); - vacc_lo = vshlq_s32(vacc_lo, vshift); - vacc_hi = vshlq_s32(vacc_hi, vshift); - const int16x8_t vout = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); - vst1q_s16(output, vout); output += 8; - } - - assert(c % 2 == 0); - // Remainder of 1 to 7 channels - if XNN_UNLIKELY(c != 0) { - const int16x8_t vi = vld1q_s16(input); input = (const int16_t*) ((uintptr_t) input + c); - const int16x8_t vw = vld1q_s16(w); - int32x4_t vacc = vmull_s16(vget_low_s16(vi), vget_low_s16(vw)); - vacc = vshlq_s32(vacc, vshift); - int16x4_t vout = vqmovn_s32(vacc); - if (c & (4 * sizeof(int16_t))) { - vst1_s16(output, vout); output += 4; - vacc = vmull_s16(vget_high_s16(vi), vget_high_s16(vw)); - vacc = vshlq_s32(vacc, vshift); - vout = vqmovn_s32(vacc); - } - if (c & (2 * sizeof(int16_t))) { - vst1_lane_u32((void*) output, vreinterpret_u32_s16(vout), 0); output += 2; - vout = vext_s16(vout, vout, 2); - } - if (c & (1 * sizeof(int16_t))) { - vst1_lane_s16(output, vout, 0); output += 1; - } - } - - } while (--rows != 0); -} diff --git a/src/s16-window/gen/s16-window-neon-u8.c b/src/s16-window/gen/s16-window-neon-u8.c deleted file mode 100644 index f181dc40af6..00000000000 --- 
a/src/s16-window/gen/s16-window-neon-u8.c +++ /dev/null @@ -1,77 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-window/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/window.h" - - -void xnn_s16_window_ukernel__neon_u8( - size_t rows, - size_t channels, - const int16_t* input, - const int16_t* weights, - int16_t* output, - uint32_t shift) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(input != NULL); - assert(weights != NULL); - assert(output != NULL); - assert(shift < 32); - - const int32x4_t vshift = vdupq_n_s32(-(int32_t)shift); // negative to shift right. - - do { - const int16_t* w = weights; - size_t c = channels; - - // Remainder of full vectors - for (; c >= 8 * sizeof(int16_t); c -= 8 * sizeof(int16_t)) { - const int16x8_t vi = vld1q_s16(input); input += 8; - const int16x8_t vw = vld1q_s16(w); w += 8; - int32x4_t vacc_lo = vmull_s16(vget_low_s16(vi), vget_low_s16(vw)); - int32x4_t vacc_hi = vmull_s16(vget_high_s16(vi), vget_high_s16(vw)); - vacc_lo = vshlq_s32(vacc_lo, vshift); - vacc_hi = vshlq_s32(vacc_hi, vshift); - const int16x8_t vout = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); - vst1q_s16(output, vout); output += 8; - } - - assert(c % 2 == 0); - // Remainder of 1 to 7 channels - if XNN_UNLIKELY(c != 0) { - const int16x8_t vi = vld1q_s16(input); input = (const int16_t*) ((uintptr_t) input + c); - const int16x8_t vw = vld1q_s16(w); - int32x4_t vacc = vmull_s16(vget_low_s16(vi), vget_low_s16(vw)); - vacc = vshlq_s32(vacc, vshift); - int16x4_t vout = vqmovn_s32(vacc); - if (c & (4 * sizeof(int16_t))) { - vst1_s16(output, vout); output += 4; - vacc = vmull_s16(vget_high_s16(vi), vget_high_s16(vw)); - vacc = vshlq_s32(vacc, vshift); - vout = 
vqmovn_s32(vacc); - } - if (c & (2 * sizeof(int16_t))) { - vst1_lane_u32((void*) output, vreinterpret_u32_s16(vout), 0); output += 2; - vout = vext_s16(vout, vout, 2); - } - if (c & (1 * sizeof(int16_t))) { - vst1_lane_s16(output, vout, 0); output += 1; - } - } - - } while (--rows != 0); -} diff --git a/src/s16-window/gen/s16-window-scalar-u1.c b/src/s16-window/gen/s16-window-scalar-u1.c deleted file mode 100644 index e2d3bb8293c..00000000000 --- a/src/s16-window/gen/s16-window-scalar-u1.c +++ /dev/null @@ -1,47 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-window/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/window.h" - - -void xnn_s16_window_ukernel__scalar_u1( - size_t rows, - size_t channels, - const int16_t* input, - const int16_t* weights, - int16_t* output, - uint32_t shift) -{ - assert(rows > 0); - assert(channels != 0); - assert(input != NULL); - assert(weights != NULL); - assert(output != NULL); - assert(shift < 32); - - do { - size_t c = channels; - const int16_t* w = weights; - do { - const int32_t vi = (int32_t) *input++; - const int32_t vw = (int32_t) *w++; - int32_t vout = vi * vw; - vout = math_asr_s32(vout, shift); - vout = math_max_s32(vout, INT16_MIN); - vout = math_min_s32(vout, INT16_MAX); - *output++ = (int16_t) vout; - c -= sizeof(int16_t); - } while (c != 0); - } while (--rows != 0); -} diff --git a/src/s16-window/gen/s16-window-scalar-u2.c b/src/s16-window/gen/s16-window-scalar-u2.c deleted file mode 100644 index 96f81b81ed1..00000000000 --- a/src/s16-window/gen/s16-window-scalar-u2.c +++ /dev/null @@ -1,73 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/s16-window/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/window.h" - - -void xnn_s16_window_ukernel__scalar_u2( - size_t rows, - size_t channels, - const int16_t* input, - const int16_t* weights, - int16_t* output, - uint32_t shift) -{ - assert(rows > 0); - assert(channels != 0); - assert(input != NULL); - assert(weights != NULL); - assert(output != NULL); - assert(shift < 32); - - do { - size_t c = channels; - const int16_t* w = weights; - for (; c >= 2 * sizeof(int16_t); c -= 2 * sizeof(int16_t)) { - const int16_t vi0 = input[0]; - const int16_t vi1 = input[1]; - input += 2; - - const int16_t w0 = w[0]; - const int16_t w1 = w[1]; - w += 2; - - int32_t vout0 = (int32_t) vi0 * (int32_t) w0; - int32_t vout1 = (int32_t) vi1 * (int32_t) w1; - - vout0 = math_asr_s32(vout0, shift); - vout1 = math_asr_s32(vout1, shift); - - vout0 = math_max_s32(vout0, INT16_MIN); - vout1 = math_max_s32(vout1, INT16_MIN); - - vout0 = math_min_s32(vout0, INT16_MAX); - vout1 = math_min_s32(vout1, INT16_MAX); - - output[0] = (int16_t) vout0; - output[1] = (int16_t) vout1; - - output += 2; - } - if XNN_UNLIKELY(c != 0) { - assert(c == sizeof(int16_t)); - const int32_t vi = (int32_t) *input++; - const int32_t vw = (int32_t) *w; - int32_t vout = vi * vw; - vout = math_asr_s32(vout, shift); - vout = math_max_s32(vout, INT16_MIN); - vout = math_min_s32(vout, INT16_MAX); - *output++ = (int16_t) vout; - } - } while (--rows != 0); -} diff --git a/src/s16-window/gen/s16-window-scalar-u3.c b/src/s16-window/gen/s16-window-scalar-u3.c deleted file mode 100644 index eaa2bd7fcfa..00000000000 --- a/src/s16-window/gen/s16-window-scalar-u3.c +++ /dev/null @@ -1,82 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/s16-window/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/window.h" - - -void xnn_s16_window_ukernel__scalar_u3( - size_t rows, - size_t channels, - const int16_t* input, - const int16_t* weights, - int16_t* output, - uint32_t shift) -{ - assert(rows > 0); - assert(channels != 0); - assert(input != NULL); - assert(weights != NULL); - assert(output != NULL); - assert(shift < 32); - - do { - size_t c = channels; - const int16_t* w = weights; - for (; c >= 3 * sizeof(int16_t); c -= 3 * sizeof(int16_t)) { - const int16_t vi0 = input[0]; - const int16_t vi1 = input[1]; - const int16_t vi2 = input[2]; - input += 3; - - const int16_t w0 = w[0]; - const int16_t w1 = w[1]; - const int16_t w2 = w[2]; - w += 3; - - int32_t vout0 = (int32_t) vi0 * (int32_t) w0; - int32_t vout1 = (int32_t) vi1 * (int32_t) w1; - int32_t vout2 = (int32_t) vi2 * (int32_t) w2; - - vout0 = math_asr_s32(vout0, shift); - vout1 = math_asr_s32(vout1, shift); - vout2 = math_asr_s32(vout2, shift); - - vout0 = math_max_s32(vout0, INT16_MIN); - vout1 = math_max_s32(vout1, INT16_MIN); - vout2 = math_max_s32(vout2, INT16_MIN); - - vout0 = math_min_s32(vout0, INT16_MAX); - vout1 = math_min_s32(vout1, INT16_MAX); - vout2 = math_min_s32(vout2, INT16_MAX); - - output[0] = (int16_t) vout0; - output[1] = (int16_t) vout1; - output[2] = (int16_t) vout2; - - output += 3; - } - if XNN_UNLIKELY(c != 0) { - do { - const int32_t vi = (int32_t) *input++; - const int32_t vw = (int32_t) *w++; - int32_t vout = vi * vw; - vout = math_asr_s32(vout, shift); - vout = math_max_s32(vout, INT16_MIN); - vout = math_min_s32(vout, INT16_MAX); - *output++ = (int16_t) vout; - c -= sizeof(int16_t); - } while (c != 0); - } - } while (--rows != 0); -} diff --git 
a/src/s16-window/gen/s16-window-scalar-u4.c b/src/s16-window/gen/s16-window-scalar-u4.c deleted file mode 100644 index 213a22cb980..00000000000 --- a/src/s16-window/gen/s16-window-scalar-u4.c +++ /dev/null @@ -1,89 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-window/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/window.h" - - -void xnn_s16_window_ukernel__scalar_u4( - size_t rows, - size_t channels, - const int16_t* input, - const int16_t* weights, - int16_t* output, - uint32_t shift) -{ - assert(rows > 0); - assert(channels != 0); - assert(input != NULL); - assert(weights != NULL); - assert(output != NULL); - assert(shift < 32); - - do { - size_t c = channels; - const int16_t* w = weights; - for (; c >= 4 * sizeof(int16_t); c -= 4 * sizeof(int16_t)) { - const int16_t vi0 = input[0]; - const int16_t vi1 = input[1]; - const int16_t vi2 = input[2]; - const int16_t vi3 = input[3]; - input += 4; - - const int16_t w0 = w[0]; - const int16_t w1 = w[1]; - const int16_t w2 = w[2]; - const int16_t w3 = w[3]; - w += 4; - - int32_t vout0 = (int32_t) vi0 * (int32_t) w0; - int32_t vout1 = (int32_t) vi1 * (int32_t) w1; - int32_t vout2 = (int32_t) vi2 * (int32_t) w2; - int32_t vout3 = (int32_t) vi3 * (int32_t) w3; - - vout0 = math_asr_s32(vout0, shift); - vout1 = math_asr_s32(vout1, shift); - vout2 = math_asr_s32(vout2, shift); - vout3 = math_asr_s32(vout3, shift); - - vout0 = math_max_s32(vout0, INT16_MIN); - vout1 = math_max_s32(vout1, INT16_MIN); - vout2 = math_max_s32(vout2, INT16_MIN); - vout3 = math_max_s32(vout3, INT16_MIN); - - vout0 = math_min_s32(vout0, INT16_MAX); - vout1 = math_min_s32(vout1, INT16_MAX); - vout2 = math_min_s32(vout2, INT16_MAX); - vout3 = math_min_s32(vout3, INT16_MAX); - - output[0] = 
(int16_t) vout0; - output[1] = (int16_t) vout1; - output[2] = (int16_t) vout2; - output[3] = (int16_t) vout3; - - output += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const int32_t vi = (int32_t) *input++; - const int32_t vw = (int32_t) *w++; - int32_t vout = vi * vw; - vout = math_asr_s32(vout, shift); - vout = math_max_s32(vout, INT16_MIN); - vout = math_min_s32(vout, INT16_MAX); - *output++ = (int16_t) vout; - c -= sizeof(int16_t); - } while (c != 0); - } - } while (--rows != 0); -} diff --git a/src/s16-window/gen/s16-window-shift12-neon-u16.c b/src/s16-window/gen/s16-window-shift12-neon-u16.c deleted file mode 100644 index 8f8fa7b378d..00000000000 --- a/src/s16-window/gen/s16-window-shift12-neon-u16.c +++ /dev/null @@ -1,97 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-window/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/window.h" - - -void xnn_s16_window_shift12_ukernel__neon_u16( - size_t rows, - size_t channels, - const int16_t* input, - const int16_t* weights, - int16_t* output, - uint32_t shift) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(input != NULL); - assert(weights != NULL); - assert(output != NULL); - assert(shift == 12); - - - do { - const int16_t* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(int16_t); c -= 16 * sizeof(int16_t)) { - const int16x8_t vi0 = vld1q_s16(input); input += 8; - const int16x8_t vi1 = vld1q_s16(input); input += 8; - - const int16x8_t vw0 = vld1q_s16(w); w += 8; - const int16x8_t vw1 = vld1q_s16(w); w += 8; - - int32x4_t vacc0_lo = vmull_s16(vget_low_s16(vi0), vget_low_s16(vw0)); - int32x4_t vacc0_hi = vmull_s16(vget_high_s16(vi0), vget_high_s16(vw0)); - int32x4_t vacc1_lo = vmull_s16(vget_low_s16(vi1), vget_low_s16(vw1)); - int32x4_t vacc1_hi = vmull_s16(vget_high_s16(vi1), vget_high_s16(vw1)); - - const int16x4_t vshift0_lo = vqshrn_n_s32(vacc0_lo, 12); - const int16x4_t vshift0_hi = vqshrn_n_s32(vacc0_hi, 12); - const int16x4_t vshift1_lo = vqshrn_n_s32(vacc1_lo, 12); - const int16x4_t vshift1_hi = vqshrn_n_s32(vacc1_hi, 12); - - const int16x8_t vout0 = vcombine_s16(vshift0_lo, vshift0_hi); - const int16x8_t vout1 = vcombine_s16(vshift1_lo, vshift1_hi); - - vst1q_s16(output, vout0); output += 8; - vst1q_s16(output, vout1); output += 8; - } - - // Remainder of full vectors - for (; c >= 8 * sizeof(int16_t); c -= 8 * sizeof(int16_t)) { - const int16x8_t vi = vld1q_s16(input); input += 8; - const int16x8_t vw = vld1q_s16(w); w += 8; - int32x4_t vacc_lo = vmull_s16(vget_low_s16(vi), vget_low_s16(vw)); - int32x4_t vacc_hi = vmull_s16(vget_high_s16(vi), vget_high_s16(vw)); - const int16x4_t vshift_lo = vqshrn_n_s32(vacc_lo, 12); - const int16x4_t vshift_hi = vqshrn_n_s32(vacc_hi, 12); - const int16x8_t vout = 
vcombine_s16(vshift_lo, vshift_hi); - vst1q_s16(output, vout); output += 8; - } - - assert(c % 2 == 0); - // Remainder of 1 to 7 channels - if XNN_UNLIKELY(c != 0) { - const int16x8_t vi = vld1q_s16(input); input = (const int16_t*) ((uintptr_t) input + c); - const int16x8_t vw = vld1q_s16(w); - int32x4_t vacc = vmull_s16(vget_low_s16(vi), vget_low_s16(vw)); - int16x4_t vout = vqshrn_n_s32(vacc, 12); - if (c & (4 * sizeof(int16_t))) { - vst1_s16(output, vout); output += 4; - vacc = vmull_s16(vget_high_s16(vi), vget_high_s16(vw)); - vout = vqshrn_n_s32(vacc, 12); - } - if (c & (2 * sizeof(int16_t))) { - vst1_lane_u32((void*) output, vreinterpret_u32_s16(vout), 0); output += 2; - vout = vext_s16(vout, vout, 2); - } - if (c & (1 * sizeof(int16_t))) { - vst1_lane_s16(output, vout, 0); output += 1; - } - } - - } while (--rows != 0); -} diff --git a/src/s16-window/gen/s16-window-shift12-neon-u24.c b/src/s16-window/gen/s16-window-shift12-neon-u24.c deleted file mode 100644 index 5e4fa1fdb97..00000000000 --- a/src/s16-window/gen/s16-window-shift12-neon-u24.c +++ /dev/null @@ -1,105 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-window/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/window.h" - - -void xnn_s16_window_shift12_ukernel__neon_u24( - size_t rows, - size_t channels, - const int16_t* input, - const int16_t* weights, - int16_t* output, - uint32_t shift) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(input != NULL); - assert(weights != NULL); - assert(output != NULL); - assert(shift == 12); - - - do { - const int16_t* w = weights; - size_t c = channels; - for (; c >= 24 * sizeof(int16_t); c -= 24 * sizeof(int16_t)) { - const int16x8_t vi0 = vld1q_s16(input); input += 8; - const int16x8_t vi1 = vld1q_s16(input); input += 8; - const int16x8_t vi2 = vld1q_s16(input); input += 8; - - const int16x8_t vw0 = vld1q_s16(w); w += 8; - const int16x8_t vw1 = vld1q_s16(w); w += 8; - const int16x8_t vw2 = vld1q_s16(w); w += 8; - - int32x4_t vacc0_lo = vmull_s16(vget_low_s16(vi0), vget_low_s16(vw0)); - int32x4_t vacc0_hi = vmull_s16(vget_high_s16(vi0), vget_high_s16(vw0)); - int32x4_t vacc1_lo = vmull_s16(vget_low_s16(vi1), vget_low_s16(vw1)); - int32x4_t vacc1_hi = vmull_s16(vget_high_s16(vi1), vget_high_s16(vw1)); - int32x4_t vacc2_lo = vmull_s16(vget_low_s16(vi2), vget_low_s16(vw2)); - int32x4_t vacc2_hi = vmull_s16(vget_high_s16(vi2), vget_high_s16(vw2)); - - const int16x4_t vshift0_lo = vqshrn_n_s32(vacc0_lo, 12); - const int16x4_t vshift0_hi = vqshrn_n_s32(vacc0_hi, 12); - const int16x4_t vshift1_lo = vqshrn_n_s32(vacc1_lo, 12); - const int16x4_t vshift1_hi = vqshrn_n_s32(vacc1_hi, 12); - const int16x4_t vshift2_lo = vqshrn_n_s32(vacc2_lo, 12); - const int16x4_t vshift2_hi = vqshrn_n_s32(vacc2_hi, 12); - - const int16x8_t vout0 = vcombine_s16(vshift0_lo, vshift0_hi); - const int16x8_t vout1 = vcombine_s16(vshift1_lo, vshift1_hi); - const int16x8_t vout2 = vcombine_s16(vshift2_lo, vshift2_hi); - - vst1q_s16(output, vout0); output += 8; - vst1q_s16(output, vout1); output += 8; - vst1q_s16(output, vout2); output += 8; - } - - // 
Remainder of full vectors - for (; c >= 8 * sizeof(int16_t); c -= 8 * sizeof(int16_t)) { - const int16x8_t vi = vld1q_s16(input); input += 8; - const int16x8_t vw = vld1q_s16(w); w += 8; - int32x4_t vacc_lo = vmull_s16(vget_low_s16(vi), vget_low_s16(vw)); - int32x4_t vacc_hi = vmull_s16(vget_high_s16(vi), vget_high_s16(vw)); - const int16x4_t vshift_lo = vqshrn_n_s32(vacc_lo, 12); - const int16x4_t vshift_hi = vqshrn_n_s32(vacc_hi, 12); - const int16x8_t vout = vcombine_s16(vshift_lo, vshift_hi); - vst1q_s16(output, vout); output += 8; - } - - assert(c % 2 == 0); - // Remainder of 1 to 7 channels - if XNN_UNLIKELY(c != 0) { - const int16x8_t vi = vld1q_s16(input); input = (const int16_t*) ((uintptr_t) input + c); - const int16x8_t vw = vld1q_s16(w); - int32x4_t vacc = vmull_s16(vget_low_s16(vi), vget_low_s16(vw)); - int16x4_t vout = vqshrn_n_s32(vacc, 12); - if (c & (4 * sizeof(int16_t))) { - vst1_s16(output, vout); output += 4; - vacc = vmull_s16(vget_high_s16(vi), vget_high_s16(vw)); - vout = vqshrn_n_s32(vacc, 12); - } - if (c & (2 * sizeof(int16_t))) { - vst1_lane_u32((void*) output, vreinterpret_u32_s16(vout), 0); output += 2; - vout = vext_s16(vout, vout, 2); - } - if (c & (1 * sizeof(int16_t))) { - vst1_lane_s16(output, vout, 0); output += 1; - } - } - - } while (--rows != 0); -} diff --git a/src/s16-window/gen/s16-window-shift12-neon-u32.c b/src/s16-window/gen/s16-window-shift12-neon-u32.c deleted file mode 100644 index 57d438c137a..00000000000 --- a/src/s16-window/gen/s16-window-shift12-neon-u32.c +++ /dev/null @@ -1,113 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-window/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/window.h" - - -void xnn_s16_window_shift12_ukernel__neon_u32( - size_t rows, - size_t channels, - const int16_t* input, - const int16_t* weights, - int16_t* output, - uint32_t shift) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(input != NULL); - assert(weights != NULL); - assert(output != NULL); - assert(shift == 12); - - - do { - const int16_t* w = weights; - size_t c = channels; - for (; c >= 32 * sizeof(int16_t); c -= 32 * sizeof(int16_t)) { - const int16x8_t vi0 = vld1q_s16(input); input += 8; - const int16x8_t vi1 = vld1q_s16(input); input += 8; - const int16x8_t vi2 = vld1q_s16(input); input += 8; - const int16x8_t vi3 = vld1q_s16(input); input += 8; - - const int16x8_t vw0 = vld1q_s16(w); w += 8; - const int16x8_t vw1 = vld1q_s16(w); w += 8; - const int16x8_t vw2 = vld1q_s16(w); w += 8; - const int16x8_t vw3 = vld1q_s16(w); w += 8; - - int32x4_t vacc0_lo = vmull_s16(vget_low_s16(vi0), vget_low_s16(vw0)); - int32x4_t vacc0_hi = vmull_s16(vget_high_s16(vi0), vget_high_s16(vw0)); - int32x4_t vacc1_lo = vmull_s16(vget_low_s16(vi1), vget_low_s16(vw1)); - int32x4_t vacc1_hi = vmull_s16(vget_high_s16(vi1), vget_high_s16(vw1)); - int32x4_t vacc2_lo = vmull_s16(vget_low_s16(vi2), vget_low_s16(vw2)); - int32x4_t vacc2_hi = vmull_s16(vget_high_s16(vi2), vget_high_s16(vw2)); - int32x4_t vacc3_lo = vmull_s16(vget_low_s16(vi3), vget_low_s16(vw3)); - int32x4_t vacc3_hi = vmull_s16(vget_high_s16(vi3), vget_high_s16(vw3)); - - const int16x4_t vshift0_lo = vqshrn_n_s32(vacc0_lo, 12); - const int16x4_t vshift0_hi = vqshrn_n_s32(vacc0_hi, 12); - const int16x4_t vshift1_lo = vqshrn_n_s32(vacc1_lo, 12); - const int16x4_t vshift1_hi = vqshrn_n_s32(vacc1_hi, 12); - const int16x4_t vshift2_lo = vqshrn_n_s32(vacc2_lo, 12); - const int16x4_t vshift2_hi = vqshrn_n_s32(vacc2_hi, 12); - const int16x4_t vshift3_lo = vqshrn_n_s32(vacc3_lo, 12); - const int16x4_t vshift3_hi 
= vqshrn_n_s32(vacc3_hi, 12); - - const int16x8_t vout0 = vcombine_s16(vshift0_lo, vshift0_hi); - const int16x8_t vout1 = vcombine_s16(vshift1_lo, vshift1_hi); - const int16x8_t vout2 = vcombine_s16(vshift2_lo, vshift2_hi); - const int16x8_t vout3 = vcombine_s16(vshift3_lo, vshift3_hi); - - vst1q_s16(output, vout0); output += 8; - vst1q_s16(output, vout1); output += 8; - vst1q_s16(output, vout2); output += 8; - vst1q_s16(output, vout3); output += 8; - } - - // Remainder of full vectors - for (; c >= 8 * sizeof(int16_t); c -= 8 * sizeof(int16_t)) { - const int16x8_t vi = vld1q_s16(input); input += 8; - const int16x8_t vw = vld1q_s16(w); w += 8; - int32x4_t vacc_lo = vmull_s16(vget_low_s16(vi), vget_low_s16(vw)); - int32x4_t vacc_hi = vmull_s16(vget_high_s16(vi), vget_high_s16(vw)); - const int16x4_t vshift_lo = vqshrn_n_s32(vacc_lo, 12); - const int16x4_t vshift_hi = vqshrn_n_s32(vacc_hi, 12); - const int16x8_t vout = vcombine_s16(vshift_lo, vshift_hi); - vst1q_s16(output, vout); output += 8; - } - - assert(c % 2 == 0); - // Remainder of 1 to 7 channels - if XNN_UNLIKELY(c != 0) { - const int16x8_t vi = vld1q_s16(input); input = (const int16_t*) ((uintptr_t) input + c); - const int16x8_t vw = vld1q_s16(w); - int32x4_t vacc = vmull_s16(vget_low_s16(vi), vget_low_s16(vw)); - int16x4_t vout = vqshrn_n_s32(vacc, 12); - if (c & (4 * sizeof(int16_t))) { - vst1_s16(output, vout); output += 4; - vacc = vmull_s16(vget_high_s16(vi), vget_high_s16(vw)); - vout = vqshrn_n_s32(vacc, 12); - } - if (c & (2 * sizeof(int16_t))) { - vst1_lane_u32((void*) output, vreinterpret_u32_s16(vout), 0); output += 2; - vout = vext_s16(vout, vout, 2); - } - if (c & (1 * sizeof(int16_t))) { - vst1_lane_s16(output, vout, 0); output += 1; - } - } - - } while (--rows != 0); -} diff --git a/src/s16-window/gen/s16-window-shift12-neon-u8.c b/src/s16-window/gen/s16-window-shift12-neon-u8.c deleted file mode 100644 index ab2f1b8c3a1..00000000000 --- a/src/s16-window/gen/s16-window-shift12-neon-u8.c +++ 
/dev/null @@ -1,74 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-window/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/window.h" - - -void xnn_s16_window_shift12_ukernel__neon_u8( - size_t rows, - size_t channels, - const int16_t* input, - const int16_t* weights, - int16_t* output, - uint32_t shift) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(input != NULL); - assert(weights != NULL); - assert(output != NULL); - assert(shift == 12); - - - do { - const int16_t* w = weights; - size_t c = channels; - - // Remainder of full vectors - for (; c >= 8 * sizeof(int16_t); c -= 8 * sizeof(int16_t)) { - const int16x8_t vi = vld1q_s16(input); input += 8; - const int16x8_t vw = vld1q_s16(w); w += 8; - int32x4_t vacc_lo = vmull_s16(vget_low_s16(vi), vget_low_s16(vw)); - int32x4_t vacc_hi = vmull_s16(vget_high_s16(vi), vget_high_s16(vw)); - const int16x4_t vshift_lo = vqshrn_n_s32(vacc_lo, 12); - const int16x4_t vshift_hi = vqshrn_n_s32(vacc_hi, 12); - const int16x8_t vout = vcombine_s16(vshift_lo, vshift_hi); - vst1q_s16(output, vout); output += 8; - } - - assert(c % 2 == 0); - // Remainder of 1 to 7 channels - if XNN_UNLIKELY(c != 0) { - const int16x8_t vi = vld1q_s16(input); input = (const int16_t*) ((uintptr_t) input + c); - const int16x8_t vw = vld1q_s16(w); - int32x4_t vacc = vmull_s16(vget_low_s16(vi), vget_low_s16(vw)); - int16x4_t vout = vqshrn_n_s32(vacc, 12); - if (c & (4 * sizeof(int16_t))) { - vst1_s16(output, vout); output += 4; - vacc = vmull_s16(vget_high_s16(vi), vget_high_s16(vw)); - vout = vqshrn_n_s32(vacc, 12); - } - if (c & (2 * sizeof(int16_t))) { - vst1_lane_u32((void*) output, vreinterpret_u32_s16(vout), 0); output += 2; - vout = vext_s16(vout, vout, 2); - } 
- if (c & (1 * sizeof(int16_t))) { - vst1_lane_s16(output, vout, 0); output += 1; - } - } - - } while (--rows != 0); -} diff --git a/src/s16-window/gen/s16-window-shift15-neon-u16.c b/src/s16-window/gen/s16-window-shift15-neon-u16.c deleted file mode 100644 index 928e2eaf1ec..00000000000 --- a/src/s16-window/gen/s16-window-shift15-neon-u16.c +++ /dev/null @@ -1,81 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-window/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/window.h" - - -void xnn_s16_window_shift15_ukernel__neon_u16( - size_t rows, - size_t channels, - const int16_t* input, - const int16_t* weights, - int16_t* output, - uint32_t shift) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(input != NULL); - assert(weights != NULL); - assert(output != NULL); - assert(shift == 15); - - - do { - const int16_t* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(int16_t); c -= 16 * sizeof(int16_t)) { - const int16x8_t vi0 = vld1q_s16(input); input += 8; - const int16x8_t vi1 = vld1q_s16(input); input += 8; - - const int16x8_t vw0 = vld1q_s16(w); w += 8; - const int16x8_t vw1 = vld1q_s16(w); w += 8; - - const int16x8_t vout0 = vqdmulhq_s16(vi0, vw0); - const int16x8_t vout1 = vqdmulhq_s16(vi1, vw1); - - vst1q_s16(output, vout0); output += 8; - vst1q_s16(output, vout1); output += 8; - } - - // Remainder of full vectors - for (; c >= 8 * sizeof(int16_t); c -= 8 * sizeof(int16_t)) { - const int16x8_t vi = vld1q_s16(input); input += 8; - const int16x8_t vw = vld1q_s16(w); w += 8; - const int16x8_t vout = vqdmulhq_s16(vi, vw); - vst1q_s16(output, vout); output += 8; - } - - assert(c % 2 == 0); - // Remainder of 1 to 7 channels - if XNN_UNLIKELY(c != 0) { - const int16x8_t 
vi = vld1q_s16(input); input = (const int16_t*) ((uintptr_t) input + c); - const int16x8_t vw = vld1q_s16(w); - int16x4_t vout = vqdmulh_s16(vget_low_s16(vi), vget_low_s16(vw)); - if (c & (4 * sizeof(int16_t))) { - vst1_s16(output, vout); output += 4; - vout = vqdmulh_s16(vget_high_s16(vi), vget_high_s16(vw)); - } - if (c & (2 * sizeof(int16_t))) { - vst1_lane_u32((void*) output, vreinterpret_u32_s16(vout), 0); output += 2; - vout = vext_s16(vout, vout, 2); - } - if (c & (1 * sizeof(int16_t))) { - vst1_lane_s16(output, vout, 0); output += 1; - } - } - - } while (--rows != 0); -} diff --git a/src/s16-window/gen/s16-window-shift15-neon-u24.c b/src/s16-window/gen/s16-window-shift15-neon-u24.c deleted file mode 100644 index 3c1968fb7ae..00000000000 --- a/src/s16-window/gen/s16-window-shift15-neon-u24.c +++ /dev/null @@ -1,85 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-window/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/window.h" - - -void xnn_s16_window_shift15_ukernel__neon_u24( - size_t rows, - size_t channels, - const int16_t* input, - const int16_t* weights, - int16_t* output, - uint32_t shift) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(input != NULL); - assert(weights != NULL); - assert(output != NULL); - assert(shift == 15); - - - do { - const int16_t* w = weights; - size_t c = channels; - for (; c >= 24 * sizeof(int16_t); c -= 24 * sizeof(int16_t)) { - const int16x8_t vi0 = vld1q_s16(input); input += 8; - const int16x8_t vi1 = vld1q_s16(input); input += 8; - const int16x8_t vi2 = vld1q_s16(input); input += 8; - - const int16x8_t vw0 = vld1q_s16(w); w += 8; - const int16x8_t vw1 = vld1q_s16(w); w += 8; - const int16x8_t vw2 = vld1q_s16(w); w += 8; - - const int16x8_t vout0 = vqdmulhq_s16(vi0, vw0); - const int16x8_t vout1 = vqdmulhq_s16(vi1, vw1); - const int16x8_t vout2 = vqdmulhq_s16(vi2, vw2); - - vst1q_s16(output, vout0); output += 8; - vst1q_s16(output, vout1); output += 8; - vst1q_s16(output, vout2); output += 8; - } - - // Remainder of full vectors - for (; c >= 8 * sizeof(int16_t); c -= 8 * sizeof(int16_t)) { - const int16x8_t vi = vld1q_s16(input); input += 8; - const int16x8_t vw = vld1q_s16(w); w += 8; - const int16x8_t vout = vqdmulhq_s16(vi, vw); - vst1q_s16(output, vout); output += 8; - } - - assert(c % 2 == 0); - // Remainder of 1 to 7 channels - if XNN_UNLIKELY(c != 0) { - const int16x8_t vi = vld1q_s16(input); input = (const int16_t*) ((uintptr_t) input + c); - const int16x8_t vw = vld1q_s16(w); - int16x4_t vout = vqdmulh_s16(vget_low_s16(vi), vget_low_s16(vw)); - if (c & (4 * sizeof(int16_t))) { - vst1_s16(output, vout); output += 4; - vout = vqdmulh_s16(vget_high_s16(vi), vget_high_s16(vw)); - } - if (c & (2 * sizeof(int16_t))) { - vst1_lane_u32((void*) output, vreinterpret_u32_s16(vout), 0); output += 2; - vout = vext_s16(vout, vout, 
2); - } - if (c & (1 * sizeof(int16_t))) { - vst1_lane_s16(output, vout, 0); output += 1; - } - } - - } while (--rows != 0); -} diff --git a/src/s16-window/gen/s16-window-shift15-neon-u32.c b/src/s16-window/gen/s16-window-shift15-neon-u32.c deleted file mode 100644 index 9d205a1d11a..00000000000 --- a/src/s16-window/gen/s16-window-shift15-neon-u32.c +++ /dev/null @@ -1,89 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-window/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/window.h" - - -void xnn_s16_window_shift15_ukernel__neon_u32( - size_t rows, - size_t channels, - const int16_t* input, - const int16_t* weights, - int16_t* output, - uint32_t shift) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(input != NULL); - assert(weights != NULL); - assert(output != NULL); - assert(shift == 15); - - - do { - const int16_t* w = weights; - size_t c = channels; - for (; c >= 32 * sizeof(int16_t); c -= 32 * sizeof(int16_t)) { - const int16x8_t vi0 = vld1q_s16(input); input += 8; - const int16x8_t vi1 = vld1q_s16(input); input += 8; - const int16x8_t vi2 = vld1q_s16(input); input += 8; - const int16x8_t vi3 = vld1q_s16(input); input += 8; - - const int16x8_t vw0 = vld1q_s16(w); w += 8; - const int16x8_t vw1 = vld1q_s16(w); w += 8; - const int16x8_t vw2 = vld1q_s16(w); w += 8; - const int16x8_t vw3 = vld1q_s16(w); w += 8; - - const int16x8_t vout0 = vqdmulhq_s16(vi0, vw0); - const int16x8_t vout1 = vqdmulhq_s16(vi1, vw1); - const int16x8_t vout2 = vqdmulhq_s16(vi2, vw2); - const int16x8_t vout3 = vqdmulhq_s16(vi3, vw3); - - vst1q_s16(output, vout0); output += 8; - vst1q_s16(output, vout1); output += 8; - vst1q_s16(output, vout2); output += 8; - vst1q_s16(output, vout3); output += 8; - 
} - - // Remainder of full vectors - for (; c >= 8 * sizeof(int16_t); c -= 8 * sizeof(int16_t)) { - const int16x8_t vi = vld1q_s16(input); input += 8; - const int16x8_t vw = vld1q_s16(w); w += 8; - const int16x8_t vout = vqdmulhq_s16(vi, vw); - vst1q_s16(output, vout); output += 8; - } - - assert(c % 2 == 0); - // Remainder of 1 to 7 channels - if XNN_UNLIKELY(c != 0) { - const int16x8_t vi = vld1q_s16(input); input = (const int16_t*) ((uintptr_t) input + c); - const int16x8_t vw = vld1q_s16(w); - int16x4_t vout = vqdmulh_s16(vget_low_s16(vi), vget_low_s16(vw)); - if (c & (4 * sizeof(int16_t))) { - vst1_s16(output, vout); output += 4; - vout = vqdmulh_s16(vget_high_s16(vi), vget_high_s16(vw)); - } - if (c & (2 * sizeof(int16_t))) { - vst1_lane_u32((void*) output, vreinterpret_u32_s16(vout), 0); output += 2; - vout = vext_s16(vout, vout, 2); - } - if (c & (1 * sizeof(int16_t))) { - vst1_lane_s16(output, vout, 0); output += 1; - } - } - - } while (--rows != 0); -} diff --git a/src/s16-window/gen/s16-window-shift15-neon-u8.c b/src/s16-window/gen/s16-window-shift15-neon-u8.c deleted file mode 100644 index 09e296de3f3..00000000000 --- a/src/s16-window/gen/s16-window-shift15-neon-u8.c +++ /dev/null @@ -1,68 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/s16-window/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/window.h" - - -void xnn_s16_window_shift15_ukernel__neon_u8( - size_t rows, - size_t channels, - const int16_t* input, - const int16_t* weights, - int16_t* output, - uint32_t shift) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(input != NULL); - assert(weights != NULL); - assert(output != NULL); - assert(shift == 15); - - - do { - const int16_t* w = weights; - size_t c = channels; - - // Remainder of full vectors - for (; c >= 8 * sizeof(int16_t); c -= 8 * sizeof(int16_t)) { - const int16x8_t vi = vld1q_s16(input); input += 8; - const int16x8_t vw = vld1q_s16(w); w += 8; - const int16x8_t vout = vqdmulhq_s16(vi, vw); - vst1q_s16(output, vout); output += 8; - } - - assert(c % 2 == 0); - // Remainder of 1 to 7 channels - if XNN_UNLIKELY(c != 0) { - const int16x8_t vi = vld1q_s16(input); input = (const int16_t*) ((uintptr_t) input + c); - const int16x8_t vw = vld1q_s16(w); - int16x4_t vout = vqdmulh_s16(vget_low_s16(vi), vget_low_s16(vw)); - if (c & (4 * sizeof(int16_t))) { - vst1_s16(output, vout); output += 4; - vout = vqdmulh_s16(vget_high_s16(vi), vget_high_s16(vw)); - } - if (c & (2 * sizeof(int16_t))) { - vst1_lane_u32((void*) output, vreinterpret_u32_s16(vout), 0); output += 2; - vout = vext_s16(vout, vout, 2); - } - if (c & (1 * sizeof(int16_t))) { - vst1_lane_s16(output, vout, 0); output += 1; - } - } - - } while (--rows != 0); -} diff --git a/src/s16-window/neon.c.in b/src/s16-window/neon.c.in deleted file mode 100644 index 68463df2a99..00000000000 --- a/src/s16-window/neon.c.in +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -$assert CHANNEL_TILE % 8 == 0 -$assert CHANNEL_TILE >= 8 -$SIMD_TILE = CHANNEL_TILE // 8 -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/window.h" - -$SHIFT_VARIANT = "_shift%s" % SHIFT if SHIFT else "" - -void xnn_s16_window${SHIFT_VARIANT}_ukernel__neon_u${CHANNEL_TILE}( - size_t rows, - size_t channels, - const int16_t* input, - const int16_t* weights, - int16_t* output, - uint32_t shift) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(input != NULL); - assert(weights != NULL); - assert(output != NULL); - $if SHIFT != 0: - assert(shift == ${SHIFT}); - $else: - assert(shift < 32); - - $if SHIFT == 0: - const int32x4_t vshift = vdupq_n_s32(-(int32_t)shift); // negative to shift right. - - do { - const int16_t* w = weights; - size_t c = channels; - $if CHANNEL_TILE > 8: - for (; c >= ${CHANNEL_TILE} * sizeof(int16_t); c -= ${CHANNEL_TILE} * sizeof(int16_t)) { - $for N in range(SIMD_TILE): - const int16x8_t vi${N} = vld1q_s16(input); input += 8; - - $for N in range(SIMD_TILE): - const int16x8_t vw${N} = vld1q_s16(w); w += 8; - - $if SHIFT == 15: - $for N in range(SIMD_TILE): - const int16x8_t vout${N} = vqdmulhq_s16(vi${N}, vw${N}); - $else: - $for N in range(SIMD_TILE): - int32x4_t vacc${N}_lo = vmull_s16(vget_low_s16(vi${N}), vget_low_s16(vw${N})); - int32x4_t vacc${N}_hi = vmull_s16(vget_high_s16(vi${N}), vget_high_s16(vw${N})); - - $if SHIFT != 0: - $for N in range(SIMD_TILE): - const int16x4_t vshift${N}_lo = vqshrn_n_s32(vacc${N}_lo, ${SHIFT}); - const int16x4_t vshift${N}_hi = vqshrn_n_s32(vacc${N}_hi, ${SHIFT}); - - $for N in range(SIMD_TILE): - const int16x8_t vout${N} = vcombine_s16(vshift${N}_lo, vshift${N}_hi); - $else: - $for N in range(SIMD_TILE): - vacc${N}_lo = vshlq_s32(vacc${N}_lo, vshift); - vacc${N}_hi = vshlq_s32(vacc${N}_hi, vshift); - - $for N in range(SIMD_TILE): - const int16x8_t vout${N} = vcombine_s16(vqmovn_s32(vacc${N}_lo), vqmovn_s32(vacc${N}_hi)); - - $for N in 
range(SIMD_TILE): - vst1q_s16(output, vout${N}); output += 8; - } - - // Remainder of full vectors - for (; c >= 8 * sizeof(int16_t); c -= 8 * sizeof(int16_t)) { - const int16x8_t vi = vld1q_s16(input); input += 8; - const int16x8_t vw = vld1q_s16(w); w += 8; - $if SHIFT == 15: - const int16x8_t vout = vqdmulhq_s16(vi, vw); - $else: - int32x4_t vacc_lo = vmull_s16(vget_low_s16(vi), vget_low_s16(vw)); - int32x4_t vacc_hi = vmull_s16(vget_high_s16(vi), vget_high_s16(vw)); - $if SHIFT != 0: - const int16x4_t vshift_lo = vqshrn_n_s32(vacc_lo, ${SHIFT}); - const int16x4_t vshift_hi = vqshrn_n_s32(vacc_hi, ${SHIFT}); - const int16x8_t vout = vcombine_s16(vshift_lo, vshift_hi); - $else: - vacc_lo = vshlq_s32(vacc_lo, vshift); - vacc_hi = vshlq_s32(vacc_hi, vshift); - const int16x8_t vout = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); - vst1q_s16(output, vout); output += 8; - } - - assert(c % 2 == 0); - // Remainder of 1 to 7 channels - if XNN_UNLIKELY(c != 0) { - const int16x8_t vi = vld1q_s16(input); input = (const int16_t*) ((uintptr_t) input + c); - const int16x8_t vw = vld1q_s16(w); - $if SHIFT == 15: - int16x4_t vout = vqdmulh_s16(vget_low_s16(vi), vget_low_s16(vw)); - $else: - int32x4_t vacc = vmull_s16(vget_low_s16(vi), vget_low_s16(vw)); - $if SHIFT != 0: - int16x4_t vout = vqshrn_n_s32(vacc, ${SHIFT}); - $else: - vacc = vshlq_s32(vacc, vshift); - int16x4_t vout = vqmovn_s32(vacc); - if (c & (4 * sizeof(int16_t))) { - vst1_s16(output, vout); output += 4; - $if SHIFT == 15: - vout = vqdmulh_s16(vget_high_s16(vi), vget_high_s16(vw)); - $else: - vacc = vmull_s16(vget_high_s16(vi), vget_high_s16(vw)); - $if SHIFT != 0: - vout = vqshrn_n_s32(vacc, ${SHIFT}); - $else: - vacc = vshlq_s32(vacc, vshift); - vout = vqmovn_s32(vacc); - } - if (c & (2 * sizeof(int16_t))) { - vst1_lane_u32((void*) output, vreinterpret_u32_s16(vout), 0); output += 2; - vout = vext_s16(vout, vout, 2); - } - if (c & (1 * sizeof(int16_t))) { - vst1_lane_s16(output, vout, 0); output += 1; 
- } - } - - } while (--rows != 0); -} diff --git a/src/s16-window/scalar.c.in b/src/s16-window/scalar.c.in deleted file mode 100644 index 8115c3a54ff..00000000000 --- a/src/s16-window/scalar.c.in +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert CHANNEL_TILE >= 1 -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/window.h" - - -void xnn_s16_window_ukernel__scalar_u${CHANNEL_TILE}( - size_t rows, - size_t channels, - const int16_t* input, - const int16_t* weights, - int16_t* output, - uint32_t shift) -{ - assert(rows > 0); - assert(channels != 0); - assert(input != NULL); - assert(weights != NULL); - assert(output != NULL); - assert(shift < 32); - - do { - size_t c = channels; - const int16_t* w = weights; - $if CHANNEL_TILE == 1: - do { - const int32_t vi = (int32_t) *input++; - const int32_t vw = (int32_t) *w++; - int32_t vout = vi * vw; - vout = math_asr_s32(vout, shift); - vout = math_max_s32(vout, INT16_MIN); - vout = math_min_s32(vout, INT16_MAX); - *output++ = (int16_t) vout; - c -= sizeof(int16_t); - } while (c != 0); - $else: - for (; c >= ${CHANNEL_TILE} * sizeof(int16_t); c -= ${CHANNEL_TILE} * sizeof(int16_t)) { - $for C in range(CHANNEL_TILE): - const int16_t vi${C} = input[${C}]; - input += ${CHANNEL_TILE}; - - $for C in range(CHANNEL_TILE): - const int16_t w${C} = w[${C}]; - w += ${CHANNEL_TILE}; - - $for C in range(CHANNEL_TILE): - int32_t vout${C} = (int32_t) vi${C} * (int32_t) w${C}; - - $for C in range(CHANNEL_TILE): - vout${C} = math_asr_s32(vout${C}, shift); - - $for C in range(CHANNEL_TILE): - vout${C} = math_max_s32(vout${C}, INT16_MIN); - - $for C in range(CHANNEL_TILE): - vout${C} = math_min_s32(vout${C}, INT16_MAX); - - $for C in range(CHANNEL_TILE): - output[${C}] = (int16_t) vout${C}; - - output += ${CHANNEL_TILE}; - } - $if CHANNEL_TILE == 2: - if 
XNN_UNLIKELY(c != 0) { - assert(c == sizeof(int16_t)); - const int32_t vi = (int32_t) *input++; - const int32_t vw = (int32_t) *w; - int32_t vout = vi * vw; - vout = math_asr_s32(vout, shift); - vout = math_max_s32(vout, INT16_MIN); - vout = math_min_s32(vout, INT16_MAX); - *output++ = (int16_t) vout; - } - $else: - if XNN_UNLIKELY(c != 0) { - do { - const int32_t vi = (int32_t) *input++; - const int32_t vw = (int32_t) *w++; - int32_t vout = vi * vw; - vout = math_asr_s32(vout, shift); - vout = math_max_s32(vout, INT16_MIN); - vout = math_min_s32(vout, INT16_MAX); - *output++ = (int16_t) vout; - c -= sizeof(int16_t); - } while (c != 0); - } - } while (--rows != 0); -} diff --git a/src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-neon-x1.c b/src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-neon-x1.c deleted file mode 100644 index b499bda145e..00000000000 --- a/src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-neon-x1.c +++ /dev/null @@ -1,62 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/u32-filterbank-accumulate/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/filterbank.h" - - -void xnn_u32_filterbank_accumulate_ukernel__neon_x1( - size_t rows, - const uint32_t* input, - const uint8_t* weight_widths, - const uint16_t* weights, - uint64_t* output) { - - assert(rows != 0); - assert(input != NULL); - assert(weight_widths != NULL); - assert(weights != NULL); - assert(output != NULL); - - // Compute unweight as initial weight - size_t n = (size_t) *weight_widths++; - assert(n != 0); - uint64x2_t weight_accumulator = vdupq_n_u64(0); - - do { - const uint32x2_t vi = vld1_dup_u32(input); input += 1; - const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2; - const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw)); - - weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi); - } while (--n != 0); - - do { - size_t n = (size_t) *weight_widths++; - assert(n != 0); - weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0)); - - do { - const uint32x2_t vi = vld1_dup_u32(input); input += 1; - const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2; - const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw)); - - weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi); - } while (--n != 0); - - vst1_u64(output, vget_low_u64(weight_accumulator)); output += 1; - - } while (--rows != 0); -} diff --git a/src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-neon-x2.c b/src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-neon-x2.c deleted file mode 100644 index 444a4d5fd9c..00000000000 --- a/src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-neon-x2.c +++ /dev/null @@ -1,71 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/u32-filterbank-accumulate/neon.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/filterbank.h" - - -void xnn_u32_filterbank_accumulate_ukernel__neon_x2( - size_t rows, - const uint32_t* input, - const uint8_t* weight_widths, - const uint16_t* weights, - uint64_t* output) { - - assert(rows != 0); - assert(input != NULL); - assert(weight_widths != NULL); - assert(weights != NULL); - assert(output != NULL); - - // Compute unweight as initial weight - size_t n = (size_t) *weight_widths++; - assert(n != 0); - uint64x2_t weight_accumulator = vdupq_n_u64(0); - - do { - const uint32x2_t vi = vld1_dup_u32(input); input += 1; - const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2; - const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw)); - - weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi); - } while (--n != 0); - - do { - size_t n = (size_t) *weight_widths++; - assert(n != 0); - weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0)); - - for (; n >= 2; n -= 2) { - const uint32x2_t vi = vld1_u32(input); input += 2; - const uint16x4_t vw = vld1_u16(weights); weights += 4; - const uint32x4_t vw32 = vmovl_u16(vw); - - weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_low_u32(vw32), vi, 0); - weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_high_u32(vw32), vi, 1); - } - - if XNN_UNPREDICTABLE(n != 0) { - const uint32x2_t vi = vld1_dup_u32(input); input += 1; - const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2; - const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw)); - - weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi); - } - - vst1_u64(output, 
vget_low_u64(weight_accumulator)); output += 1; - - } while (--rows != 0); -} diff --git a/src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-scalar-x1.c b/src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-scalar-x1.c deleted file mode 100644 index 83d4b280019..00000000000 --- a/src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-scalar-x1.c +++ /dev/null @@ -1,70 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/u32-filterbank-accumulate/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/filterbank.h" -#include "xnnpack/math.h" - - -void xnn_u32_filterbank_accumulate_ukernel__scalar_x1( - size_t rows, - const uint32_t* input, - const uint8_t* weight_widths, - const uint16_t* weights, - uint64_t* output) { - - assert(rows != 0); - assert(input != NULL); - assert(weight_widths != NULL); - assert(weights != NULL); - assert(output != NULL); - - uint64_t weight_accumulator = 0; - uint64_t unweight_accumulator = 0; - - // compute unweight as initial weight - size_t n = (size_t) *weight_widths++; - assert(n != 0); - do { - const uint32_t vi = *input++; - const uint32_t vu = (uint32_t) weights[1]; // unweight - weights += 2; - - const uint64_t vuacc = math_mulext_u32(vi, vu); - - weight_accumulator += vuacc; - - } while (--n != 0); - - do { - size_t n = (size_t) *weight_widths++; - assert(n != 0); - do { - const uint32_t vi = *input++; - const uint32_t vw = (uint32_t) weights[0]; // weight - const uint32_t vu = (uint32_t) weights[1]; // unweight - weights += 2; - - const uint64_t vwacc = math_mulext_u32(vi, vw); - const uint64_t vuacc = math_mulext_u32(vi, vu); - - weight_accumulator += vwacc; - unweight_accumulator += vuacc; - - } while (--n != 0); - - *output++ = weight_accumulator; - weight_accumulator = 
unweight_accumulator; - unweight_accumulator = 0; - - } while (--rows != 0); -} diff --git a/src/u32-filterbank-accumulate/neon.c.in b/src/u32-filterbank-accumulate/neon.c.in deleted file mode 100644 index 7d16becb1a4..00000000000 --- a/src/u32-filterbank-accumulate/neon.c.in +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert BATCH_TILE in [1, 2] -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/filterbank.h" - - -void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}( - size_t rows, - const uint32_t* input, - const uint8_t* weight_widths, - const uint16_t* weights, - uint64_t* output) { - - assert(rows != 0); - assert(input != NULL); - assert(weight_widths != NULL); - assert(weights != NULL); - assert(output != NULL); - - // Compute unweight as initial weight - size_t n = (size_t) *weight_widths++; - assert(n != 0); - uint64x2_t weight_accumulator = vdupq_n_u64(0); - - do { - const uint32x2_t vi = vld1_dup_u32(input); input += 1; - const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2; - const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw)); - - weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi); - } while (--n != 0); - - do { - size_t n = (size_t) *weight_widths++; - assert(n != 0); - weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0)); - - $if BATCH_TILE == 2: - for (; n >= 2; n -= 2) { - const uint32x2_t vi = vld1_u32(input); input += 2; - const uint16x4_t vw = vld1_u16(weights); weights += 4; - const uint32x4_t vw32 = vmovl_u16(vw); - - weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_low_u32(vw32), vi, 0); - weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_high_u32(vw32), vi, 1); - } - - if XNN_UNPREDICTABLE(n != 0) { - const uint32x2_t vi = 
vld1_dup_u32(input); input += 1; - const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2; - const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw)); - - weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi); - } - $elif BATCH_TILE == 1: - do { - const uint32x2_t vi = vld1_dup_u32(input); input += 1; - const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2; - const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw)); - - weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi); - } while (--n != 0); - - vst1_u64(output, vget_low_u64(weight_accumulator)); output += 1; - - } while (--rows != 0); -} diff --git a/src/u32-filterbank-accumulate/scalar.c.in b/src/u32-filterbank-accumulate/scalar.c.in deleted file mode 100644 index aa6f4629717..00000000000 --- a/src/u32-filterbank-accumulate/scalar.c.in +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -$assert BATCH_TILE == 1 -#include -#include -#include - -#include "xnnpack/filterbank.h" -#include "xnnpack/math.h" - - -void xnn_u32_filterbank_accumulate_ukernel__scalar_x${BATCH_TILE}( - size_t rows, - const uint32_t* input, - const uint8_t* weight_widths, - const uint16_t* weights, - uint64_t* output) { - - assert(rows != 0); - assert(input != NULL); - assert(weight_widths != NULL); - assert(weights != NULL); - assert(output != NULL); - - uint64_t weight_accumulator = 0; - uint64_t unweight_accumulator = 0; - - // compute unweight as initial weight - size_t n = (size_t) *weight_widths++; - assert(n != 0); - do { - const uint32_t vi = *input++; - const uint32_t vu = (uint32_t) weights[1]; // unweight - weights += 2; - - const uint64_t vuacc = math_mulext_u32(vi, vu); - - weight_accumulator += vuacc; - - } while (--n != 0); - - do { - size_t n = (size_t) *weight_widths++; - assert(n != 0); - do { - const uint32_t vi = *input++; - const uint32_t vw = (uint32_t) weights[0]; // weight - const uint32_t vu = (uint32_t) weights[1]; // unweight - weights += 2; - - const uint64_t vwacc = math_mulext_u32(vi, vw); - const uint64_t vuacc = math_mulext_u32(vi, vu); - - weight_accumulator += vwacc; - unweight_accumulator += vuacc; - - } while (--n != 0); - - *output++ = weight_accumulator; - weight_accumulator = unweight_accumulator; - unweight_accumulator = 0; - - } while (--rows != 0); -} diff --git a/src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-arm-x1.S b/src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-arm-x1.S deleted file mode 100644 index f67f045a0cf..00000000000 --- a/src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-arm-x1.S +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include "xnnpack/assembly.h" - -.syntax unified - -// void xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_arm_x1( -// size_t rows, r0 -// const uint32_t* input, r1 -// const uint8_t* weight_widths, r2 -// const uint16_t* weights, r3 -// uint64_t* output) sp -> r12 - -// d8-d15, r12-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved. - -// Register usage -// input r1 r6 -// weights r3 r5 r7 -// weight_accumulator r12 r8 r9 -// unweight_accumulator r10 r11 -// weight_widths r2 r4 - -BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_arm_x1 - .arm -#ifndef __APPLE__ - .arch armv7-a - .fpu neon -#endif - LDR r12, [sp] // output - ADD r3, r3, 2 // advance weights pointer to unweight - PUSH {r4,r5,r6,r7,r8,r9,r10,r11} // push 32 bytes - MOV r8, 0 // weight_accumulator - MOV r9, 0 - - // Compute unweight as initial weight - LDRB r4, [r2], 1 // weight_widths -0: - LDRH r5, [r3], 4 // unweight - LDR r6, [r1], 4 // input - SUBS r4, r4, 1 - UMLAL r8, r9, r6, r5 // initial weight_accumulator - BHI 0b - - SUBS r0, r0, 1 - SUB r3, r3, 2 // rewind weights pointer to weight - - BLS 3f - -1: - LDRB r4, [r2], 1 // weight_widths - MOV r10, 0 // unweight_accumulator - MOV r11, 0 - -2: - LDR r5, [r3], 4 // weight+unweight - LDR r6, [r1], 4 // input - SUBS r4, r4, 1 - UXTH r7, r5 // weight - UXTH r5, r5, ror #16 // unweight - UMLAL r8, r9, r6, r7 // weight_accumulator - UMLAL r10, r11, r6, r5 // unweight_accumulator - BHI 2b - - STMIA r12!, {r8, r9} - SUBS r0, r0, 1 - MOV r8, r10 // weight_accumulator = unweight_accumulator - MOV r9, r11 - BHI 1b - -3: - // Final row only compute weight - LDRB r4, [r2] // last weight_widths -4: - LDRH r5, [r3], 4 // weight - LDR r6, [r1], 4 // input - SUBS r4, r4, 1 - UMLAL r8, r9, r6, r5 // weight_accumulator - BHI 4b - - STMIA r12!, {r8, r9} - - POP {r4,r5,r6,r7,r8,r9,r10,r11} - BX lr - -END_FUNCTION xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_arm_x1 - -#ifdef __ELF__ -.section 
".note.GNU-stack","",%progbits -#endif diff --git a/src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x1.S b/src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x1.S deleted file mode 100644 index 86415da478d..00000000000 --- a/src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x1.S +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "xnnpack/assembly.h" - -.syntax unified - -// void xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_neon_x1( -// size_t rows, r0 -// const uint32_t* input, r1 -// const uint8_t* weight_widths, r2 -// const uint16_t* weights, r3 -// uint64_t* output) sp -> r12 - -// d8-d15, r12-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved. - -// Register usage -// input r1 d2 -// weights r3 d3 d4 d5 -// output r12 d0 d1 -// weight_widths r2 r4 - -BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_neon_x1 - .arm -#ifndef __APPLE__ - .arch armv7-a - .fpu neon -#endif - LDR r12, [sp] // output - PUSH {r4,lr} // push 8 bytes - - VMOV.U8 d0, 0 // weight_accumulator - - // Compute unweight as initial weight - LDRB r4, [r2], 1 // weight_widths - VMOV.U8 d1, 0 // unweight_accumulator -0: - VLD1.32 {d3[]}, [r3]! // weight+unweight - VLD1.32 {d2[]}, [r1]! // input - SUBS r4, r4, 1 - VMOVL.U16 q2, d3 - VMLAL.U32 q0, d2, d4[1] // unweight - BHI 0b - - SUBS r0, r0, 1 - BLS 3f - -1: - LDRB r4, [r2], 1 // weight_widths - VMOV.U8 d1, 0 // unweight_accumulator -2: - VLD1.32 {d3[]}, [r3]! // weight+unweight - VLD1.32 {d2[]}, [r1]! // input - SUBS r4, r4, 1 - VMOVL.U16 q2, d3 - VMLAL.U32 q0, d4, d2 - BHI 2b - - VST1.64 {d0}, [r12]! - SUBS r0, r0, 1 - VMOV d0, d1 - BNE 1b - -3: - // Final row only compute weight - LDRB r4, [r2], 1 // weight_widths -4: - VLD1.32 {d3[]}, [r3]! 
// weight+unweight - VLD1.32 {d2[]}, [r1]! // input - SUBS r4, r4, 1 - VMOVL.U16 q2, d3 - VMLAL.U32 q0, d2, d4[0] // weight - BHI 4b - - VST1.64 {d0}, [r12]! - - POP {r4,pc} - -END_FUNCTION xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_neon_x1 - -#ifdef __ELF__ -.section ".note.GNU-stack","",%progbits -#endif diff --git a/src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x2.S b/src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x2.S deleted file mode 100644 index ed6fdb180d6..00000000000 --- a/src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x2.S +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "xnnpack/assembly.h" - -.syntax unified - -// void xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_neon_x2( -// size_t rows, r0 -// const uint32_t* input, r1 -// const uint8_t* weight_widths, r2 -// const uint16_t* weights, r3 -// uint64_t* output) sp -> r12 - -// d8-d15, r12-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved. - -// Register usage -// input r1 d2 -// weights r3 d3 d4 d5 -// output r12 d0 d1 -// weight_widths r2 r4 - -BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_neon_x2 - .arm -#ifndef __APPLE__ - .arch armv7-a - .fpu neon -#endif - LDR r12, [sp] // output - VMOV.U8 d0, 0 // weight_accumulator - PUSH {r4,lr} // push 8 bytes - - // Compute unweight as initial weight - LDRB r4, [r2], 1 // weight_widths - VMOV.U8 d1, 0 // unweight_accumulator -0: - VLD1.32 {d3[]}, [r3]! // weight+unweight - VLD1.32 {d2[]}, [r1]! // input - SUBS r4, r4, 1 - VMOVL.U16 q2, d3 - VMLAL.U32 q0, d2, d4[1] // unweight - BHI 0b - -1: - LDRB r4, [r2], 1 // weight_widths - SUBS r4, r4, 1 - VMOV.U8 d1, 0 // unweight_accumulator - BLS 3f // less than 2 weights? - -2: - VLD1.16 {d3}, [r3]! 
// weights - VLD1.32 {d2}, [r1]! // input - SUBS r4, r4, 2 - VMOVL.U16 q2, d3 - VMLAL.U32 q0, d4, d2[0] - VMLAL.U32 q0, d5, d2[1] - BHI 2b - - BLO 4f // is there a remainder? -3: - VLD1.32 {d3[]}, [r3]! // weights - VLD1.32 {d2[]}, [r1]! // input - VMOVL.U16 q2, d3 - VMLAL.U32 q0, d4, d2 - -4: - VST1.64 {d0}, [r12]! - SUBS r0, r0, 1 - VMOV d0, d1 - BNE 1b - - POP {r4,pc} - -END_FUNCTION xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_neon_x2 - -#ifdef __ELF__ -.section ".note.GNU-stack","",%progbits -#endif diff --git a/src/u32-filterbank-subtract/u32-filterbank-subtract-scalar-x2.c b/src/u32-filterbank-subtract/u32-filterbank-subtract-scalar-x2.c deleted file mode 100644 index d51240f9074..00000000000 --- a/src/u32-filterbank-subtract/u32-filterbank-subtract-scalar-x2.c +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/filterbank.h" - - -void xnn_u32_filterbank_subtract_ukernel__scalar_x2( - size_t batch_size, - const uint32_t* input, - uint32_t smoothing, - uint32_t alternate_smoothing, - uint32_t one_minus_smoothing, - uint32_t alternate_one_minus_smoothing, - uint32_t min_signal_remaining, - uint32_t smoothing_bits, /* 0 in FE */ - uint32_t spectral_subtraction_bits, /* 14 in FE */ - uint32_t* noise_estimate, - uint32_t* output) { - - assert(batch_size != 0); - assert(batch_size % 2 == 0); - assert(input != NULL); - assert(output != NULL); - assert(noise_estimate != NULL); - - batch_size >>= 1; /* 48 in FE */ - - do { - const uint32_t vinput0 = input[0]; - const uint32_t vinput1 = input[1]; - input += 2; - - uint32_t vnoise_estimate0 = noise_estimate[0]; - uint32_t vnoise_estimate1 = noise_estimate[1]; - - // Scale up signal for smoothing filter computation. 
- const uint32_t vsignal_scaled_up0 = vinput0 << smoothing_bits; - const uint32_t vsignal_scaled_up1 = vinput1 << smoothing_bits; - - vnoise_estimate0 = (uint32_t) ((math_mulext_u32(vsignal_scaled_up0, smoothing) + - math_mulext_u32(vnoise_estimate0, one_minus_smoothing)) >> spectral_subtraction_bits); - vnoise_estimate1 = (uint32_t) ((math_mulext_u32(vsignal_scaled_up1, alternate_smoothing) + - math_mulext_u32(vnoise_estimate1, alternate_one_minus_smoothing)) >> spectral_subtraction_bits); - - noise_estimate[0] = vnoise_estimate0; - noise_estimate[1] = vnoise_estimate1; - noise_estimate += 2; - - const uint32_t vfloor0 = (uint32_t) (math_mulext_u32(vinput0, min_signal_remaining) >> spectral_subtraction_bits); - const uint32_t vfloor1 = (uint32_t) (math_mulext_u32(vinput1, min_signal_remaining) >> spectral_subtraction_bits); - const uint32_t vsubtracted0 = math_doz_u32(vsignal_scaled_up0, vnoise_estimate0) >> smoothing_bits; - const uint32_t vsubtracted1 = math_doz_u32(vsignal_scaled_up1, vnoise_estimate1) >> smoothing_bits; - const uint32_t vout0 = math_max_u32(vsubtracted0, vfloor0); - const uint32_t vout1 = math_max_u32(vsubtracted1, vfloor1); - - output[0] = vout0; - output[1] = vout1; - output += 2; - - } while (--batch_size != 0); -} diff --git a/src/u32-vlog/gen/u32-vlog-scalar-x1.c b/src/u32-vlog/gen/u32-vlog-scalar-x1.c deleted file mode 100644 index 5e161628699..00000000000 --- a/src/u32-vlog/gen/u32-vlog-scalar-x1.c +++ /dev/null @@ -1,42 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/u32-vlog/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/vlog.h" - - -void xnn_u32_vlog_ukernel__scalar_x1( - size_t batch, - const uint32_t* input, - uint32_t input_lshift, - uint32_t output_scale, - uint16_t* output) { - - assert(batch != 0); - assert(input != NULL); - assert(input_lshift < 32); - assert(output != NULL); - - - if XNN_UNLIKELY(batch != 0) { - do { - const uint32_t vi = *input++; - const uint32_t scaled = vi << input_lshift; - - const uint32_t log_value = XNN_LIKELY(scaled != 0) ? math_u32_log32(scaled, output_scale) : 0; - - const uint32_t vout = math_min_u32(log_value, (uint32_t) INT16_MAX); - *output++ = (uint16_t) vout; - } while (--batch != 0); - } -} diff --git a/src/u32-vlog/gen/u32-vlog-scalar-x2.c b/src/u32-vlog/gen/u32-vlog-scalar-x2.c deleted file mode 100644 index 177639c805b..00000000000 --- a/src/u32-vlog/gen/u32-vlog-scalar-x2.c +++ /dev/null @@ -1,61 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/u32-vlog/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/vlog.h" - - -void xnn_u32_vlog_ukernel__scalar_x2( - size_t batch, - const uint32_t* input, - uint32_t input_lshift, - uint32_t output_scale, - uint16_t* output) { - - assert(batch != 0); - assert(input != NULL); - assert(input_lshift < 32); - assert(output != NULL); - - for (; batch >= 2; batch -= 2) { - const uint32_t vi0 = input[0]; - const uint32_t vi1 = input[1]; - input += 2; - - const uint32_t scaled0 = vi0 << input_lshift; - const uint32_t scaled1 = vi1 << input_lshift; - - const uint32_t log_value0 = XNN_LIKELY(scaled0 != 0) ? 
math_u32_log32(scaled0, output_scale) : 0; - - const uint32_t vout0 = math_min_u32(log_value0, (uint32_t) INT16_MAX); // signed max value - output[0] = (uint16_t) vout0; - const uint32_t log_value1 = XNN_LIKELY(scaled1 != 0) ? math_u32_log32(scaled1, output_scale) : 0; - - const uint32_t vout1 = math_min_u32(log_value1, (uint32_t) INT16_MAX); // signed max value - output[1] = (uint16_t) vout1; - - output += 2; - } - - if XNN_UNLIKELY(batch != 0) { - do { - const uint32_t vi = *input++; - const uint32_t scaled = vi << input_lshift; - - const uint32_t log_value = XNN_LIKELY(scaled != 0) ? math_u32_log32(scaled, output_scale) : 0; - - const uint32_t vout = math_min_u32(log_value, (uint32_t) INT16_MAX); - *output++ = (uint16_t) vout; - } while (--batch != 0); - } -} diff --git a/src/u32-vlog/gen/u32-vlog-scalar-x3.c b/src/u32-vlog/gen/u32-vlog-scalar-x3.c deleted file mode 100644 index 301caf7e74a..00000000000 --- a/src/u32-vlog/gen/u32-vlog-scalar-x3.c +++ /dev/null @@ -1,67 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/u32-vlog/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/vlog.h" - - -void xnn_u32_vlog_ukernel__scalar_x3( - size_t batch, - const uint32_t* input, - uint32_t input_lshift, - uint32_t output_scale, - uint16_t* output) { - - assert(batch != 0); - assert(input != NULL); - assert(input_lshift < 32); - assert(output != NULL); - - for (; batch >= 3; batch -= 3) { - const uint32_t vi0 = input[0]; - const uint32_t vi1 = input[1]; - const uint32_t vi2 = input[2]; - input += 3; - - const uint32_t scaled0 = vi0 << input_lshift; - const uint32_t scaled1 = vi1 << input_lshift; - const uint32_t scaled2 = vi2 << input_lshift; - - const uint32_t log_value0 = XNN_LIKELY(scaled0 != 0) ? 
math_u32_log32(scaled0, output_scale) : 0; - - const uint32_t vout0 = math_min_u32(log_value0, (uint32_t) INT16_MAX); // signed max value - output[0] = (uint16_t) vout0; - const uint32_t log_value1 = XNN_LIKELY(scaled1 != 0) ? math_u32_log32(scaled1, output_scale) : 0; - - const uint32_t vout1 = math_min_u32(log_value1, (uint32_t) INT16_MAX); // signed max value - output[1] = (uint16_t) vout1; - const uint32_t log_value2 = XNN_LIKELY(scaled2 != 0) ? math_u32_log32(scaled2, output_scale) : 0; - - const uint32_t vout2 = math_min_u32(log_value2, (uint32_t) INT16_MAX); // signed max value - output[2] = (uint16_t) vout2; - - output += 3; - } - - if XNN_UNLIKELY(batch != 0) { - do { - const uint32_t vi = *input++; - const uint32_t scaled = vi << input_lshift; - - const uint32_t log_value = XNN_LIKELY(scaled != 0) ? math_u32_log32(scaled, output_scale) : 0; - - const uint32_t vout = math_min_u32(log_value, (uint32_t) INT16_MAX); - *output++ = (uint16_t) vout; - } while (--batch != 0); - } -} diff --git a/src/u32-vlog/gen/u32-vlog-scalar-x4.c b/src/u32-vlog/gen/u32-vlog-scalar-x4.c deleted file mode 100644 index 5f1cd14b43d..00000000000 --- a/src/u32-vlog/gen/u32-vlog-scalar-x4.c +++ /dev/null @@ -1,73 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/u32-vlog/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/vlog.h" - - -void xnn_u32_vlog_ukernel__scalar_x4( - size_t batch, - const uint32_t* input, - uint32_t input_lshift, - uint32_t output_scale, - uint16_t* output) { - - assert(batch != 0); - assert(input != NULL); - assert(input_lshift < 32); - assert(output != NULL); - - for (; batch >= 4; batch -= 4) { - const uint32_t vi0 = input[0]; - const uint32_t vi1 = input[1]; - const uint32_t vi2 = input[2]; - const uint32_t vi3 = input[3]; - input += 4; - - const uint32_t scaled0 = vi0 << input_lshift; - const uint32_t scaled1 = vi1 << input_lshift; - const uint32_t scaled2 = vi2 << input_lshift; - const uint32_t scaled3 = vi3 << input_lshift; - - const uint32_t log_value0 = XNN_LIKELY(scaled0 != 0) ? math_u32_log32(scaled0, output_scale) : 0; - - const uint32_t vout0 = math_min_u32(log_value0, (uint32_t) INT16_MAX); // signed max value - output[0] = (uint16_t) vout0; - const uint32_t log_value1 = XNN_LIKELY(scaled1 != 0) ? math_u32_log32(scaled1, output_scale) : 0; - - const uint32_t vout1 = math_min_u32(log_value1, (uint32_t) INT16_MAX); // signed max value - output[1] = (uint16_t) vout1; - const uint32_t log_value2 = XNN_LIKELY(scaled2 != 0) ? math_u32_log32(scaled2, output_scale) : 0; - - const uint32_t vout2 = math_min_u32(log_value2, (uint32_t) INT16_MAX); // signed max value - output[2] = (uint16_t) vout2; - const uint32_t log_value3 = XNN_LIKELY(scaled3 != 0) ? math_u32_log32(scaled3, output_scale) : 0; - - const uint32_t vout3 = math_min_u32(log_value3, (uint32_t) INT16_MAX); // signed max value - output[3] = (uint16_t) vout3; - - output += 4; - } - - if XNN_UNLIKELY(batch != 0) { - do { - const uint32_t vi = *input++; - const uint32_t scaled = vi << input_lshift; - - const uint32_t log_value = XNN_LIKELY(scaled != 0) ? 
math_u32_log32(scaled, output_scale) : 0; - - const uint32_t vout = math_min_u32(log_value, (uint32_t) INT16_MAX); - *output++ = (uint16_t) vout; - } while (--batch != 0); - } -} diff --git a/src/u32-vlog/scalar.c.in b/src/u32-vlog/scalar.c.in deleted file mode 100644 index 3a6366cf735..00000000000 --- a/src/u32-vlog/scalar.c.in +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert BATCH_TILE >= 1 -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/vlog.h" - - -void xnn_u32_vlog_ukernel__scalar_x${BATCH_TILE}( - size_t batch, - const uint32_t* input, - uint32_t input_lshift, - uint32_t output_scale, - uint16_t* output) { - - assert(batch != 0); - assert(input != NULL); - assert(input_lshift < 32); - assert(output != NULL); - - $if BATCH_TILE > 1: - for (; batch >= ${BATCH_TILE}; batch -= ${BATCH_TILE}) { - $for N in range(BATCH_TILE): - const uint32_t vi${N} = input[${N}]; - input += ${BATCH_TILE}; - - $for N in range(BATCH_TILE): - const uint32_t scaled${N} = vi${N} << input_lshift; - - $for N in range(BATCH_TILE): - const uint32_t log_value${N} = XNN_LIKELY(scaled${N} != 0) ? math_u32_log32(scaled${N}, output_scale) : 0; - - const uint32_t vout${N} = math_min_u32(log_value${N}, (uint32_t) INT16_MAX); // signed max value - output[${N}] = (uint16_t) vout${N}; - - output += ${BATCH_TILE}; - } - - if XNN_UNLIKELY(batch != 0) { - do { - const uint32_t vi = *input++; - const uint32_t scaled = vi << input_lshift; - - const uint32_t log_value = XNN_LIKELY(scaled != 0) ? 
math_u32_log32(scaled, output_scale) : 0; - - const uint32_t vout = math_min_u32(log_value, (uint32_t) INT16_MAX); - *output++ = (uint16_t) vout; - } while (--batch != 0); - } -} diff --git a/src/u64-u32-vsqrtshift/u64-u32-vsqrtshift-scalar-cvtu32-sqrt-cvtu32f64-u1.c b/src/u64-u32-vsqrtshift/u64-u32-vsqrtshift-scalar-cvtu32-sqrt-cvtu32f64-u1.c deleted file mode 100644 index 17b92d45b3b..00000000000 --- a/src/u64-u32-vsqrtshift/u64-u32-vsqrtshift-scalar-cvtu32-sqrt-cvtu32f64-u1.c +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/vunary.h" - - -void xnn_u64_u32_vsqrtshift_ukernel__scalar_cvtu32_sqrt_cvtu32f64_u1( - size_t batch, - const uint64_t* input, - uint32_t* output, - uint32_t shift) -{ - assert(batch != 0); - assert(input != NULL); - assert(output != NULL); - assert(shift < 32); - - do { - const uint64_t vx = *input++; - - uint64_t vy = vx; - const uint32_t vx_hi = (uint32_t) (vx >> 32); - const uint32_t vx_lo = (uint32_t) vx; - if XNN_LIKELY(vx != 0) { - const double vf_hi = (double) vx_hi; - const double vf_lo = (double) vx_lo; - double vf = vf_hi * 0x1.0p+32 + vf_lo; - vf = sqrt(vf); - vy = math_cvt_sat_u32_f64(vf); - #if XNN_ARCH_ARM || XNN_ARCH_X86 - const uint64_t vsquared_y_less_x = math_mulext_u32((uint32_t) vy, (uint32_t) vy) - vx; - #else - const uint64_t vsquared_y_less_x = vy * vy - vx; - #endif - if XNN_UNPREDICTABLE((int64_t) (vsquared_y_less_x + vy) < 0) { - vy += 1; - } else if XNN_UNPREDICTABLE((int64_t) (vsquared_y_less_x - vy) >= 0) { - vy -= 1; - } - } - - // Match TFLM is producing incorrect result for high 64-bit inputs - const uint32_t vy_lo = (uint32_t) vy; - const uint32_t vy_hi = (uint32_t) (vy >> 32); - uint32_t vout = vy_lo | -vy_hi; - // Match TFLM is producing incorrect result for high 32-bit 
inputs - if XNN_LIKELY(vx_hi == 0) { - if (vout == UINT32_C(0x00010000)) { - vout -= 1; - } - } - - *output++ = vout >> shift; - - batch -= sizeof(uint64_t); - } while (batch != 0); -} diff --git a/src/u64-u32-vsqrtshift/u64-u32-vsqrtshift.h b/src/u64-u32-vsqrtshift/u64-u32-vsqrtshift.h deleted file mode 100644 index ac3701b6233..00000000000 --- a/src/u64-u32-vsqrtshift/u64-u32-vsqrtshift.h +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#ifndef XNN_UKERNEL_WITH_PARAMS -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) -#define XNN_DEFINED_UKERNEL_WITH_PARAMS -#endif - -#ifndef XNN_UKERNEL -#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ - XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) -#define XNN_DEFINED_UKERNEL -#endif - -XNN_UKERNEL_WITH_PARAMS(0, xnn_u64_u32_vsqrtshift_ukernel__scalar_cvtu32_sqrt_cvtu32f64_u1, 1, false, uint64_t, uint32_t, NULL) - -#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_UKERNEL_WITH_PARAMS -#endif - -#ifdef XNN_DEFINED_UKERNEL -#undef XNN_DEFINED_UKERNEL -#undef XNN_UKERNEL -#endif diff --git a/src/xnnpack/filterbank.h b/src/xnnpack/filterbank.h deleted file mode 100644 index 594b83bcf60..00000000000 --- a/src/xnnpack/filterbank.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#pragma once - -#include -#include - -#include "xnnpack/common.h" - -#ifdef __cplusplus -extern "C" { -#endif - - -#define DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t rows, \ - const uint32_t* input, \ - const uint8_t* weight_widths, \ - const uint16_t* weights, \ - uint64_t* output); - -DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_arm_x1) -DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_neon_x1) -DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_neon_x2) -DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__neon_x1) -DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__neon_x2) -DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__scalar_x1) - - -#define DECLARE_U32_FILTERBANK_SUBTRACT_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t batch_size, \ - const uint32_t* input, \ - uint32_t smoothing, \ - uint32_t alternate_smoothing, \ - uint32_t one_minus_smoothing, \ - uint32_t alternate_one_minus_smoothing, \ - uint32_t min_signal_remaining, \ - uint32_t smoothing_bits, \ - uint32_t spectral_subtraction_bits, \ - uint32_t* noise_estimate, \ - uint32_t* output); - - -DECLARE_U32_FILTERBANK_SUBTRACT_UKERNEL_FUNCTION(xnn_u32_filterbank_subtract_ukernel__scalar_x2) - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/src/xnnpack/rmaxabs.h b/src/xnnpack/rmaxabs.h deleted file mode 100644 index 190ecadfd89..00000000000 --- a/src/xnnpack/rmaxabs.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#pragma once - -#include -#include - -#include "xnnpack/common.h" - -#ifdef __cplusplus -extern "C" { -#endif - - -#define DECLARE_S16_RMAXABS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t batch_size, \ - const int16_t* input, \ - uint16_t* output); - - -DECLARE_S16_RMAXABS_UKERNEL_FUNCTION(xnn_s16_rmaxabs_ukernel__neon_x8) -DECLARE_S16_RMAXABS_UKERNEL_FUNCTION(xnn_s16_rmaxabs_ukernel__neon_x16) -DECLARE_S16_RMAXABS_UKERNEL_FUNCTION(xnn_s16_rmaxabs_ukernel__neon_x24) -DECLARE_S16_RMAXABS_UKERNEL_FUNCTION(xnn_s16_rmaxabs_ukernel__neon_x32) - -DECLARE_S16_RMAXABS_UKERNEL_FUNCTION(xnn_s16_rmaxabs_ukernel__scalar_x1) -DECLARE_S16_RMAXABS_UKERNEL_FUNCTION(xnn_s16_rmaxabs_ukernel__scalar_x2) -DECLARE_S16_RMAXABS_UKERNEL_FUNCTION(xnn_s16_rmaxabs_ukernel__scalar_x3) -DECLARE_S16_RMAXABS_UKERNEL_FUNCTION(xnn_s16_rmaxabs_ukernel__scalar_x4) - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/src/xnnpack/vlshift.h b/src/xnnpack/vlshift.h deleted file mode 100644 index 5e8d6aa12fc..00000000000 --- a/src/xnnpack/vlshift.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#pragma once - -#include -#include - -#include "xnnpack/common.h" - -#ifdef __cplusplus -extern "C" { -#endif - - -#define DECLARE_I16_VLSHIFT_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t batch, \ - const uint16_t* input, \ - uint16_t* output, \ - uint32_t shift); - - -DECLARE_I16_VLSHIFT_UKERNEL_FUNCTION(xnn_i16_vlshift_ukernel__neon_u8) -DECLARE_I16_VLSHIFT_UKERNEL_FUNCTION(xnn_i16_vlshift_ukernel__neon_u16) -DECLARE_I16_VLSHIFT_UKERNEL_FUNCTION(xnn_i16_vlshift_ukernel__neon_u24) -DECLARE_I16_VLSHIFT_UKERNEL_FUNCTION(xnn_i16_vlshift_ukernel__neon_u32) - -DECLARE_I16_VLSHIFT_UKERNEL_FUNCTION(xnn_i16_vlshift_ukernel__scalar_u1) -DECLARE_I16_VLSHIFT_UKERNEL_FUNCTION(xnn_i16_vlshift_ukernel__scalar_u2) -DECLARE_I16_VLSHIFT_UKERNEL_FUNCTION(xnn_i16_vlshift_ukernel__scalar_u3) -DECLARE_I16_VLSHIFT_UKERNEL_FUNCTION(xnn_i16_vlshift_ukernel__scalar_u4) - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/src/xnnpack/vsquareabs.h b/src/xnnpack/vsquareabs.h deleted file mode 100644 index c621ad0fe21..00000000000 --- a/src/xnnpack/vsquareabs.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#pragma once - -#include -#include - -#include "xnnpack/common.h" - -#ifdef __cplusplus -extern "C" { -#endif - - -#define DECLARE_CS16_VSQUAREABS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t batch_size, \ - const int16_t* input, \ - uint32_t* output); - - -DECLARE_CS16_VSQUAREABS_UKERNEL_FUNCTION(xnn_cs16_vsquareabs_ukernel__scalar_x1) -DECLARE_CS16_VSQUAREABS_UKERNEL_FUNCTION(xnn_cs16_vsquareabs_ukernel__scalar_x2) -DECLARE_CS16_VSQUAREABS_UKERNEL_FUNCTION(xnn_cs16_vsquareabs_ukernel__scalar_x3) -DECLARE_CS16_VSQUAREABS_UKERNEL_FUNCTION(xnn_cs16_vsquareabs_ukernel__scalar_x4) - -DECLARE_CS16_VSQUAREABS_UKERNEL_FUNCTION(xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x4) -DECLARE_CS16_VSQUAREABS_UKERNEL_FUNCTION(xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x8) -DECLARE_CS16_VSQUAREABS_UKERNEL_FUNCTION(xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x12) -DECLARE_CS16_VSQUAREABS_UKERNEL_FUNCTION(xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x16) - -DECLARE_CS16_VSQUAREABS_UKERNEL_FUNCTION(xnn_cs16_vsquareabs_ukernel__hexagon_x2) -DECLARE_CS16_VSQUAREABS_UKERNEL_FUNCTION(xnn_cs16_vsquareabs_ukernel__hexagon_x4) -DECLARE_CS16_VSQUAREABS_UKERNEL_FUNCTION(xnn_cs16_vsquareabs_ukernel__hexagon_x6) -DECLARE_CS16_VSQUAREABS_UKERNEL_FUNCTION(xnn_cs16_vsquareabs_ukernel__hexagon_x8) -DECLARE_CS16_VSQUAREABS_UKERNEL_FUNCTION(xnn_cs16_vsquareabs_ukernel__hexagon_x10) -DECLARE_CS16_VSQUAREABS_UKERNEL_FUNCTION(xnn_cs16_vsquareabs_ukernel__hexagon_x12) - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/src/xnnpack/window.h b/src/xnnpack/window.h deleted file mode 100644 index 12bf5755167..00000000000 --- a/src/xnnpack/window.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#pragma once - -#include -#include - -#include "xnnpack/common.h" - -#ifdef __cplusplus -extern "C" { -#endif - - -#define DECLARE_S16_WINDOW_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t rows, \ - size_t batch_size, \ - const int16_t* input, \ - const int16_t* weights, \ - int16_t* output, \ - uint32_t shift); - - -DECLARE_S16_WINDOW_UKERNEL_FUNCTION(xnn_s16_window_ukernel__neon_u8) -DECLARE_S16_WINDOW_UKERNEL_FUNCTION(xnn_s16_window_ukernel__neon_u16) -DECLARE_S16_WINDOW_UKERNEL_FUNCTION(xnn_s16_window_ukernel__neon_u24) -DECLARE_S16_WINDOW_UKERNEL_FUNCTION(xnn_s16_window_ukernel__neon_u32) - -DECLARE_S16_WINDOW_UKERNEL_FUNCTION(xnn_s16_window_shift12_ukernel__neon_u8) -DECLARE_S16_WINDOW_UKERNEL_FUNCTION(xnn_s16_window_shift12_ukernel__neon_u16) -DECLARE_S16_WINDOW_UKERNEL_FUNCTION(xnn_s16_window_shift12_ukernel__neon_u24) -DECLARE_S16_WINDOW_UKERNEL_FUNCTION(xnn_s16_window_shift12_ukernel__neon_u32) - -DECLARE_S16_WINDOW_UKERNEL_FUNCTION(xnn_s16_window_shift15_ukernel__neon_u8) -DECLARE_S16_WINDOW_UKERNEL_FUNCTION(xnn_s16_window_shift15_ukernel__neon_u16) -DECLARE_S16_WINDOW_UKERNEL_FUNCTION(xnn_s16_window_shift15_ukernel__neon_u24) -DECLARE_S16_WINDOW_UKERNEL_FUNCTION(xnn_s16_window_shift15_ukernel__neon_u32) - -DECLARE_S16_WINDOW_UKERNEL_FUNCTION(xnn_s16_window_ukernel__scalar_u1) -DECLARE_S16_WINDOW_UKERNEL_FUNCTION(xnn_s16_window_ukernel__scalar_u2) -DECLARE_S16_WINDOW_UKERNEL_FUNCTION(xnn_s16_window_ukernel__scalar_u3) -DECLARE_S16_WINDOW_UKERNEL_FUNCTION(xnn_s16_window_ukernel__scalar_u4) - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/test/BUILD.bazel b/test/BUILD.bazel index caf905aa328..f10f2257fc2 100644 --- a/test/BUILD.bazel +++ b/test/BUILD.bazel @@ -306,7 +306,6 @@ sh_test( "f32_vtanh", "s8_vclamp", "u8_vclamp", - "u64_u32_vsqrtshift", ]] [xnnpack_unit_test( @@ -1217,87 +1216,6 @@ xnnpack_unit_test( deps = MICROKERNEL_TEST_DEPS, ) -xnnpack_unit_test( - name = "s16_rmaxabs_test", - srcs = [ - 
"rmaxabs-microkernel-tester.h", - "s16-rmaxabs.cc", - ], - deps = MICROKERNEL_TEST_DEPS, -) - -xnnpack_unit_test( - name = "s16_window_test", - srcs = [ - "s16-window.cc", - "window-microkernel-tester.h", - ], - deps = MICROKERNEL_TEST_DEPS, -) - -xnnpack_unit_test( - name = "u32_filterbank_accumulate_test", - srcs = [ - "filterbank-accumulate-microkernel-tester.h", - "u32-filterbank-accumulate.cc", - ], - deps = MICROKERNEL_TEST_DEPS, -) - -xnnpack_unit_test( - name = "u32_filterbank_subtract_test", - srcs = [ - "filterbank-subtract-microkernel-tester.h", - "u32-filterbank-subtract.cc", - ], - deps = MICROKERNEL_TEST_DEPS, -) - -xnnpack_unit_test( - name = "u32_vlog_test", - srcs = [ - "u32-vlog.cc", - "vlog-microkernel-tester.h", - ], - deps = MICROKERNEL_TEST_DEPS, -) - -xnnpack_unit_test( - name = "i16_vlshift_test", - srcs = [ - "i16-vlshift.cc", - "vlshift-microkernel-tester.h", - ], - deps = MICROKERNEL_TEST_DEPS, -) - -xnnpack_unit_test( - name = "cs16_vsquareabs_test", - srcs = [ - "cs16-vsquareabs.cc", - "vsquareabs-microkernel-tester.h", - ], - deps = MICROKERNEL_TEST_DEPS, -) - -xnnpack_unit_test( - name = "cs16_bfly4_test", - srcs = [ - "bfly4-microkernel-tester.h", - "cs16-bfly4.cc", - ], - deps = MICROKERNEL_TEST_DEPS, -) - -xnnpack_unit_test( - name = "cs16_fftr_test", - srcs = [ - "cs16-fftr.cc", - "fftr-microkernel-tester.h", - ], - deps = MICROKERNEL_TEST_DEPS, -) - xnnpack_unit_test( name = "u8_lut32norm_test", srcs = [ diff --git a/test/bfly4-microkernel-tester.h b/test/bfly4-microkernel-tester.h deleted file mode 100644 index ab7ddd4858c..00000000000 --- a/test/bfly4-microkernel-tester.h +++ /dev/null @@ -1,251 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack/math.h" -#include "xnnpack/microfnptr.h" -#include "replicable_random_device.h" - -// twiddle table for bfly4 for fft size 192 (complex numbers) -// Even numbers are numpy.floor(0.5 + 32767 * numpy.cos(-2*pi*numpy.linspace(0, 255, num=256) / 256)).astype(numpy.int16).tolist() -// Odd numbers are numpy.floor(0.5 + 32767 * numpy.sin(-2*pi*numpy.linspace(0, 255, num=256) / 256)).astype(numpy.int16).tolist() - -static const int16_t xnn_reference_table_fft256_twiddle[384] = { - 32767, 0, 32757, -804, 32728, -1608, 32678, -2410, - 32609, -3212, 32521, -4011, 32412, -4808, 32285, -5602, - 32137, -6393, 31971, -7179, 31785, -7962, 31580, -8739, - 31356, -9512, 31113,-10278, 30852,-11039, 30571,-11793, - 30273,-12539, 29956,-13279, 29621,-14010, 29268,-14732, - 28898,-15446, 28510,-16151, 28105,-16846, 27683,-17530, - 27245,-18204, 26790,-18868, 26319,-19519, 25832,-20159, - 25329,-20787, 24811,-21403, 24279,-22005, 23731,-22594, - 23170,-23170, 22594,-23731, 22005,-24279, 21403,-24811, - 20787,-25329, 20159,-25832, 19519,-26319, 18868,-26790, - 18204,-27245, 17530,-27683, 16846,-28105, 16151,-28510, - 15446,-28898, 14732,-29268, 14010,-29621, 13279,-29956, - 12539,-30273, 11793,-30571, 11039,-30852, 10278,-31113, - 9512,-31356, 8739,-31580, 7962,-31785, 7179,-31971, - 6393,-32137, 5602,-32285, 4808,-32412, 4011,-32521, - 3212,-32609, 2410,-32678, 1608,-32728, 804,-32757, - 0,-32767, -804,-32757, -1608,-32728, -2410,-32678, - -3212,-32609, -4011,-32521, -4808,-32412, -5602,-32285, - -6393,-32137, -7179,-31971, -7962,-31785, -8739,-31580, - -9512,-31356, -10278,-31113, -11039,-30852, -11793,-30571, - -12539,-30273, -13279,-29956, -14010,-29621, -14732,-29268, - -15446,-28898, -16151,-28510, -16846,-28105, -17530,-27683, - -18204,-27245, -18868,-26790, -19519,-26319, -20159,-25832, - -20787,-25329, -21403,-24811, -22005,-24279, -22594,-23731, - 
-23170,-23170, -23731,-22594, -24279,-22005, -24811,-21403, - -25329,-20787, -25832,-20159, -26319,-19519, -26790,-18868, - -27245,-18204, -27683,-17530, -28105,-16846, -28510,-16151, - -28898,-15446, -29268,-14732, -29621,-14010, -29956,-13279, - -30273,-12539, -30571,-11793, -30852,-11039, -31113,-10278, - -31356, -9512, -31580, -8739, -31785, -7962, -31971, -7179, - -32137, -6393, -32285, -5602, -32412, -4808, -32521, -4011, - -32609, -3212, -32678, -2410, -32728, -1608, -32757, -804, - -32767, 0, -32757, 804, -32728, 1608, -32678, 2410, - -32609, 3212, -32521, 4011, -32412, 4808, -32285, 5602, - -32137, 6393, -31971, 7179, -31785, 7962, -31580, 8739, - -31356, 9512, -31113, 10278, -30852, 11039, -30571, 11793, - -30273, 12539, -29956, 13279, -29621, 14010, -29268, 14732, - -28898, 15446, -28510, 16151, -28105, 16846, -27683, 17530, - -27245, 18204, -26790, 18868, -26319, 19519, -25832, 20159, - -25329, 20787, -24811, 21403, -24279, 22005, -23731, 22594, - -23170, 23170, -22594, 23731, -22005, 24279, -21403, 24811, - -20787, 25329, -20159, 25832, -19519, 26319, -18868, 26790, - -18204, 27245, -17530, 27683, -16846, 28105, -16151, 28510, - -15446, 28898, -14732, 29268, -14010, 29621, -13279, 29956, - -12539, 30273, -11793, 30571, -11039, 30852, -10278, 31113, - -9512, 31356, -8739, 31580, -7962, 31785, -7179, 31971, - -6393, 32137, -5602, 32285, -4808, 32412, -4011, 32521, - -3212, 32609, -2410, 32678, -1608, 32728, -804, 32757 -}; - -static void xnn_cs16_bfly4_reference( - size_t batch, - size_t samples, - int16_t* data, - const int16_t* twiddle, - size_t stride) -{ - assert(batch != 0); - assert(samples != 0); - assert(data != nullptr); - assert(stride != 0); - assert(twiddle != nullptr); - - int16_t* data0 = data; - int16_t* data1 = data + samples * 2; - int16_t* data2 = data + samples * 4; - int16_t* data3 = data + samples * 6; - - for (size_t n = 0; n < batch; ++n) { - const int16_t* tw1 = twiddle; - const int16_t* tw2 = twiddle; - const int16_t* tw3 = 
twiddle; - - for (size_t m = 0; m < samples; ++m) { - int32_t vout0_r = (int32_t) data0[0]; - int32_t vout0_i = (int32_t) data0[1]; - int32_t vout1_r = (int32_t) data1[0]; - int32_t vout1_i = (int32_t) data1[1]; - int32_t vout2_r = (int32_t) data2[0]; - int32_t vout2_i = (int32_t) data2[1]; - int32_t vout3_r = (int32_t) data3[0]; - int32_t vout3_i = (int32_t) data3[1]; - - const int32_t tw1_r = (const int32_t) tw1[0]; - const int32_t tw1_i = (const int32_t) tw1[1]; - const int32_t tw2_r = (const int32_t) tw2[0]; - const int32_t tw2_i = (const int32_t) tw2[1]; - const int32_t tw3_r = (const int32_t) tw3[0]; - const int32_t tw3_i = (const int32_t) tw3[1]; - - // Note 32767 / 4 = 8191. Should be 8192. - vout0_r = (vout0_r * 8191 + 16384) >> 15; - vout0_i = (vout0_i * 8191 + 16384) >> 15; - vout1_r = (vout1_r * 8191 + 16384) >> 15; - vout1_i = (vout1_i * 8191 + 16384) >> 15; - vout2_r = (vout2_r * 8191 + 16384) >> 15; - vout2_i = (vout2_i * 8191 + 16384) >> 15; - vout3_r = (vout3_r * 8191 + 16384) >> 15; - vout3_i = (vout3_i * 8191 + 16384) >> 15; - - const int32_t vtmp0_r = math_asr_s32(vout1_r * tw1_r - vout1_i * tw1_i + 16384, 15); - const int32_t vtmp0_i = math_asr_s32(vout1_r * tw1_i + vout1_i * tw1_r + 16384, 15); - const int32_t vtmp1_r = math_asr_s32(vout2_r * tw2_r - vout2_i * tw2_i + 16384, 15); - const int32_t vtmp1_i = math_asr_s32(vout2_r * tw2_i + vout2_i * tw2_r + 16384, 15); - const int32_t vtmp2_r = math_asr_s32(vout3_r * tw3_r - vout3_i * tw3_i + 16384, 15); - const int32_t vtmp2_i = math_asr_s32(vout3_r * tw3_i + vout3_i * tw3_r + 16384, 15); - - const int32_t vtmp5_r = vout0_r - vtmp1_r; - const int32_t vtmp5_i = vout0_i - vtmp1_i; - vout0_r += vtmp1_r; - vout0_i += vtmp1_i; - const int32_t vtmp3_r = vtmp0_r + vtmp2_r; - const int32_t vtmp3_i = vtmp0_i + vtmp2_i; - const int32_t vtmp4_r = vtmp0_r - vtmp2_r; - const int32_t vtmp4_i = vtmp0_i - vtmp2_i; - vout2_r = vout0_r - vtmp3_r; - vout2_i = vout0_i - vtmp3_i; - - tw1 += stride * 2; - tw2 += 
stride * 4; - tw3 += stride * 6; - vout0_r += vtmp3_r; - vout0_i += vtmp3_i; - - vout1_r = vtmp5_r + vtmp4_i; - vout1_i = vtmp5_i - vtmp4_r; - vout3_r = vtmp5_r - vtmp4_i; - vout3_i = vtmp5_i + vtmp4_r; - - data0[0] = (int16_t) vout0_r; - data0[1] = (int16_t) vout0_i; - data1[0] = (int16_t) vout1_r; - data1[1] = (int16_t) vout1_i; - data2[0] = (int16_t) vout2_r; - data2[1] = (int16_t) vout2_i; - data3[0] = (int16_t) vout3_r; - data3[1] = (int16_t) vout3_i; - data0 += 2; - data1 += 2; - data2 += 2; - data3 += 2; - } - - data0 += samples * 6; - data1 += samples * 6; - data2 += samples * 6; - data3 += samples * 6; - } while(--batch != 0); -} - -class BFly4MicrokernelTester { - public: - BFly4MicrokernelTester& batch(size_t batch) { - assert(batch != 0); - this->batch_ = batch; - return *this; - } - - size_t batch() const { - return this->batch_; - } - - BFly4MicrokernelTester& samples(size_t samples) { - assert(samples != 0); - this->samples_ = samples; - return *this; - } - - size_t samples() const { - return this->samples_; - } - - BFly4MicrokernelTester& stride(uint32_t stride) { - this->stride_ = stride; - return *this; - } - - uint32_t stride() const { - return this->stride_; - } - - BFly4MicrokernelTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { - return this->iterations_; - } - - void Test(xnn_cs16_bfly4_ukernel_fn bfly4) const { - xnnpack::ReplicableRandomDevice rng; - auto i16rng = std::bind(std::uniform_int_distribution(), std::ref(rng)); - const size_t fft_size = samples() * stride() * 4; // 4 for bfly4. - - // 256 complex numbers = fft_size * 2 = 512 - std::vector y(fft_size * 2); - std::vector y_ref(fft_size * 2); - - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(y.begin(), y.end(), std::ref(i16rng)); - y_ref = y; - - // Compute reference results. 
- xnn_cs16_bfly4_reference(batch(), samples(), y_ref.data(), xnn_reference_table_fft256_twiddle, stride()); - - // Call optimized micro-kernel. - bfly4(batch(), samples() * sizeof(int16_t) * 2, y.data(), xnn_reference_table_fft256_twiddle, stride() * sizeof(int16_t) * 2); - - // Verify results. - for (size_t n = 0; n < fft_size * 2; n++) { - EXPECT_EQ(y[n], y_ref[n]) - << "at sample " << n << " / " << fft_size - << "\nsamples " << samples() - << "\nstride " << stride(); - } - } - } - - private: - size_t batch_{1}; - size_t samples_{1}; - uint32_t stride_{1}; - size_t iterations_{15}; -}; diff --git a/test/cs16-bfly4.cc b/test/cs16-bfly4.cc deleted file mode 100644 index 0edb7e5bba7..00000000000 --- a/test/cs16-bfly4.cc +++ /dev/null @@ -1,394 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Specification: test/cs16-bfly4.yaml -// Generator: tools/generate-bfly4-test.py - - -#include -#include "xnnpack/common.h" -#include "xnnpack/fft.h" -#include "xnnpack/isa-checks.h" -#include "bfly4-microkernel-tester.h" - - -#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - - TEST(CS16_BFLY4_SAMPLES1__ASM_AARCH32_NEON_X1, samples_eq_1) { - TEST_REQUIRES_ARM_NEON; - BFly4MicrokernelTester() - .batch(1) - .samples(1) - .stride(64) - .Test(xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x1); - } - - TEST(CS16_BFLY4_SAMPLES1__ASM_AARCH32_NEON_X1, batch_eq_4) { - TEST_REQUIRES_ARM_NEON; - BFly4MicrokernelTester() - .batch(4) - .samples(1) - .stride(64) - .Test(xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x1); - } - - TEST(CS16_BFLY4_SAMPLES1__ASM_AARCH32_NEON_X1, batch_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 2; batch <= 16; batch++) { - BFly4MicrokernelTester() - .batch(batch) - .samples(1) - .stride(64) - .Test(xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x1); - } - } -#endif // XNN_ARCH_ARM 
&& XNN_ENABLE_ASSEMBLY - - -#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - - TEST(CS16_BFLY4_SAMPLES1__ASM_AARCH32_NEON_X2, samples_eq_1) { - TEST_REQUIRES_ARM_NEON; - BFly4MicrokernelTester() - .batch(1) - .samples(1) - .stride(64) - .Test(xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x2); - } - - TEST(CS16_BFLY4_SAMPLES1__ASM_AARCH32_NEON_X2, batch_eq_4) { - TEST_REQUIRES_ARM_NEON; - BFly4MicrokernelTester() - .batch(4) - .samples(1) - .stride(64) - .Test(xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x2); - } - - TEST(CS16_BFLY4_SAMPLES1__ASM_AARCH32_NEON_X2, batch_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 2; batch <= 16; batch++) { - BFly4MicrokernelTester() - .batch(batch) - .samples(1) - .stride(64) - .Test(xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x2); - } - } -#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - - -#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - - TEST(CS16_BFLY4_SAMPLES1__ASM_AARCH32_NEON_X4, samples_eq_1) { - TEST_REQUIRES_ARM_NEON; - BFly4MicrokernelTester() - .batch(1) - .samples(1) - .stride(64) - .Test(xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x4); - } - - TEST(CS16_BFLY4_SAMPLES1__ASM_AARCH32_NEON_X4, batch_eq_4) { - TEST_REQUIRES_ARM_NEON; - BFly4MicrokernelTester() - .batch(4) - .samples(1) - .stride(64) - .Test(xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x4); - } - - TEST(CS16_BFLY4_SAMPLES1__ASM_AARCH32_NEON_X4, batch_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 2; batch <= 16; batch++) { - BFly4MicrokernelTester() - .batch(batch) - .samples(1) - .stride(64) - .Test(xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x4); - } - } -#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - - TEST(CS16_BFLY4_SAMPLES1__NEON, samples_eq_1) { - TEST_REQUIRES_ARM_NEON; - BFly4MicrokernelTester() - .batch(1) - .samples(1) - .stride(64) - .Test(xnn_cs16_bfly4_samples1_ukernel__neon); - } - - TEST(CS16_BFLY4_SAMPLES1__NEON, batch_eq_4) { - TEST_REQUIRES_ARM_NEON; - 
BFly4MicrokernelTester() - .batch(4) - .samples(1) - .stride(64) - .Test(xnn_cs16_bfly4_samples1_ukernel__neon); - } - - TEST(CS16_BFLY4_SAMPLES1__NEON, batch_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 2; batch <= 16; batch++) { - BFly4MicrokernelTester() - .batch(batch) - .samples(1) - .stride(64) - .Test(xnn_cs16_bfly4_samples1_ukernel__neon); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - - TEST(CS16_BFLY4_SAMPLES4__NEON, samples_eq_4) { - TEST_REQUIRES_ARM_NEON; - BFly4MicrokernelTester() - .samples(4) - .stride(16) - .Test(xnn_cs16_bfly4_samples4_ukernel__neon); - } - - TEST(CS16_BFLY4_SAMPLES4__NEON, samples_eq_4_batch_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 2; batch <= 4; batch++) { - BFly4MicrokernelTester() - .batch(batch) - .samples(4) - .stride(16) - .Test(xnn_cs16_bfly4_samples4_ukernel__neon); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - - TEST(CS16_BFLY4__NEON_X1, samples_eq_4) { - TEST_REQUIRES_ARM_NEON; - BFly4MicrokernelTester() - .samples(4) - .stride(16) - .Test(xnn_cs16_bfly4_ukernel__neon_x1); - } - - TEST(CS16_BFLY4__NEON_X1, samples_eq_4_batch_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 2; batch <= 4; batch++) { - BFly4MicrokernelTester() - .batch(batch) - .samples(4) - .stride(16) - .Test(xnn_cs16_bfly4_ukernel__neon_x1); - } - } - - TEST(CS16_BFLY4__NEON_X1, samples_eq_16) { - TEST_REQUIRES_ARM_NEON; - BFly4MicrokernelTester() - .samples(16) - .stride(4) - .Test(xnn_cs16_bfly4_ukernel__neon_x1); - } - - TEST(CS16_BFLY4__NEON_X1, samples_eq_64) { - TEST_REQUIRES_ARM_NEON; - BFly4MicrokernelTester() - .samples(64) - .stride(1) - .Test(xnn_cs16_bfly4_ukernel__neon_x1); - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - - TEST(CS16_BFLY4__NEON_X4, samples_eq_4) { - TEST_REQUIRES_ARM_NEON; - BFly4MicrokernelTester() - .samples(4) - .stride(16) - 
.Test(xnn_cs16_bfly4_ukernel__neon_x4); - } - - TEST(CS16_BFLY4__NEON_X4, samples_eq_4_batch_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 2; batch <= 4; batch++) { - BFly4MicrokernelTester() - .batch(batch) - .samples(4) - .stride(16) - .Test(xnn_cs16_bfly4_ukernel__neon_x4); - } - } - - TEST(CS16_BFLY4__NEON_X4, samples_eq_16) { - TEST_REQUIRES_ARM_NEON; - BFly4MicrokernelTester() - .samples(16) - .stride(4) - .Test(xnn_cs16_bfly4_ukernel__neon_x4); - } - - TEST(CS16_BFLY4__NEON_X4, samples_eq_64) { - TEST_REQUIRES_ARM_NEON; - BFly4MicrokernelTester() - .samples(64) - .stride(1) - .Test(xnn_cs16_bfly4_ukernel__neon_x4); - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - - -TEST(CS16_BFLY4_SAMPLES1__SCALAR, samples_eq_1) { - BFly4MicrokernelTester() - .batch(1) - .samples(1) - .stride(64) - .Test(xnn_cs16_bfly4_samples1_ukernel__scalar); -} - -TEST(CS16_BFLY4_SAMPLES1__SCALAR, batch_eq_4) { - BFly4MicrokernelTester() - .batch(4) - .samples(1) - .stride(64) - .Test(xnn_cs16_bfly4_samples1_ukernel__scalar); -} - -TEST(CS16_BFLY4_SAMPLES1__SCALAR, batch_gt_1) { - for (size_t batch = 2; batch <= 16; batch++) { - BFly4MicrokernelTester() - .batch(batch) - .samples(1) - .stride(64) - .Test(xnn_cs16_bfly4_samples1_ukernel__scalar); - } -} - - - -TEST(CS16_BFLY4_SAMPLES4__SCALAR, samples_eq_4) { - BFly4MicrokernelTester() - .samples(4) - .stride(16) - .Test(xnn_cs16_bfly4_samples4_ukernel__scalar); -} - -TEST(CS16_BFLY4_SAMPLES4__SCALAR, samples_eq_4_batch_gt_1) { - for (size_t batch = 2; batch <= 4; batch++) { - BFly4MicrokernelTester() - .batch(batch) - .samples(4) - .stride(16) - .Test(xnn_cs16_bfly4_samples4_ukernel__scalar); - } -} - - - -TEST(CS16_BFLY4__SCALAR_X1, samples_eq_4) { - BFly4MicrokernelTester() - .samples(4) - .stride(16) - .Test(xnn_cs16_bfly4_ukernel__scalar_x1); -} - -TEST(CS16_BFLY4__SCALAR_X1, samples_eq_4_batch_gt_1) { - for (size_t batch = 2; batch <= 4; batch++) { - BFly4MicrokernelTester() - .batch(batch) - .samples(4) - .stride(16) - 
.Test(xnn_cs16_bfly4_ukernel__scalar_x1); - } -} - -TEST(CS16_BFLY4__SCALAR_X1, samples_eq_16) { - BFly4MicrokernelTester() - .samples(16) - .stride(4) - .Test(xnn_cs16_bfly4_ukernel__scalar_x1); -} - -TEST(CS16_BFLY4__SCALAR_X1, samples_eq_64) { - BFly4MicrokernelTester() - .samples(64) - .stride(1) - .Test(xnn_cs16_bfly4_ukernel__scalar_x1); -} - - - -TEST(CS16_BFLY4__SCALAR_X2, samples_eq_4) { - BFly4MicrokernelTester() - .samples(4) - .stride(16) - .Test(xnn_cs16_bfly4_ukernel__scalar_x2); -} - -TEST(CS16_BFLY4__SCALAR_X2, samples_eq_4_batch_gt_1) { - for (size_t batch = 2; batch <= 4; batch++) { - BFly4MicrokernelTester() - .batch(batch) - .samples(4) - .stride(16) - .Test(xnn_cs16_bfly4_ukernel__scalar_x2); - } -} - -TEST(CS16_BFLY4__SCALAR_X2, samples_eq_16) { - BFly4MicrokernelTester() - .samples(16) - .stride(4) - .Test(xnn_cs16_bfly4_ukernel__scalar_x2); -} - -TEST(CS16_BFLY4__SCALAR_X2, samples_eq_64) { - BFly4MicrokernelTester() - .samples(64) - .stride(1) - .Test(xnn_cs16_bfly4_ukernel__scalar_x2); -} - - - -TEST(CS16_BFLY4__SCALAR_X4, samples_eq_4) { - BFly4MicrokernelTester() - .samples(4) - .stride(16) - .Test(xnn_cs16_bfly4_ukernel__scalar_x4); -} - -TEST(CS16_BFLY4__SCALAR_X4, samples_eq_4_batch_gt_1) { - for (size_t batch = 2; batch <= 4; batch++) { - BFly4MicrokernelTester() - .batch(batch) - .samples(4) - .stride(16) - .Test(xnn_cs16_bfly4_ukernel__scalar_x4); - } -} - -TEST(CS16_BFLY4__SCALAR_X4, samples_eq_16) { - BFly4MicrokernelTester() - .samples(16) - .stride(4) - .Test(xnn_cs16_bfly4_ukernel__scalar_x4); -} - -TEST(CS16_BFLY4__SCALAR_X4, samples_eq_64) { - BFly4MicrokernelTester() - .samples(64) - .stride(1) - .Test(xnn_cs16_bfly4_ukernel__scalar_x4); -} diff --git a/test/cs16-bfly4.yaml b/test/cs16-bfly4.yaml deleted file mode 100644 index c86cff68d49..00000000000 --- a/test/cs16-bfly4.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE 
file in the root directory of this source tree. - - -# NEON -- name: xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x1 -- name: xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x2 -- name: xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x4 -- name: xnn_cs16_bfly4_samples1_ukernel__neon -- name: xnn_cs16_bfly4_samples4_ukernel__neon -- name: xnn_cs16_bfly4_ukernel__neon_x1 -- name: xnn_cs16_bfly4_ukernel__neon_x4 - - -# Scalar -- name: xnn_cs16_bfly4_samples1_ukernel__scalar -- name: xnn_cs16_bfly4_samples4_ukernel__scalar -- name: xnn_cs16_bfly4_ukernel__scalar_x1 -- name: xnn_cs16_bfly4_ukernel__scalar_x2 -- name: xnn_cs16_bfly4_ukernel__scalar_x4 diff --git a/test/cs16-fftr.cc b/test/cs16-fftr.cc deleted file mode 100644 index de7d1a6a46e..00000000000 --- a/test/cs16-fftr.cc +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: test/cs16-fftr.yaml -// Generator: tools/generate-fftr-test.py - - -#include -#include "xnnpack/common.h" -#include "xnnpack/fft.h" -#include "xnnpack/isa-checks.h" -#include "fftr-microkernel-tester.h" - - -#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - TEST(CS16_FFTR__ASM_AARCH32_NEON_X1, samples_eq_256) { - TEST_REQUIRES_ARM_NEON; - FftrMicrokernelTester() - .samples(256) - .Test(xnn_cs16_fftr_ukernel__asm_aarch32_neon_x1); - } -#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - - -#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - TEST(CS16_FFTR__ASM_AARCH32_NEON_X4, samples_eq_256) { - TEST_REQUIRES_ARM_NEON; - FftrMicrokernelTester() - .samples(256) - .Test(xnn_cs16_fftr_ukernel__asm_aarch32_neon_x4); - } -#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(CS16_FFTR__NEON_X4, samples_eq_256) { - TEST_REQUIRES_ARM_NEON; - FftrMicrokernelTester() - .samples(256) - .Test(xnn_cs16_fftr_ukernel__neon_x4); - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -TEST(CS16_FFTR__SCALAR_X1, samples_eq_256) { - FftrMicrokernelTester() - .samples(256) - .Test(xnn_cs16_fftr_ukernel__scalar_x1); -} - - -TEST(CS16_FFTR__SCALAR_X2, samples_eq_256) { - FftrMicrokernelTester() - .samples(256) - .Test(xnn_cs16_fftr_ukernel__scalar_x2); -} - - -TEST(CS16_FFTR__SCALAR_X4, samples_eq_256) { - FftrMicrokernelTester() - .samples(256) - .Test(xnn_cs16_fftr_ukernel__scalar_x4); -} diff --git a/test/cs16-fftr.yaml b/test/cs16-fftr.yaml deleted file mode 100644 index c304b62d46c..00000000000 --- a/test/cs16-fftr.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- - -# NEON -- name: xnn_cs16_fftr_ukernel__asm_aarch32_neon_x1 -- name: xnn_cs16_fftr_ukernel__asm_aarch32_neon_x4 -- name: xnn_cs16_fftr_ukernel__neon_x4 - -# Scalar -- name: xnn_cs16_fftr_ukernel__scalar_x1 -- name: xnn_cs16_fftr_ukernel__scalar_x2 -- name: xnn_cs16_fftr_ukernel__scalar_x4 diff --git a/test/cs16-vsquareabs.cc b/test/cs16-vsquareabs.cc deleted file mode 100644 index cdf2c005455..00000000000 --- a/test/cs16-vsquareabs.cc +++ /dev/null @@ -1,469 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Specification: test/cs16-vsquareabs.yaml -// Generator: tools/generate-vsquareabs-test.py - - -#include -#include "xnnpack/common.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/vsquareabs.h" -#include "vsquareabs-microkernel-tester.h" - - -TEST(CS16_VSQUAREABS__SCALAR_X1, batch_eq_1) { - VSquareAbsMicrokernelTester() - .batch(1) - .Test(xnn_cs16_vsquareabs_ukernel__scalar_x1); -} - -TEST(CS16_VSQUAREABS__SCALAR_X1, batch_gt_1) { - for (size_t batch = 2; batch < 10; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__scalar_x1); - } -} - - -TEST(CS16_VSQUAREABS__SCALAR_X2, batch_eq_2) { - VSquareAbsMicrokernelTester() - .batch(2) - .Test(xnn_cs16_vsquareabs_ukernel__scalar_x2); -} - -TEST(CS16_VSQUAREABS__SCALAR_X2, batch_div_2) { - for (size_t batch = 4; batch < 20; batch += 2) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__scalar_x2); - } -} - -TEST(CS16_VSQUAREABS__SCALAR_X2, batch_lt_2) { - for (size_t batch = 1; batch < 2; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__scalar_x2); - } -} - -TEST(CS16_VSQUAREABS__SCALAR_X2, batch_gt_2) { - for (size_t batch = 3; batch < 4; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - 
.Test(xnn_cs16_vsquareabs_ukernel__scalar_x2); - } -} - - -TEST(CS16_VSQUAREABS__SCALAR_X3, batch_eq_3) { - VSquareAbsMicrokernelTester() - .batch(3) - .Test(xnn_cs16_vsquareabs_ukernel__scalar_x3); -} - -TEST(CS16_VSQUAREABS__SCALAR_X3, batch_div_3) { - for (size_t batch = 6; batch < 30; batch += 3) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__scalar_x3); - } -} - -TEST(CS16_VSQUAREABS__SCALAR_X3, batch_lt_3) { - for (size_t batch = 1; batch < 3; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__scalar_x3); - } -} - -TEST(CS16_VSQUAREABS__SCALAR_X3, batch_gt_3) { - for (size_t batch = 4; batch < 6; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__scalar_x3); - } -} - - -TEST(CS16_VSQUAREABS__SCALAR_X4, batch_eq_4) { - VSquareAbsMicrokernelTester() - .batch(4) - .Test(xnn_cs16_vsquareabs_ukernel__scalar_x4); -} - -TEST(CS16_VSQUAREABS__SCALAR_X4, batch_div_4) { - for (size_t batch = 8; batch < 40; batch += 4) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__scalar_x4); - } -} - -TEST(CS16_VSQUAREABS__SCALAR_X4, batch_lt_4) { - for (size_t batch = 1; batch < 4; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__scalar_x4); - } -} - -TEST(CS16_VSQUAREABS__SCALAR_X4, batch_gt_4) { - for (size_t batch = 5; batch < 8; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__scalar_x4); - } -} - - -#if XNN_ARCH_HEXAGON - TEST(CS16_VSQUAREABS__HEXAGON_X2, batch_eq_2) { - VSquareAbsMicrokernelTester() - .batch(2) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x2); - } - - TEST(CS16_VSQUAREABS__HEXAGON_X2, batch_div_2) { - for (size_t batch = 4; batch < 20; batch += 2) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x2); - } - } - - TEST(CS16_VSQUAREABS__HEXAGON_X2, batch_lt_2) 
{ - for (size_t batch = 1; batch < 2; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x2); - } - } - - TEST(CS16_VSQUAREABS__HEXAGON_X2, batch_gt_2) { - for (size_t batch = 3; batch < 4; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x2); - } - } -#endif // XNN_ARCH_HEXAGON - - -#if XNN_ARCH_HEXAGON - TEST(CS16_VSQUAREABS__HEXAGON_X4, batch_eq_4) { - VSquareAbsMicrokernelTester() - .batch(4) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x4); - } - - TEST(CS16_VSQUAREABS__HEXAGON_X4, batch_div_4) { - for (size_t batch = 8; batch < 40; batch += 4) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x4); - } - } - - TEST(CS16_VSQUAREABS__HEXAGON_X4, batch_lt_4) { - for (size_t batch = 1; batch < 4; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x4); - } - } - - TEST(CS16_VSQUAREABS__HEXAGON_X4, batch_gt_4) { - for (size_t batch = 5; batch < 8; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x4); - } - } -#endif // XNN_ARCH_HEXAGON - - -#if XNN_ARCH_HEXAGON - TEST(CS16_VSQUAREABS__HEXAGON_X6, batch_eq_6) { - VSquareAbsMicrokernelTester() - .batch(6) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x6); - } - - TEST(CS16_VSQUAREABS__HEXAGON_X6, batch_div_6) { - for (size_t batch = 12; batch < 60; batch += 6) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x6); - } - } - - TEST(CS16_VSQUAREABS__HEXAGON_X6, batch_lt_6) { - for (size_t batch = 1; batch < 6; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x6); - } - } - - TEST(CS16_VSQUAREABS__HEXAGON_X6, batch_gt_6) { - for (size_t batch = 7; batch < 12; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x6); - 
} - } -#endif // XNN_ARCH_HEXAGON - - -#if XNN_ARCH_HEXAGON - TEST(CS16_VSQUAREABS__HEXAGON_X8, batch_eq_8) { - VSquareAbsMicrokernelTester() - .batch(8) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x8); - } - - TEST(CS16_VSQUAREABS__HEXAGON_X8, batch_div_8) { - for (size_t batch = 16; batch < 80; batch += 8) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x8); - } - } - - TEST(CS16_VSQUAREABS__HEXAGON_X8, batch_lt_8) { - for (size_t batch = 1; batch < 8; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x8); - } - } - - TEST(CS16_VSQUAREABS__HEXAGON_X8, batch_gt_8) { - for (size_t batch = 9; batch < 16; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x8); - } - } -#endif // XNN_ARCH_HEXAGON - - -#if XNN_ARCH_HEXAGON - TEST(CS16_VSQUAREABS__HEXAGON_X10, batch_eq_10) { - VSquareAbsMicrokernelTester() - .batch(10) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x10); - } - - TEST(CS16_VSQUAREABS__HEXAGON_X10, batch_div_10) { - for (size_t batch = 20; batch < 100; batch += 10) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x10); - } - } - - TEST(CS16_VSQUAREABS__HEXAGON_X10, batch_lt_10) { - for (size_t batch = 1; batch < 10; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x10); - } - } - - TEST(CS16_VSQUAREABS__HEXAGON_X10, batch_gt_10) { - for (size_t batch = 11; batch < 20; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x10); - } - } -#endif // XNN_ARCH_HEXAGON - - -#if XNN_ARCH_HEXAGON - TEST(CS16_VSQUAREABS__HEXAGON_X12, batch_eq_12) { - VSquareAbsMicrokernelTester() - .batch(12) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x12); - } - - TEST(CS16_VSQUAREABS__HEXAGON_X12, batch_div_12) { - for (size_t batch = 24; batch < 120; batch += 12) { - 
VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x12); - } - } - - TEST(CS16_VSQUAREABS__HEXAGON_X12, batch_lt_12) { - for (size_t batch = 1; batch < 12; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x12); - } - } - - TEST(CS16_VSQUAREABS__HEXAGON_X12, batch_gt_12) { - for (size_t batch = 13; batch < 24; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__hexagon_x12); - } - } -#endif // XNN_ARCH_HEXAGON - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(CS16_VSQUAREABS__NEON_MLAL_LD128_X4, batch_eq_4) { - TEST_REQUIRES_ARM_NEON; - VSquareAbsMicrokernelTester() - .batch(4) - .Test(xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x4); - } - - TEST(CS16_VSQUAREABS__NEON_MLAL_LD128_X4, batch_div_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 8; batch < 40; batch += 4) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x4); - } - } - - TEST(CS16_VSQUAREABS__NEON_MLAL_LD128_X4, batch_lt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 1; batch < 4; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x4); - } - } - - TEST(CS16_VSQUAREABS__NEON_MLAL_LD128_X4, batch_gt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 5; batch < 8; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x4); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(CS16_VSQUAREABS__NEON_MLAL_LD128_X8, batch_eq_8) { - TEST_REQUIRES_ARM_NEON; - VSquareAbsMicrokernelTester() - .batch(8) - .Test(xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x8); - } - - TEST(CS16_VSQUAREABS__NEON_MLAL_LD128_X8, batch_div_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 16; batch < 80; batch += 8) { - VSquareAbsMicrokernelTester() - .batch(batch) - 
.Test(xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x8); - } - } - - TEST(CS16_VSQUAREABS__NEON_MLAL_LD128_X8, batch_lt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 1; batch < 8; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x8); - } - } - - TEST(CS16_VSQUAREABS__NEON_MLAL_LD128_X8, batch_gt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 9; batch < 16; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x8); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(CS16_VSQUAREABS__NEON_MLAL_LD128_X12, batch_eq_12) { - TEST_REQUIRES_ARM_NEON; - VSquareAbsMicrokernelTester() - .batch(12) - .Test(xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x12); - } - - TEST(CS16_VSQUAREABS__NEON_MLAL_LD128_X12, batch_div_12) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 24; batch < 120; batch += 12) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x12); - } - } - - TEST(CS16_VSQUAREABS__NEON_MLAL_LD128_X12, batch_lt_12) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 1; batch < 12; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x12); - } - } - - TEST(CS16_VSQUAREABS__NEON_MLAL_LD128_X12, batch_gt_12) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 13; batch < 24; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x12); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(CS16_VSQUAREABS__NEON_MLAL_LD128_X16, batch_eq_16) { - TEST_REQUIRES_ARM_NEON; - VSquareAbsMicrokernelTester() - .batch(16) - .Test(xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x16); - } - - TEST(CS16_VSQUAREABS__NEON_MLAL_LD128_X16, batch_div_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 32; batch < 160; 
batch += 16) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x16); - } - } - - TEST(CS16_VSQUAREABS__NEON_MLAL_LD128_X16, batch_lt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 1; batch < 16; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x16); - } - } - - TEST(CS16_VSQUAREABS__NEON_MLAL_LD128_X16, batch_gt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 17; batch < 32; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x16); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 diff --git a/test/cs16-vsquareabs.yaml b/test/cs16-vsquareabs.yaml deleted file mode 100644 index 15b94983d4e..00000000000 --- a/test/cs16-vsquareabs.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- - -# Scalar -- name: xnn_cs16_vsquareabs_ukernel__scalar_x1 -- name: xnn_cs16_vsquareabs_ukernel__scalar_x2 -- name: xnn_cs16_vsquareabs_ukernel__scalar_x3 -- name: xnn_cs16_vsquareabs_ukernel__scalar_x4 - -- name: xnn_cs16_vsquareabs_ukernel__hexagon_x2 -- name: xnn_cs16_vsquareabs_ukernel__hexagon_x4 -- name: xnn_cs16_vsquareabs_ukernel__hexagon_x6 -- name: xnn_cs16_vsquareabs_ukernel__hexagon_x8 -- name: xnn_cs16_vsquareabs_ukernel__hexagon_x10 -- name: xnn_cs16_vsquareabs_ukernel__hexagon_x12 - -# NEON -- name: xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x4 -- name: xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x8 -- name: xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x12 -- name: xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x16 diff --git a/test/fftr-microkernel-tester.h b/test/fftr-microkernel-tester.h deleted file mode 100644 index e4cfbbcac24..00000000000 --- a/test/fftr-microkernel-tester.h +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack/math.h" -#include "xnnpack/microfnptr.h" -#include "replicable_random_device.h" - -static const int16_t xnn_reference_table_fftr_twiddle[256] = { - -402,-32765, -804,-32757, -1206,-32745, -1608,-32728, - -2009,-32705, -2410,-32678, -2811,-32646, -3212,-32609, - -3612,-32567, -4011,-32521, -4410,-32469, -4808,-32412, - -5205,-32351, -5602,-32285, -5998,-32213, -6393,-32137, - -6786,-32057, -7179,-31971, -7571,-31880, -7962,-31785, - -8351,-31685, -8739,-31580, -9126,-31470, -9512,-31356, - -9896,-31237, -10278,-31113, -10659,-30985, -11039,-30852, - -11417,-30714, -11793,-30571, -12167,-30424, -12539,-30273, - -12910,-30117, -13279,-29956, -13645,-29791, -14010,-29621, - -14372,-29447, -14732,-29268, -15090,-29085, -15446,-28898, - -15800,-28706, -16151,-28510, -16499,-28310, -16846,-28105, - -17189,-27896, -17530,-27683, -17869,-27466, -18204,-27245, - -18537,-27019, -18868,-26790, -19195,-26556, -19519,-26319, - -19841,-26077, -20159,-25832, -20475,-25582, -20787,-25329, - -21096,-25072, -21403,-24811, -21705,-24547, -22005,-24279, - -22301,-24007, -22594,-23731, -22884,-23452, -23170,-23170, - -23452,-22884, -23731,-22594, -24007,-22301, -24279,-22005, - -24547,-21705, -24811,-21403, -25072,-21096, -25329,-20787, - -25582,-20475, -25832,-20159, -26077,-19841, -26319,-19519, - -26556,-19195, -26790,-18868, -27019,-18537, -27245,-18204, - -27466,-17869, -27683,-17530, -27896,-17189, -28105,-16846, - -28310,-16499, -28510,-16151, -28706,-15800, -28898,-15446, - -29085,-15090, -29268,-14732, -29447,-14372, -29621,-14010, - -29791,-13645, -29956,-13279, -30117,-12910, -30273,-12539, - -30424,-12167, -30571,-11793, -30714,-11417, -30852,-11039, - -30985,-10659, -31113,-10278, -31237, -9896, -31356, -9512, - -31470, -9126, -31580, -8739, -31685, -8351, -31785, -7962, - -31880, -7571, -31971, -7179, -32057, -6786, -32137, -6393, - -32213, 
-5998, -32285, -5602, -32351, -5205, -32412, -4808, - -32469, -4410, -32521, -4011, -32567, -3612, -32609, -3212, - -32646, -2811, -32678, -2410, -32705, -2009, -32728, -1608, - -32745, -1206, -32757, -804, -32765, -402, -32767, 0, -}; - -static void xnn_cs16_fftr_reference( - size_t samples, - const int16_t* input, - int16_t* output, - const int16_t* twiddle) { - - assert(samples >= 2); - assert(samples % 2 == 0); - assert(input != nullptr); - assert(output != nullptr); - assert(twiddle != nullptr); - - const int16_t* il = input; - const int16_t* ir = input + samples * 2; - int32_t vdcr = (int32_t) il[0]; - int32_t vdci = (int32_t) il[1]; - il += 2; - vdcr = math_asr_s32(vdcr * 16383 + 16384, 15); - vdci = math_asr_s32(vdci * 16383 + 16384, 15); - - int16_t* outl = output; - int16_t* outr = output + samples * 2; - outl[0] = vdcr + vdci; - outl[1] = 0; - outl += 2; - outr[0] = vdcr - vdci; - outr[1] = 0; - - samples >>= 1; - - do { - int32_t vilr = (int32_t) il[0]; - int32_t vili = (int32_t) il[1]; - il += 2; - ir -= 2; - int32_t virr = (int32_t) ir[0]; - int32_t viri = (int32_t) ir[1]; - const int32_t vtwr = twiddle[0]; - const int32_t vtwi = twiddle[1]; - twiddle += 2; - - vilr = math_asr_s32(vilr * 16383 + 16384, 15); - vili = math_asr_s32(vili * 16383 + 16384, 15); - virr = math_asr_s32(virr * 16383 + 16384, 15); - viri = math_asr_s32(viri * 16383 + 16384, 15); - const int16_t vacc1r = vilr + virr; - const int16_t vacc1i = vili - viri; - const int16_t vacc2r = vilr - virr; - const int16_t vacc2i = vili + viri; - - const int32_t vaccr = math_asr_s32(vacc2r * vtwr - vacc2i * vtwi + 16384, 15); - const int32_t vacci = math_asr_s32(vacc2r * vtwi + vacc2i * vtwr + 16384, 15); - - outl[0] = math_asr_s32(vacc1r + vaccr, 1); - outl[1] = math_asr_s32(vacc1i + vacci, 1); - outl += 2; - outr -= 2; - outr[0] = math_asr_s32(vacc1r - vaccr, 1); - outr[1] = math_asr_s32(vacci - vacc1i, 1); - - } while (--samples != 0); -} - -class FftrMicrokernelTester { - public: - 
FftrMicrokernelTester& samples(size_t samples) { - assert(samples != 0); - this->samples_ = samples; - return *this; - } - - size_t samples() const { - return this->samples_; - } - - FftrMicrokernelTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { - return this->iterations_; - } - - void Test(xnn_cs16_fftr_ukernel_fn fftr) const { - xnnpack::ReplicableRandomDevice rng; - auto i16rng = std::bind(std::uniform_int_distribution(), std::ref(rng)); - const size_t sample_size = samples() * 2 + 2; - - std::vector y(sample_size); - std::vector y_ref(sample_size); - - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(y.begin(), y.end(), std::ref(i16rng)); - std::copy(y.begin(), y.end(), y_ref.begin()); - - // Compute reference results. - xnn_cs16_fftr_reference(samples(), y_ref.data(), y_ref.data(), xnn_reference_table_fftr_twiddle); - - // Call optimized micro-kernel. - fftr(samples(), y.data(), xnn_reference_table_fftr_twiddle); - - // Verify results. - for (size_t n = 0; n < sample_size; n++) { - EXPECT_EQ(y[n], y_ref[n]) - << "at sample " << n << " / " << sample_size; - } - } - } - - private: - size_t samples_{256}; - size_t iterations_{15}; -}; diff --git a/test/filterbank-accumulate-microkernel-tester.h b/test/filterbank-accumulate-microkernel-tester.h deleted file mode 100644 index 2395631b52f..00000000000 --- a/test/filterbank-accumulate-microkernel-tester.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack/microfnptr.h" -#include "replicable_random_device.h" - -class FilterbankAccumulateMicrokernelTester { - public: - FilterbankAccumulateMicrokernelTester& rows(size_t rows) { - assert(rows != 0); - this->rows_ = rows; - return *this; - } - - size_t rows() const { - return this->rows_; - } - - FilterbankAccumulateMicrokernelTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { - return this->iterations_; - } - - void Test(xnn_u32_filterbank_accumulate_ukernel_fn filterbank_accumulate) const { - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution u8dist(1, 10); - std::uniform_int_distribution u16dist; - std::uniform_int_distribution u32dist; - - std::vector filterbank_widths(rows() + 1); - std::vector output(rows()); - std::vector output_ref(rows()); - - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(filterbank_widths.begin(), filterbank_widths.end(), [&] { return u8dist(rng); }); - const size_t num_channels = std::accumulate(filterbank_widths.cbegin(), filterbank_widths.cend(), 0); - - std::vector input(num_channels); - std::vector weights(num_channels * 2); - std::generate(input.begin(), input.end(), [&] { return u32dist(rng); }); - std::generate(weights.begin(), weights.end(), [&] { return u16dist(rng); }); - std::fill(output.begin(), output.end(), UINT64_C(0xCAFEB0BADEADBEAF)); - - uint64_t weight_accumulator = 0; - uint64_t unweight_accumulator = 0; - size_t i = 0; - for (size_t m = 0; m <= rows(); m++) { - const size_t weight_width = filterbank_widths[m]; - for (size_t n = 0; n < weight_width; n++) { - weight_accumulator += uint64_t(input[i]) * uint64_t(weights[i * 2]); - unweight_accumulator += uint64_t(input[i]) * uint64_t(weights[i * 2 + 1]); - i += 1; - } - if (m != 0) { - output_ref[m - 1] = 
weight_accumulator; - } - weight_accumulator = unweight_accumulator; - unweight_accumulator = 0; - } - - // Call optimized micro-kernel. - filterbank_accumulate(rows(), input.data(), filterbank_widths.data(), weights.data(), output.data()); - - // Verify results. - for (size_t m = 0; m < rows(); m++) { - EXPECT_EQ(output[m], output_ref[m]) - << "at row " << m << " / " << rows(); - } - } - } - - private: - size_t rows_{1}; - size_t iterations_{15}; -}; diff --git a/test/filterbank-subtract-microkernel-tester.h b/test/filterbank-subtract-microkernel-tester.h deleted file mode 100644 index 291457ed08d..00000000000 --- a/test/filterbank-subtract-microkernel-tester.h +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/microfnptr.h" -#include "replicable_random_device.h" - -class FilterbankSubtractMicrokernelTester { - public: - - FilterbankSubtractMicrokernelTester& batch(size_t batch) { - assert(batch != 0); - this->batch_ = batch; - return *this; - } - - size_t batch() const { - return this->batch_; - } - - FilterbankSubtractMicrokernelTester& inplace(bool inplace) { - this->inplace_ = inplace; - return *this; - } - - bool inplace() const { - return this->inplace_; - } - - FilterbankSubtractMicrokernelTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { - return this->iterations_; - } - - void Test(xnn_u32_filterbank_subtract_ukernel_fn filterbank_subtract) const { - xnnpack::ReplicableRandomDevice rng; - auto u32rng = std::bind(std::uniform_int_distribution(), std::ref(rng)); - const uint32_t smoothing = 655; - const uint32_t alternate_smoothing = 655; 
- const uint32_t one_minus_smoothing = 15729; - const uint32_t alternate_one_minus_smoothing = 15729; - const uint32_t min_signal_remaining = 819; - const uint32_t smoothing_bits = 0; - const uint32_t spectral_subtraction_bits = 14; - - std::vector> x(batch() + XNN_EXTRA_BYTES / sizeof(uint32_t)); - std::vector> noise(batch() + XNN_EXTRA_BYTES / sizeof(uint32_t)); - std::vector> noise_ref(batch() + XNN_EXTRA_BYTES / sizeof(uint32_t)); - std::vector> y(batch() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint32_t) : 0)); - std::vector> y_ref(batch()); - const uint32_t* x_data = inplace() ? y.data() : x.data(); - - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(x.begin(), x.end(), std::ref(u32rng)); - std::iota(noise.begin(), noise.end(), 0); - std::iota(noise_ref.begin(), noise_ref.end(), 0); - std::generate(y.begin(), y.end(), std::ref(u32rng)); - std::generate(y_ref.begin(), y_ref.end(), std::ref(u32rng)); - - for (size_t n = 0; n < batch(); n += 2) { - const uint32_t vinput0 = x_data[n + 0]; - const uint32_t vinput1 = x_data[n + 1]; - - uint32_t vnoise_estimate0 = noise_ref[n + 0]; - uint32_t vnoise_estimate1 = noise_ref[n + 1]; - - // Scale up signa for smoothing filter computation. - const uint32_t vsignal_scaled_up0 = vinput0 << smoothing_bits; - const uint32_t vsignal_scaled_up1 = vinput1 << smoothing_bits; - - vnoise_estimate0 = (((uint64_t) (vsignal_scaled_up0) * smoothing) + - ((uint64_t) (vnoise_estimate0) * one_minus_smoothing)) >> spectral_subtraction_bits; - vnoise_estimate1 = (((uint64_t) (vsignal_scaled_up1) * alternate_smoothing) + - ((uint64_t) (vnoise_estimate1) * alternate_one_minus_smoothing)) >> spectral_subtraction_bits; - - noise_ref[n + 0] = vnoise_estimate0; - noise_ref[n + 1] = vnoise_estimate1; - - // Make sure that we can't get a negative value for the signal - estimate. 
- const uint32_t estimate_scaled_up0 = std::min(vnoise_estimate0, vsignal_scaled_up0); - const uint32_t estimate_scaled_up1 = std::min(vnoise_estimate1, vsignal_scaled_up1); - const uint32_t vsubtracted0 = (vsignal_scaled_up0 - estimate_scaled_up0) >> smoothing_bits; - const uint32_t vsubtracted1 = (vsignal_scaled_up1 - estimate_scaled_up1) >> smoothing_bits; - - const uint32_t vfloor0 = ((uint64_t) (vinput0) * min_signal_remaining) >> spectral_subtraction_bits; - const uint32_t vfloor1 = ((uint64_t) (vinput1) * min_signal_remaining) >> spectral_subtraction_bits; - const uint32_t vout0 = std::max(vsubtracted0, vfloor0); - const uint32_t vout1 = std::max(vsubtracted1, vfloor1); - - y_ref[n + 0] = vout0; - y_ref[n + 1] = vout1; - } - - // Call optimized micro-kernel. - filterbank_subtract(batch(), x_data, - smoothing, alternate_smoothing, one_minus_smoothing, alternate_one_minus_smoothing, - min_signal_remaining, smoothing_bits, spectral_subtraction_bits, - noise.data(), y.data()); - - // Verify results. - for (size_t n = 0; n < batch(); n++) { - EXPECT_EQ(y[n], y_ref[n]) - << "at n " << n << " / " << batch(); - EXPECT_EQ(noise[n], noise_ref[n]) - << "at n " << n << " / " << batch(); - } - } - } - - private: - size_t batch_{48}; - bool inplace_{false}; - size_t iterations_{15}; -}; diff --git a/test/i16-vlshift.cc b/test/i16-vlshift.cc deleted file mode 100644 index 731214b31db..00000000000 --- a/test/i16-vlshift.cc +++ /dev/null @@ -1,431 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: test/i16-vlshift.yaml -// Generator: tools/generate-vlshift-test.py - - -#include -#include "xnnpack/common.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/vlshift.h" -#include "vlshift-microkernel-tester.h" - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(I16_VLSHIFT__NEON_U8, batch_eq_8) { - TEST_REQUIRES_ARM_NEON; - VLShiftMicrokernelTester() - .batch(8) - .Test(xnn_i16_vlshift_ukernel__neon_u8); - } - - TEST(I16_VLSHIFT__NEON_U8, batch_div_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 16; batch < 80; batch += 8) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__neon_u8); - } - } - - TEST(I16_VLSHIFT__NEON_U8, batch_lt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 1; batch < 8; batch++) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__neon_u8); - } - } - - TEST(I16_VLSHIFT__NEON_U8, batch_gt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 9; batch < 16; batch++) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__neon_u8); - } - } - - TEST(I16_VLSHIFT__NEON_U8, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 1; batch <= 40; batch += 7) { - VLShiftMicrokernelTester() - .batch(batch) - .inplace(true) - .iterations(1) - .Test(xnn_i16_vlshift_ukernel__neon_u8); - } - } - - TEST(I16_VLSHIFT__NEON_U8, shift) { - TEST_REQUIRES_ARM_NEON; - for (uint32_t shift = 0; shift < 16; shift++) { - VLShiftMicrokernelTester() - .batch(8) - .shift(shift) - .Test(xnn_i16_vlshift_ukernel__neon_u8); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(I16_VLSHIFT__NEON_U16, batch_eq_16) { - TEST_REQUIRES_ARM_NEON; - VLShiftMicrokernelTester() - .batch(16) - .Test(xnn_i16_vlshift_ukernel__neon_u16); - } - - TEST(I16_VLSHIFT__NEON_U16, batch_div_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 32; batch < 160; batch += 16) { - VLShiftMicrokernelTester() - .batch(batch) - 
.Test(xnn_i16_vlshift_ukernel__neon_u16); - } - } - - TEST(I16_VLSHIFT__NEON_U16, batch_lt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 1; batch < 16; batch++) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__neon_u16); - } - } - - TEST(I16_VLSHIFT__NEON_U16, batch_gt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 17; batch < 32; batch++) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__neon_u16); - } - } - - TEST(I16_VLSHIFT__NEON_U16, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 1; batch <= 80; batch += 15) { - VLShiftMicrokernelTester() - .batch(batch) - .inplace(true) - .iterations(1) - .Test(xnn_i16_vlshift_ukernel__neon_u16); - } - } - - TEST(I16_VLSHIFT__NEON_U16, shift) { - TEST_REQUIRES_ARM_NEON; - for (uint32_t shift = 0; shift < 16; shift++) { - VLShiftMicrokernelTester() - .batch(16) - .shift(shift) - .Test(xnn_i16_vlshift_ukernel__neon_u16); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(I16_VLSHIFT__NEON_U24, batch_eq_24) { - TEST_REQUIRES_ARM_NEON; - VLShiftMicrokernelTester() - .batch(24) - .Test(xnn_i16_vlshift_ukernel__neon_u24); - } - - TEST(I16_VLSHIFT__NEON_U24, batch_div_24) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 48; batch < 240; batch += 24) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__neon_u24); - } - } - - TEST(I16_VLSHIFT__NEON_U24, batch_lt_24) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 1; batch < 24; batch++) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__neon_u24); - } - } - - TEST(I16_VLSHIFT__NEON_U24, batch_gt_24) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 25; batch < 48; batch++) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__neon_u24); - } - } - - TEST(I16_VLSHIFT__NEON_U24, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 1; batch <= 120; batch += 23) { - 
VLShiftMicrokernelTester() - .batch(batch) - .inplace(true) - .iterations(1) - .Test(xnn_i16_vlshift_ukernel__neon_u24); - } - } - - TEST(I16_VLSHIFT__NEON_U24, shift) { - TEST_REQUIRES_ARM_NEON; - for (uint32_t shift = 0; shift < 16; shift++) { - VLShiftMicrokernelTester() - .batch(24) - .shift(shift) - .Test(xnn_i16_vlshift_ukernel__neon_u24); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(I16_VLSHIFT__NEON_U32, batch_eq_32) { - TEST_REQUIRES_ARM_NEON; - VLShiftMicrokernelTester() - .batch(32) - .Test(xnn_i16_vlshift_ukernel__neon_u32); - } - - TEST(I16_VLSHIFT__NEON_U32, batch_div_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 64; batch < 320; batch += 32) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__neon_u32); - } - } - - TEST(I16_VLSHIFT__NEON_U32, batch_lt_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 1; batch < 32; batch++) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__neon_u32); - } - } - - TEST(I16_VLSHIFT__NEON_U32, batch_gt_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 33; batch < 64; batch++) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__neon_u32); - } - } - - TEST(I16_VLSHIFT__NEON_U32, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 1; batch <= 160; batch += 31) { - VLShiftMicrokernelTester() - .batch(batch) - .inplace(true) - .iterations(1) - .Test(xnn_i16_vlshift_ukernel__neon_u32); - } - } - - TEST(I16_VLSHIFT__NEON_U32, shift) { - TEST_REQUIRES_ARM_NEON; - for (uint32_t shift = 0; shift < 16; shift++) { - VLShiftMicrokernelTester() - .batch(32) - .shift(shift) - .Test(xnn_i16_vlshift_ukernel__neon_u32); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -TEST(I16_VLSHIFT__SCALAR_U1, batch_eq_1) { - VLShiftMicrokernelTester() - .batch(1) - .Test(xnn_i16_vlshift_ukernel__scalar_u1); -} - -TEST(I16_VLSHIFT__SCALAR_U1, batch_gt_1) { - for (size_t batch = 2; batch < 
10; batch++) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__scalar_u1); - } -} - -TEST(I16_VLSHIFT__SCALAR_U1, inplace) { - for (size_t batch = 1; batch <= 5; batch += 1) { - VLShiftMicrokernelTester() - .batch(batch) - .inplace(true) - .iterations(1) - .Test(xnn_i16_vlshift_ukernel__scalar_u1); - } -} - -TEST(I16_VLSHIFT__SCALAR_U1, shift) { - for (uint32_t shift = 0; shift < 16; shift++) { - VLShiftMicrokernelTester() - .batch(1) - .shift(shift) - .Test(xnn_i16_vlshift_ukernel__scalar_u1); - } -} - - -TEST(I16_VLSHIFT__SCALAR_U2, batch_eq_2) { - VLShiftMicrokernelTester() - .batch(2) - .Test(xnn_i16_vlshift_ukernel__scalar_u2); -} - -TEST(I16_VLSHIFT__SCALAR_U2, batch_div_2) { - for (size_t batch = 4; batch < 20; batch += 2) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__scalar_u2); - } -} - -TEST(I16_VLSHIFT__SCALAR_U2, batch_lt_2) { - for (size_t batch = 1; batch < 2; batch++) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__scalar_u2); - } -} - -TEST(I16_VLSHIFT__SCALAR_U2, batch_gt_2) { - for (size_t batch = 3; batch < 4; batch++) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__scalar_u2); - } -} - -TEST(I16_VLSHIFT__SCALAR_U2, inplace) { - for (size_t batch = 1; batch <= 10; batch += 1) { - VLShiftMicrokernelTester() - .batch(batch) - .inplace(true) - .iterations(1) - .Test(xnn_i16_vlshift_ukernel__scalar_u2); - } -} - -TEST(I16_VLSHIFT__SCALAR_U2, shift) { - for (uint32_t shift = 0; shift < 16; shift++) { - VLShiftMicrokernelTester() - .batch(2) - .shift(shift) - .Test(xnn_i16_vlshift_ukernel__scalar_u2); - } -} - - -TEST(I16_VLSHIFT__SCALAR_U3, batch_eq_3) { - VLShiftMicrokernelTester() - .batch(3) - .Test(xnn_i16_vlshift_ukernel__scalar_u3); -} - -TEST(I16_VLSHIFT__SCALAR_U3, batch_div_3) { - for (size_t batch = 6; batch < 30; batch += 3) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__scalar_u3); 
- } -} - -TEST(I16_VLSHIFT__SCALAR_U3, batch_lt_3) { - for (size_t batch = 1; batch < 3; batch++) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__scalar_u3); - } -} - -TEST(I16_VLSHIFT__SCALAR_U3, batch_gt_3) { - for (size_t batch = 4; batch < 6; batch++) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__scalar_u3); - } -} - -TEST(I16_VLSHIFT__SCALAR_U3, inplace) { - for (size_t batch = 1; batch <= 15; batch += 2) { - VLShiftMicrokernelTester() - .batch(batch) - .inplace(true) - .iterations(1) - .Test(xnn_i16_vlshift_ukernel__scalar_u3); - } -} - -TEST(I16_VLSHIFT__SCALAR_U3, shift) { - for (uint32_t shift = 0; shift < 16; shift++) { - VLShiftMicrokernelTester() - .batch(3) - .shift(shift) - .Test(xnn_i16_vlshift_ukernel__scalar_u3); - } -} - - -TEST(I16_VLSHIFT__SCALAR_U4, batch_eq_4) { - VLShiftMicrokernelTester() - .batch(4) - .Test(xnn_i16_vlshift_ukernel__scalar_u4); -} - -TEST(I16_VLSHIFT__SCALAR_U4, batch_div_4) { - for (size_t batch = 8; batch < 40; batch += 4) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__scalar_u4); - } -} - -TEST(I16_VLSHIFT__SCALAR_U4, batch_lt_4) { - for (size_t batch = 1; batch < 4; batch++) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__scalar_u4); - } -} - -TEST(I16_VLSHIFT__SCALAR_U4, batch_gt_4) { - for (size_t batch = 5; batch < 8; batch++) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(xnn_i16_vlshift_ukernel__scalar_u4); - } -} - -TEST(I16_VLSHIFT__SCALAR_U4, inplace) { - for (size_t batch = 1; batch <= 20; batch += 3) { - VLShiftMicrokernelTester() - .batch(batch) - .inplace(true) - .iterations(1) - .Test(xnn_i16_vlshift_ukernel__scalar_u4); - } -} - -TEST(I16_VLSHIFT__SCALAR_U4, shift) { - for (uint32_t shift = 0; shift < 16; shift++) { - VLShiftMicrokernelTester() - .batch(4) - .shift(shift) - .Test(xnn_i16_vlshift_ukernel__scalar_u4); - } -} diff --git a/test/i16-vlshift.yaml 
b/test/i16-vlshift.yaml deleted file mode 100644 index dbde4b6b3fe..00000000000 --- a/test/i16-vlshift.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - - -# ARM NEON -- name: xnn_i16_vlshift_ukernel__neon_u8 -- name: xnn_i16_vlshift_ukernel__neon_u16 -- name: xnn_i16_vlshift_ukernel__neon_u24 -- name: xnn_i16_vlshift_ukernel__neon_u32 - -# Scalar -- name: xnn_i16_vlshift_ukernel__scalar_u1 -- name: xnn_i16_vlshift_ukernel__scalar_u2 -- name: xnn_i16_vlshift_ukernel__scalar_u3 -- name: xnn_i16_vlshift_ukernel__scalar_u4 diff --git a/test/rmaxabs-microkernel-tester.h b/test/rmaxabs-microkernel-tester.h deleted file mode 100644 index df8aac3e17b..00000000000 --- a/test/rmaxabs-microkernel-tester.h +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/microfnptr.h" -#include "replicable_random_device.h" - -class RMaxAbsMicrokernelTester { - public: - - RMaxAbsMicrokernelTester& batch(size_t batch) { - assert(batch != 0); - this->batch_ = batch; - return *this; - } - - size_t batch() const { - return this->batch_; - } - - RMaxAbsMicrokernelTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { - return this->iterations_; - } - - void Test(xnn_s16_rmaxabs_ukernel_fn rmaxabs) const { - xnnpack::ReplicableRandomDevice rng; - auto i16rng = std::bind(std::uniform_int_distribution(), std::ref(rng)); - - std::vector input(batch() + XNN_EXTRA_BYTES / sizeof(int16_t)); - - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), std::ref(i16rng)); - - // Compute reference results. - int32_t output_ref = 0; - for (size_t n = 0; n < batch(); n++) { - const int32_t input_value = static_cast(input[n]); - const int32_t abs_value = std::abs(input_value); - output_ref = std::max(output_ref, abs_value); - } - - // Call optimized micro-kernel. - uint16_t output = UINT16_C(0xDEAD); - rmaxabs(batch() * sizeof(int16_t), input.data(), &output); - - // Verify results. - ASSERT_EQ(static_cast(output), output_ref) - << "batch " << batch(); - } - } - - private: - size_t batch_{1}; - size_t iterations_{15}; -}; diff --git a/test/s16-rmaxabs.cc b/test/s16-rmaxabs.cc deleted file mode 100644 index 1c6a4a16e23..00000000000 --- a/test/s16-rmaxabs.cc +++ /dev/null @@ -1,271 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: test/s16-rmaxabs.yaml -// Generator: tools/generate-rmaxabs-test.py - - -#include -#include "xnnpack/common.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/rmaxabs.h" -#include "rmaxabs-microkernel-tester.h" - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(S16_RMAXABS__NEON_X8, batch_eq_8) { - TEST_REQUIRES_ARM_NEON; - RMaxAbsMicrokernelTester() - .batch(8) - .Test(xnn_s16_rmaxabs_ukernel__neon_x8); - } - - TEST(S16_RMAXABS__NEON_X8, batch_div_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 16; batch < 80; batch += 8) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__neon_x8); - } - } - - TEST(S16_RMAXABS__NEON_X8, batch_lt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 1; batch < 8; batch++) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__neon_x8); - } - } - - TEST(S16_RMAXABS__NEON_X8, batch_gt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 9; batch < 16; batch++) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__neon_x8); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(S16_RMAXABS__NEON_X16, batch_eq_16) { - TEST_REQUIRES_ARM_NEON; - RMaxAbsMicrokernelTester() - .batch(16) - .Test(xnn_s16_rmaxabs_ukernel__neon_x16); - } - - TEST(S16_RMAXABS__NEON_X16, batch_div_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 32; batch < 160; batch += 16) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__neon_x16); - } - } - - TEST(S16_RMAXABS__NEON_X16, batch_lt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 1; batch < 16; batch++) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__neon_x16); - } - } - - TEST(S16_RMAXABS__NEON_X16, batch_gt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 17; batch < 32; batch++) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__neon_x16); - } - } -#endif // 
XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(S16_RMAXABS__NEON_X24, batch_eq_24) { - TEST_REQUIRES_ARM_NEON; - RMaxAbsMicrokernelTester() - .batch(24) - .Test(xnn_s16_rmaxabs_ukernel__neon_x24); - } - - TEST(S16_RMAXABS__NEON_X24, batch_div_24) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 48; batch < 240; batch += 24) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__neon_x24); - } - } - - TEST(S16_RMAXABS__NEON_X24, batch_lt_24) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 1; batch < 24; batch++) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__neon_x24); - } - } - - TEST(S16_RMAXABS__NEON_X24, batch_gt_24) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 25; batch < 48; batch++) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__neon_x24); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(S16_RMAXABS__NEON_X32, batch_eq_32) { - TEST_REQUIRES_ARM_NEON; - RMaxAbsMicrokernelTester() - .batch(32) - .Test(xnn_s16_rmaxabs_ukernel__neon_x32); - } - - TEST(S16_RMAXABS__NEON_X32, batch_div_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 64; batch < 320; batch += 32) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__neon_x32); - } - } - - TEST(S16_RMAXABS__NEON_X32, batch_lt_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 1; batch < 32; batch++) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__neon_x32); - } - } - - TEST(S16_RMAXABS__NEON_X32, batch_gt_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch = 33; batch < 64; batch++) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__neon_x32); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -TEST(S16_RMAXABS__SCALAR_X1, batch_eq_1) { - RMaxAbsMicrokernelTester() - .batch(1) - .Test(xnn_s16_rmaxabs_ukernel__scalar_x1); -} - 
-TEST(S16_RMAXABS__SCALAR_X1, batch_gt_1) { - for (size_t batch = 2; batch < 10; batch++) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__scalar_x1); - } -} - - -TEST(S16_RMAXABS__SCALAR_X2, batch_eq_2) { - RMaxAbsMicrokernelTester() - .batch(2) - .Test(xnn_s16_rmaxabs_ukernel__scalar_x2); -} - -TEST(S16_RMAXABS__SCALAR_X2, batch_div_2) { - for (size_t batch = 4; batch < 20; batch += 2) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__scalar_x2); - } -} - -TEST(S16_RMAXABS__SCALAR_X2, batch_lt_2) { - for (size_t batch = 1; batch < 2; batch++) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__scalar_x2); - } -} - -TEST(S16_RMAXABS__SCALAR_X2, batch_gt_2) { - for (size_t batch = 3; batch < 4; batch++) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__scalar_x2); - } -} - - -TEST(S16_RMAXABS__SCALAR_X3, batch_eq_3) { - RMaxAbsMicrokernelTester() - .batch(3) - .Test(xnn_s16_rmaxabs_ukernel__scalar_x3); -} - -TEST(S16_RMAXABS__SCALAR_X3, batch_div_3) { - for (size_t batch = 6; batch < 30; batch += 3) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__scalar_x3); - } -} - -TEST(S16_RMAXABS__SCALAR_X3, batch_lt_3) { - for (size_t batch = 1; batch < 3; batch++) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__scalar_x3); - } -} - -TEST(S16_RMAXABS__SCALAR_X3, batch_gt_3) { - for (size_t batch = 4; batch < 6; batch++) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__scalar_x3); - } -} - - -TEST(S16_RMAXABS__SCALAR_X4, batch_eq_4) { - RMaxAbsMicrokernelTester() - .batch(4) - .Test(xnn_s16_rmaxabs_ukernel__scalar_x4); -} - -TEST(S16_RMAXABS__SCALAR_X4, batch_div_4) { - for (size_t batch = 8; batch < 40; batch += 4) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__scalar_x4); - } -} - -TEST(S16_RMAXABS__SCALAR_X4, batch_lt_4) { - for 
(size_t batch = 1; batch < 4; batch++) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__scalar_x4); - } -} - -TEST(S16_RMAXABS__SCALAR_X4, batch_gt_4) { - for (size_t batch = 5; batch < 8; batch++) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(xnn_s16_rmaxabs_ukernel__scalar_x4); - } -} diff --git a/test/s16-rmaxabs.yaml b/test/s16-rmaxabs.yaml deleted file mode 100644 index 794ce3ecdcb..00000000000 --- a/test/s16-rmaxabs.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - - -# ARM NEON -- name: xnn_s16_rmaxabs_ukernel__neon_x8 -- name: xnn_s16_rmaxabs_ukernel__neon_x16 -- name: xnn_s16_rmaxabs_ukernel__neon_x24 -- name: xnn_s16_rmaxabs_ukernel__neon_x32 - -# Scalar -- name: xnn_s16_rmaxabs_ukernel__scalar_x1 -- name: xnn_s16_rmaxabs_ukernel__scalar_x2 -- name: xnn_s16_rmaxabs_ukernel__scalar_x3 -- name: xnn_s16_rmaxabs_ukernel__scalar_x4 diff --git a/test/s16-window.cc b/test/s16-window.cc deleted file mode 100644 index f8a3ec5e381..00000000000 --- a/test/s16-window.cc +++ /dev/null @@ -1,1166 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: test/s16-window.yaml -// Generator: tools/generate-window-test.py - - -#include -#include "xnnpack/common.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/window.h" -#include "window-microkernel-tester.h" - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(S16_WINDOW_SHIFT12__NEON_U8, channels_eq_8) { - TEST_REQUIRES_ARM_NEON; - WindowMicrokernelTester() - .rows(1) - .channels(8) - .shift(12) - .Test(xnn_s16_window_shift12_ukernel__neon_u8); - } - - TEST(S16_WINDOW_SHIFT12__NEON_U8, channels_div_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 80; channels += 8) { - WindowMicrokernelTester() - .channels(channels) - .shift(12) - .Test(xnn_s16_window_shift12_ukernel__neon_u8); - } - } - - TEST(S16_WINDOW_SHIFT12__NEON_U8, channels_lt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(12) - .Test(xnn_s16_window_shift12_ukernel__neon_u8); - } - } - - TEST(S16_WINDOW_SHIFT12__NEON_U8, channels_gt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(12) - .Test(xnn_s16_window_shift12_ukernel__neon_u8); - } - } - - TEST(S16_WINDOW_SHIFT12__NEON_U8, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(12) - .Test(xnn_s16_window_shift12_ukernel__neon_u8); - } - } - } - - TEST(S16_WINDOW_SHIFT12__NEON_U8, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(12) - .inplace(true) - .iterations(1) - .Test(xnn_s16_window_shift12_ukernel__neon_u8); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || 
XNN_ARCH_ARM64 - TEST(S16_WINDOW_SHIFT12__NEON_U16, channels_eq_16) { - TEST_REQUIRES_ARM_NEON; - WindowMicrokernelTester() - .rows(1) - .channels(16) - .shift(12) - .Test(xnn_s16_window_shift12_ukernel__neon_u16); - } - - TEST(S16_WINDOW_SHIFT12__NEON_U16, channels_div_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 160; channels += 16) { - WindowMicrokernelTester() - .channels(channels) - .shift(12) - .Test(xnn_s16_window_shift12_ukernel__neon_u16); - } - } - - TEST(S16_WINDOW_SHIFT12__NEON_U16, channels_lt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(12) - .Test(xnn_s16_window_shift12_ukernel__neon_u16); - } - } - - TEST(S16_WINDOW_SHIFT12__NEON_U16, channels_gt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(12) - .Test(xnn_s16_window_shift12_ukernel__neon_u16); - } - } - - TEST(S16_WINDOW_SHIFT12__NEON_U16, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(12) - .Test(xnn_s16_window_shift12_ukernel__neon_u16); - } - } - } - - TEST(S16_WINDOW_SHIFT12__NEON_U16, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(12) - .inplace(true) - .iterations(1) - .Test(xnn_s16_window_shift12_ukernel__neon_u16); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(S16_WINDOW_SHIFT12__NEON_U24, channels_eq_24) { - TEST_REQUIRES_ARM_NEON; - WindowMicrokernelTester() - .rows(1) - .channels(24) - .shift(12) - .Test(xnn_s16_window_shift12_ukernel__neon_u24); - } - - 
TEST(S16_WINDOW_SHIFT12__NEON_U24, channels_div_24) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 240; channels += 24) { - WindowMicrokernelTester() - .channels(channels) - .shift(12) - .Test(xnn_s16_window_shift12_ukernel__neon_u24); - } - } - - TEST(S16_WINDOW_SHIFT12__NEON_U24, channels_lt_24) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(12) - .Test(xnn_s16_window_shift12_ukernel__neon_u24); - } - } - - TEST(S16_WINDOW_SHIFT12__NEON_U24, channels_gt_24) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(12) - .Test(xnn_s16_window_shift12_ukernel__neon_u24); - } - } - - TEST(S16_WINDOW_SHIFT12__NEON_U24, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 120; channels += 23) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(12) - .Test(xnn_s16_window_shift12_ukernel__neon_u24); - } - } - } - - TEST(S16_WINDOW_SHIFT12__NEON_U24, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 120; channels += 23) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(12) - .inplace(true) - .iterations(1) - .Test(xnn_s16_window_shift12_ukernel__neon_u24); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(S16_WINDOW_SHIFT12__NEON_U32, channels_eq_32) { - TEST_REQUIRES_ARM_NEON; - WindowMicrokernelTester() - .rows(1) - .channels(32) - .shift(12) - .Test(xnn_s16_window_shift12_ukernel__neon_u32); - } - - TEST(S16_WINDOW_SHIFT12__NEON_U32, channels_div_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 320; channels += 32) { - WindowMicrokernelTester() - .channels(channels) - .shift(12) - 
.Test(xnn_s16_window_shift12_ukernel__neon_u32); - } - } - - TEST(S16_WINDOW_SHIFT12__NEON_U32, channels_lt_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(12) - .Test(xnn_s16_window_shift12_ukernel__neon_u32); - } - } - - TEST(S16_WINDOW_SHIFT12__NEON_U32, channels_gt_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(12) - .Test(xnn_s16_window_shift12_ukernel__neon_u32); - } - } - - TEST(S16_WINDOW_SHIFT12__NEON_U32, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 160; channels += 31) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(12) - .Test(xnn_s16_window_shift12_ukernel__neon_u32); - } - } - } - - TEST(S16_WINDOW_SHIFT12__NEON_U32, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 160; channels += 31) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(12) - .inplace(true) - .iterations(1) - .Test(xnn_s16_window_shift12_ukernel__neon_u32); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(S16_WINDOW_SHIFT15__NEON_U8, channels_eq_8) { - TEST_REQUIRES_ARM_NEON; - WindowMicrokernelTester() - .rows(1) - .channels(8) - .shift(15) - .Test(xnn_s16_window_shift15_ukernel__neon_u8); - } - - TEST(S16_WINDOW_SHIFT15__NEON_U8, channels_div_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 80; channels += 8) { - WindowMicrokernelTester() - .channels(channels) - .shift(15) - .Test(xnn_s16_window_shift15_ukernel__neon_u8); - } - } - - TEST(S16_WINDOW_SHIFT15__NEON_U8, channels_lt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - WindowMicrokernelTester() - 
.channels(channels) - .shift(15) - .Test(xnn_s16_window_shift15_ukernel__neon_u8); - } - } - - TEST(S16_WINDOW_SHIFT15__NEON_U8, channels_gt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(15) - .Test(xnn_s16_window_shift15_ukernel__neon_u8); - } - } - - TEST(S16_WINDOW_SHIFT15__NEON_U8, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(15) - .Test(xnn_s16_window_shift15_ukernel__neon_u8); - } - } - } - - TEST(S16_WINDOW_SHIFT15__NEON_U8, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(15) - .inplace(true) - .iterations(1) - .Test(xnn_s16_window_shift15_ukernel__neon_u8); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(S16_WINDOW_SHIFT15__NEON_U16, channels_eq_16) { - TEST_REQUIRES_ARM_NEON; - WindowMicrokernelTester() - .rows(1) - .channels(16) - .shift(15) - .Test(xnn_s16_window_shift15_ukernel__neon_u16); - } - - TEST(S16_WINDOW_SHIFT15__NEON_U16, channels_div_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 160; channels += 16) { - WindowMicrokernelTester() - .channels(channels) - .shift(15) - .Test(xnn_s16_window_shift15_ukernel__neon_u16); - } - } - - TEST(S16_WINDOW_SHIFT15__NEON_U16, channels_lt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(15) - .Test(xnn_s16_window_shift15_ukernel__neon_u16); - } - } - - TEST(S16_WINDOW_SHIFT15__NEON_U16, channels_gt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - 
WindowMicrokernelTester() - .channels(channels) - .shift(15) - .Test(xnn_s16_window_shift15_ukernel__neon_u16); - } - } - - TEST(S16_WINDOW_SHIFT15__NEON_U16, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(15) - .Test(xnn_s16_window_shift15_ukernel__neon_u16); - } - } - } - - TEST(S16_WINDOW_SHIFT15__NEON_U16, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(15) - .inplace(true) - .iterations(1) - .Test(xnn_s16_window_shift15_ukernel__neon_u16); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(S16_WINDOW_SHIFT15__NEON_U24, channels_eq_24) { - TEST_REQUIRES_ARM_NEON; - WindowMicrokernelTester() - .rows(1) - .channels(24) - .shift(15) - .Test(xnn_s16_window_shift15_ukernel__neon_u24); - } - - TEST(S16_WINDOW_SHIFT15__NEON_U24, channels_div_24) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 240; channels += 24) { - WindowMicrokernelTester() - .channels(channels) - .shift(15) - .Test(xnn_s16_window_shift15_ukernel__neon_u24); - } - } - - TEST(S16_WINDOW_SHIFT15__NEON_U24, channels_lt_24) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(15) - .Test(xnn_s16_window_shift15_ukernel__neon_u24); - } - } - - TEST(S16_WINDOW_SHIFT15__NEON_U24, channels_gt_24) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(15) - .Test(xnn_s16_window_shift15_ukernel__neon_u24); - } - } - - TEST(S16_WINDOW_SHIFT15__NEON_U24, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 2; 
rows++) { - for (size_t channels = 1; channels <= 120; channels += 23) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(15) - .Test(xnn_s16_window_shift15_ukernel__neon_u24); - } - } - } - - TEST(S16_WINDOW_SHIFT15__NEON_U24, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 120; channels += 23) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(15) - .inplace(true) - .iterations(1) - .Test(xnn_s16_window_shift15_ukernel__neon_u24); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(S16_WINDOW_SHIFT15__NEON_U32, channels_eq_32) { - TEST_REQUIRES_ARM_NEON; - WindowMicrokernelTester() - .rows(1) - .channels(32) - .shift(15) - .Test(xnn_s16_window_shift15_ukernel__neon_u32); - } - - TEST(S16_WINDOW_SHIFT15__NEON_U32, channels_div_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 320; channels += 32) { - WindowMicrokernelTester() - .channels(channels) - .shift(15) - .Test(xnn_s16_window_shift15_ukernel__neon_u32); - } - } - - TEST(S16_WINDOW_SHIFT15__NEON_U32, channels_lt_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(15) - .Test(xnn_s16_window_shift15_ukernel__neon_u32); - } - } - - TEST(S16_WINDOW_SHIFT15__NEON_U32, channels_gt_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(15) - .Test(xnn_s16_window_shift15_ukernel__neon_u32); - } - } - - TEST(S16_WINDOW_SHIFT15__NEON_U32, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 160; channels += 31) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(15) - .Test(xnn_s16_window_shift15_ukernel__neon_u32); - } - } - } - - 
TEST(S16_WINDOW_SHIFT15__NEON_U32, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 160; channels += 31) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(15) - .inplace(true) - .iterations(1) - .Test(xnn_s16_window_shift15_ukernel__neon_u32); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(S16_WINDOW__NEON_U8, channels_eq_8) { - TEST_REQUIRES_ARM_NEON; - WindowMicrokernelTester() - .rows(1) - .channels(8) - .shift(0) - .Test(xnn_s16_window_ukernel__neon_u8); - } - - TEST(S16_WINDOW__NEON_U8, channels_div_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 80; channels += 8) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__neon_u8); - } - } - - TEST(S16_WINDOW__NEON_U8, channels_lt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__neon_u8); - } - } - - TEST(S16_WINDOW__NEON_U8, channels_gt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__neon_u8); - } - } - - TEST(S16_WINDOW__NEON_U8, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__neon_u8); - } - } - } - - TEST(S16_WINDOW__NEON_U8, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(0) - .inplace(true) - .iterations(1) - .Test(xnn_s16_window_ukernel__neon_u8); - } - } - } - - 
TEST(S16_WINDOW__NEON_U8, shift) { - TEST_REQUIRES_ARM_NEON; - for (uint32_t shift = 0; shift < 32; shift++) { - WindowMicrokernelTester() - .rows(1) - .channels(8) - .shift(shift) - .Test(xnn_s16_window_ukernel__neon_u8); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(S16_WINDOW__NEON_U16, channels_eq_16) { - TEST_REQUIRES_ARM_NEON; - WindowMicrokernelTester() - .rows(1) - .channels(16) - .shift(0) - .Test(xnn_s16_window_ukernel__neon_u16); - } - - TEST(S16_WINDOW__NEON_U16, channels_div_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 160; channels += 16) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__neon_u16); - } - } - - TEST(S16_WINDOW__NEON_U16, channels_lt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__neon_u16); - } - } - - TEST(S16_WINDOW__NEON_U16, channels_gt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__neon_u16); - } - } - - TEST(S16_WINDOW__NEON_U16, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__neon_u16); - } - } - } - - TEST(S16_WINDOW__NEON_U16, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(0) - .inplace(true) - .iterations(1) - .Test(xnn_s16_window_ukernel__neon_u16); - } - } - } - - TEST(S16_WINDOW__NEON_U16, shift) { - TEST_REQUIRES_ARM_NEON; - for (uint32_t shift = 0; shift < 32; 
shift++) { - WindowMicrokernelTester() - .rows(1) - .channels(16) - .shift(shift) - .Test(xnn_s16_window_ukernel__neon_u16); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(S16_WINDOW__NEON_U24, channels_eq_24) { - TEST_REQUIRES_ARM_NEON; - WindowMicrokernelTester() - .rows(1) - .channels(24) - .shift(0) - .Test(xnn_s16_window_ukernel__neon_u24); - } - - TEST(S16_WINDOW__NEON_U24, channels_div_24) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 240; channels += 24) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__neon_u24); - } - } - - TEST(S16_WINDOW__NEON_U24, channels_lt_24) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__neon_u24); - } - } - - TEST(S16_WINDOW__NEON_U24, channels_gt_24) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__neon_u24); - } - } - - TEST(S16_WINDOW__NEON_U24, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 120; channels += 23) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__neon_u24); - } - } - } - - TEST(S16_WINDOW__NEON_U24, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 120; channels += 23) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(0) - .inplace(true) - .iterations(1) - .Test(xnn_s16_window_ukernel__neon_u24); - } - } - } - - TEST(S16_WINDOW__NEON_U24, shift) { - TEST_REQUIRES_ARM_NEON; - for (uint32_t shift = 0; shift < 32; shift++) { - WindowMicrokernelTester() - .rows(1) - .channels(24) - .shift(shift) - 
.Test(xnn_s16_window_ukernel__neon_u24); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(S16_WINDOW__NEON_U32, channels_eq_32) { - TEST_REQUIRES_ARM_NEON; - WindowMicrokernelTester() - .rows(1) - .channels(32) - .shift(0) - .Test(xnn_s16_window_ukernel__neon_u32); - } - - TEST(S16_WINDOW__NEON_U32, channels_div_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 320; channels += 32) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__neon_u32); - } - } - - TEST(S16_WINDOW__NEON_U32, channels_lt_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__neon_u32); - } - } - - TEST(S16_WINDOW__NEON_U32, channels_gt_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__neon_u32); - } - } - - TEST(S16_WINDOW__NEON_U32, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 160; channels += 31) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__neon_u32); - } - } - } - - TEST(S16_WINDOW__NEON_U32, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 160; channels += 31) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(0) - .inplace(true) - .iterations(1) - .Test(xnn_s16_window_ukernel__neon_u32); - } - } - } - - TEST(S16_WINDOW__NEON_U32, shift) { - TEST_REQUIRES_ARM_NEON; - for (uint32_t shift = 0; shift < 32; shift++) { - WindowMicrokernelTester() - .rows(1) - .channels(32) - .shift(shift) - .Test(xnn_s16_window_ukernel__neon_u32); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - 
-TEST(S16_WINDOW__SCALAR_U1, channels_eq_1) { - WindowMicrokernelTester() - .rows(1) - .channels(1) - .shift(0) - .Test(xnn_s16_window_ukernel__scalar_u1); -} - -TEST(S16_WINDOW__SCALAR_U1, channels_gt_1) { - for (size_t channels = 2; channels < 10; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__scalar_u1); - } -} - -TEST(S16_WINDOW__SCALAR_U1, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 5; channels += 1) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__scalar_u1); - } - } -} - -TEST(S16_WINDOW__SCALAR_U1, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 5; channels += 1) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(0) - .inplace(true) - .iterations(1) - .Test(xnn_s16_window_ukernel__scalar_u1); - } - } -} - -TEST(S16_WINDOW__SCALAR_U1, shift) { - for (uint32_t shift = 0; shift < 32; shift++) { - WindowMicrokernelTester() - .rows(1) - .channels(1) - .shift(shift) - .Test(xnn_s16_window_ukernel__scalar_u1); - } -} - -TEST(S16_WINDOW__SCALAR_U2, channels_eq_2) { - WindowMicrokernelTester() - .rows(1) - .channels(2) - .shift(0) - .Test(xnn_s16_window_ukernel__scalar_u2); -} - -TEST(S16_WINDOW__SCALAR_U2, channels_div_2) { - for (size_t channels = 4; channels < 20; channels += 2) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__scalar_u2); - } -} - -TEST(S16_WINDOW__SCALAR_U2, channels_lt_2) { - for (size_t channels = 1; channels < 2; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__scalar_u2); - } -} - -TEST(S16_WINDOW__SCALAR_U2, channels_gt_2) { - for (size_t channels = 3; channels < 4; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__scalar_u2); - } -} 
- -TEST(S16_WINDOW__SCALAR_U2, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 10; channels += 1) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__scalar_u2); - } - } -} - -TEST(S16_WINDOW__SCALAR_U2, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 10; channels += 1) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(0) - .inplace(true) - .iterations(1) - .Test(xnn_s16_window_ukernel__scalar_u2); - } - } -} - -TEST(S16_WINDOW__SCALAR_U2, shift) { - for (uint32_t shift = 0; shift < 32; shift++) { - WindowMicrokernelTester() - .rows(1) - .channels(2) - .shift(shift) - .Test(xnn_s16_window_ukernel__scalar_u2); - } -} - -TEST(S16_WINDOW__SCALAR_U3, channels_eq_3) { - WindowMicrokernelTester() - .rows(1) - .channels(3) - .shift(0) - .Test(xnn_s16_window_ukernel__scalar_u3); -} - -TEST(S16_WINDOW__SCALAR_U3, channels_div_3) { - for (size_t channels = 6; channels < 30; channels += 3) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__scalar_u3); - } -} - -TEST(S16_WINDOW__SCALAR_U3, channels_lt_3) { - for (size_t channels = 1; channels < 3; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__scalar_u3); - } -} - -TEST(S16_WINDOW__SCALAR_U3, channels_gt_3) { - for (size_t channels = 4; channels < 6; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__scalar_u3); - } -} - -TEST(S16_WINDOW__SCALAR_U3, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 15; channels += 2) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__scalar_u3); - } - } -} - -TEST(S16_WINDOW__SCALAR_U3, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for 
(size_t channels = 1; channels <= 15; channels += 2) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(0) - .inplace(true) - .iterations(1) - .Test(xnn_s16_window_ukernel__scalar_u3); - } - } -} - -TEST(S16_WINDOW__SCALAR_U3, shift) { - for (uint32_t shift = 0; shift < 32; shift++) { - WindowMicrokernelTester() - .rows(1) - .channels(3) - .shift(shift) - .Test(xnn_s16_window_ukernel__scalar_u3); - } -} - -TEST(S16_WINDOW__SCALAR_U4, channels_eq_4) { - WindowMicrokernelTester() - .rows(1) - .channels(4) - .shift(0) - .Test(xnn_s16_window_ukernel__scalar_u4); -} - -TEST(S16_WINDOW__SCALAR_U4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__scalar_u4); - } -} - -TEST(S16_WINDOW__SCALAR_U4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__scalar_u4); - } -} - -TEST(S16_WINDOW__SCALAR_U4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__scalar_u4); - } -} - -TEST(S16_WINDOW__SCALAR_U4, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(0) - .Test(xnn_s16_window_ukernel__scalar_u4); - } - } -} - -TEST(S16_WINDOW__SCALAR_U4, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(0) - .inplace(true) - .iterations(1) - .Test(xnn_s16_window_ukernel__scalar_u4); - } - } -} - -TEST(S16_WINDOW__SCALAR_U4, shift) { - for (uint32_t shift = 0; shift < 32; shift++) { - WindowMicrokernelTester() - .rows(1) - .channels(4) - 
.shift(shift) - .Test(xnn_s16_window_ukernel__scalar_u4); - } -} \ No newline at end of file diff --git a/test/s16-window.yaml b/test/s16-window.yaml deleted file mode 100644 index 92fcd249718..00000000000 --- a/test/s16-window.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - - -# ARM NEON -- name: xnn_s16_window_shift12_ukernel__neon_u8 -- name: xnn_s16_window_shift12_ukernel__neon_u16 -- name: xnn_s16_window_shift12_ukernel__neon_u24 -- name: xnn_s16_window_shift12_ukernel__neon_u32 -- name: xnn_s16_window_shift15_ukernel__neon_u8 -- name: xnn_s16_window_shift15_ukernel__neon_u16 -- name: xnn_s16_window_shift15_ukernel__neon_u24 -- name: xnn_s16_window_shift15_ukernel__neon_u32 -- name: xnn_s16_window_ukernel__neon_u8 -- name: xnn_s16_window_ukernel__neon_u16 -- name: xnn_s16_window_ukernel__neon_u24 -- name: xnn_s16_window_ukernel__neon_u32 - -# Scalar -- name: xnn_s16_window_ukernel__scalar_u1 -- name: xnn_s16_window_ukernel__scalar_u2 -- name: xnn_s16_window_ukernel__scalar_u3 -- name: xnn_s16_window_ukernel__scalar_u4 diff --git a/test/u32-filterbank-accumulate.cc b/test/u32-filterbank-accumulate.cc deleted file mode 100644 index c1eee3af904..00000000000 --- a/test/u32-filterbank-accumulate.cc +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: test/u32-filterbank-accumulate.yaml -// Generator: tools/generate-filterbank-accumulate-test.py - - -#include -#include "xnnpack/common.h" -#include "xnnpack/filterbank.h" -#include "xnnpack/isa-checks.h" -#include "filterbank-accumulate-microkernel-tester.h" - - -#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - TEST(U32_FILTERBANK_ACCUMULATE__ASM_AARCH32_ARM_X1, rows_eq_1) { - FilterbankAccumulateMicrokernelTester() - .rows(1) - .Test(xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_arm_x1); - } - - TEST(U32_FILTERBANK_ACCUMULATE__ASM_AARCH32_ARM_X1, rows_gt_1) { - for (size_t rows = 2; rows < 10; rows++) { - FilterbankAccumulateMicrokernelTester() - .rows(rows) - .Test(xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_arm_x1); - } - } -#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - - -#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - TEST(U32_FILTERBANK_ACCUMULATE__ASM_AARCH32_NEON_X1, rows_eq_1) { - TEST_REQUIRES_ARM_NEON; - FilterbankAccumulateMicrokernelTester() - .rows(1) - .Test(xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_neon_x1); - } - - TEST(U32_FILTERBANK_ACCUMULATE__ASM_AARCH32_NEON_X1, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 10; rows++) { - FilterbankAccumulateMicrokernelTester() - .rows(rows) - .Test(xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_neon_x1); - } - } -#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - - -#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - TEST(U32_FILTERBANK_ACCUMULATE__ASM_AARCH32_NEON_X2, rows_eq_1) { - TEST_REQUIRES_ARM_NEON; - FilterbankAccumulateMicrokernelTester() - .rows(1) - .Test(xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_neon_x2); - } - - TEST(U32_FILTERBANK_ACCUMULATE__ASM_AARCH32_NEON_X2, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 10; rows++) { - FilterbankAccumulateMicrokernelTester() - .rows(rows) - .Test(xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_neon_x2); - } - } -#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - - -#if 
XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(U32_FILTERBANK_ACCUMULATE__NEON_X1, rows_eq_1) { - TEST_REQUIRES_ARM_NEON; - FilterbankAccumulateMicrokernelTester() - .rows(1) - .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x1); - } - - TEST(U32_FILTERBANK_ACCUMULATE__NEON_X1, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 10; rows++) { - FilterbankAccumulateMicrokernelTester() - .rows(rows) - .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x1); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(U32_FILTERBANK_ACCUMULATE__NEON_X2, rows_eq_1) { - TEST_REQUIRES_ARM_NEON; - FilterbankAccumulateMicrokernelTester() - .rows(1) - .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x2); - } - - TEST(U32_FILTERBANK_ACCUMULATE__NEON_X2, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 10; rows++) { - FilterbankAccumulateMicrokernelTester() - .rows(rows) - .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x2); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -TEST(U32_FILTERBANK_ACCUMULATE__SCALAR_X1, rows_eq_1) { - FilterbankAccumulateMicrokernelTester() - .rows(1) - .Test(xnn_u32_filterbank_accumulate_ukernel__scalar_x1); -} - -TEST(U32_FILTERBANK_ACCUMULATE__SCALAR_X1, rows_gt_1) { - for (size_t rows = 2; rows < 10; rows++) { - FilterbankAccumulateMicrokernelTester() - .rows(rows) - .Test(xnn_u32_filterbank_accumulate_ukernel__scalar_x1); - } -} \ No newline at end of file diff --git a/test/u32-filterbank-accumulate.yaml b/test/u32-filterbank-accumulate.yaml deleted file mode 100644 index 6d861b574ff..00000000000 --- a/test/u32-filterbank-accumulate.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- - -# AArch32 assembly -- name: xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_arm_x1 -- name: xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_neon_x1 -- name: xnn_u32_filterbank_accumulate_ukernel__asm_aarch32_neon_x2 - -# ARM NEON -- name: xnn_u32_filterbank_accumulate_ukernel__neon_x1 -- name: xnn_u32_filterbank_accumulate_ukernel__neon_x2 - -# Scalar -- name: xnn_u32_filterbank_accumulate_ukernel__scalar_x1 diff --git a/test/u32-filterbank-subtract.cc b/test/u32-filterbank-subtract.cc deleted file mode 100644 index 20ff79254b5..00000000000 --- a/test/u32-filterbank-subtract.cc +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Specification: test/u32-filterbank-subtract.yaml -// Generator: tools/generate-filterbank-subtract-test.py - - -#include -#include "xnnpack/common.h" -#include "xnnpack/filterbank.h" -#include "xnnpack/isa-checks.h" -#include "filterbank-subtract-microkernel-tester.h" - - -TEST(U32_FILTERBANK_SUBTRACT__SCALAR_X2, batch_eq_2) { - FilterbankSubtractMicrokernelTester() - .batch(2) - .Test(xnn_u32_filterbank_subtract_ukernel__scalar_x2); -} - -TEST(U32_FILTERBANK_SUBTRACT__SCALAR_X2, batch_div_2) { - for (size_t batch = 4; batch < 20; batch += 2) { - FilterbankSubtractMicrokernelTester() - .batch(batch) - .Test(xnn_u32_filterbank_subtract_ukernel__scalar_x2); - } -} - -TEST(U32_FILTERBANK_SUBTRACT__SCALAR_X2, batch_lt_2) { - for (size_t batch = 2; batch < 2; batch += 2) { - FilterbankSubtractMicrokernelTester() - .batch(batch) - .Test(xnn_u32_filterbank_subtract_ukernel__scalar_x2); - } -} - -TEST(U32_FILTERBANK_SUBTRACT__SCALAR_X2, batch_gt_2) { - for (size_t batch = 4; batch < 4; batch += 2) { - FilterbankSubtractMicrokernelTester() - .batch(batch) - .Test(xnn_u32_filterbank_subtract_ukernel__scalar_x2); - } -} - 
-TEST(U32_FILTERBANK_SUBTRACT__SCALAR_X2, inplace) { - for (size_t batch = 4; batch < 4; batch += 2) { - FilterbankSubtractMicrokernelTester() - .batch(batch) - .inplace(true) - .Test(xnn_u32_filterbank_subtract_ukernel__scalar_x2); - } -} \ No newline at end of file diff --git a/test/u32-filterbank-subtract.yaml b/test/u32-filterbank-subtract.yaml deleted file mode 100644 index 53e0ac77f5f..00000000000 --- a/test/u32-filterbank-subtract.yaml +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - - -# Scalar -- name: xnn_u32_filterbank_subtract_ukernel__scalar_x2 diff --git a/test/u32-vlog.cc b/test/u32-vlog.cc deleted file mode 100644 index 0665506e5e9..00000000000 --- a/test/u32-vlog.cc +++ /dev/null @@ -1,231 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: test/u32-vlog.yaml -// Generator: tools/generate-vlog-test.py - - -#include -#include "xnnpack/common.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/vlog.h" -#include "vlog-microkernel-tester.h" - - -TEST(U32_VLOG__SCALAR_X1, DISABLED_batch_eq_1) { - VLogMicrokernelTester() - .batch(1) - .Test(xnn_u32_vlog_ukernel__scalar_x1); -} - -TEST(U32_VLOG__SCALAR_X1, DISABLED_batch_gt_1) { - for (size_t batch = 2; batch < 10; batch++) { - VLogMicrokernelTester() - .batch(batch) - .Test(xnn_u32_vlog_ukernel__scalar_x1); - } -} - -TEST(U32_VLOG__SCALAR_X1, DISABLED_input_lshift) { - for (uint32_t input_lshift = 0; input_lshift < 32; input_lshift++) { - VLogMicrokernelTester() - .batch(1) - .input_lshift(input_lshift) - .Test(xnn_u32_vlog_ukernel__scalar_x1); - } -} - -TEST(U32_VLOG__SCALAR_X1, DISABLED_output_scale) { - for (uint32_t output_scale = 0; output_scale < 65536; output_scale += 3) { - VLogMicrokernelTester() - .batch(1) - .output_scale(output_scale) - .Test(xnn_u32_vlog_ukernel__scalar_x1); - } -} - -TEST(U32_VLOG__SCALAR_X1, DISABLED_inplace) { - for (size_t batch = 2; batch < 10; batch++) { - VLogMicrokernelTester() - .batch(batch) - .inplace(true) - .Test(xnn_u32_vlog_ukernel__scalar_x1); - } -} - - -TEST(U32_VLOG__SCALAR_X2, DISABLED_batch_eq_2) { - VLogMicrokernelTester() - .batch(2) - .Test(xnn_u32_vlog_ukernel__scalar_x2); -} - -TEST(U32_VLOG__SCALAR_X2, DISABLED_batch_div_2) { - for (size_t batch = 4; batch < 20; batch += 2) { - VLogMicrokernelTester() - .batch(batch) - .Test(xnn_u32_vlog_ukernel__scalar_x2); - } -} - -TEST(U32_VLOG__SCALAR_X2, DISABLED_batch_lt_2) { - for (size_t batch = 1; batch < 2; batch++) { - VLogMicrokernelTester() - .batch(batch) - .Test(xnn_u32_vlog_ukernel__scalar_x2); - } -} - -TEST(U32_VLOG__SCALAR_X2, DISABLED_batch_gt_2) { - for (size_t batch = 3; batch < 4; batch++) { - VLogMicrokernelTester() - .batch(batch) - .Test(xnn_u32_vlog_ukernel__scalar_x2); - } -} - -TEST(U32_VLOG__SCALAR_X2, 
DISABLED_input_lshift) { - for (uint32_t input_lshift = 0; input_lshift < 32; input_lshift++) { - VLogMicrokernelTester() - .batch(2) - .input_lshift(input_lshift) - .Test(xnn_u32_vlog_ukernel__scalar_x2); - } -} - -TEST(U32_VLOG__SCALAR_X2, DISABLED_output_scale) { - for (uint32_t output_scale = 0; output_scale < 65536; output_scale += 5) { - VLogMicrokernelTester() - .batch(2) - .output_scale(output_scale) - .Test(xnn_u32_vlog_ukernel__scalar_x2); - } -} - -TEST(U32_VLOG__SCALAR_X2, DISABLED_inplace) { - for (size_t batch = 3; batch < 4; batch++) { - VLogMicrokernelTester() - .batch(batch) - .inplace(true) - .Test(xnn_u32_vlog_ukernel__scalar_x2); - } -} - - -TEST(U32_VLOG__SCALAR_X3, DISABLED_batch_eq_3) { - VLogMicrokernelTester() - .batch(3) - .Test(xnn_u32_vlog_ukernel__scalar_x3); -} - -TEST(U32_VLOG__SCALAR_X3, DISABLED_batch_div_3) { - for (size_t batch = 6; batch < 30; batch += 3) { - VLogMicrokernelTester() - .batch(batch) - .Test(xnn_u32_vlog_ukernel__scalar_x3); - } -} - -TEST(U32_VLOG__SCALAR_X3, DISABLED_batch_lt_3) { - for (size_t batch = 1; batch < 3; batch++) { - VLogMicrokernelTester() - .batch(batch) - .Test(xnn_u32_vlog_ukernel__scalar_x3); - } -} - -TEST(U32_VLOG__SCALAR_X3, DISABLED_batch_gt_3) { - for (size_t batch = 4; batch < 6; batch++) { - VLogMicrokernelTester() - .batch(batch) - .Test(xnn_u32_vlog_ukernel__scalar_x3); - } -} - -TEST(U32_VLOG__SCALAR_X3, DISABLED_input_lshift) { - for (uint32_t input_lshift = 0; input_lshift < 32; input_lshift++) { - VLogMicrokernelTester() - .batch(3) - .input_lshift(input_lshift) - .Test(xnn_u32_vlog_ukernel__scalar_x3); - } -} - -TEST(U32_VLOG__SCALAR_X3, DISABLED_output_scale) { - for (uint32_t output_scale = 0; output_scale < 65536; output_scale += 5) { - VLogMicrokernelTester() - .batch(3) - .output_scale(output_scale) - .Test(xnn_u32_vlog_ukernel__scalar_x3); - } -} - -TEST(U32_VLOG__SCALAR_X3, DISABLED_inplace) { - for (size_t batch = 4; batch < 6; batch++) { - VLogMicrokernelTester() - 
.batch(batch) - .inplace(true) - .Test(xnn_u32_vlog_ukernel__scalar_x3); - } -} - - -TEST(U32_VLOG__SCALAR_X4, DISABLED_batch_eq_4) { - VLogMicrokernelTester() - .batch(4) - .Test(xnn_u32_vlog_ukernel__scalar_x4); -} - -TEST(U32_VLOG__SCALAR_X4, DISABLED_batch_div_4) { - for (size_t batch = 8; batch < 40; batch += 4) { - VLogMicrokernelTester() - .batch(batch) - .Test(xnn_u32_vlog_ukernel__scalar_x4); - } -} - -TEST(U32_VLOG__SCALAR_X4, DISABLED_batch_lt_4) { - for (size_t batch = 1; batch < 4; batch++) { - VLogMicrokernelTester() - .batch(batch) - .Test(xnn_u32_vlog_ukernel__scalar_x4); - } -} - -TEST(U32_VLOG__SCALAR_X4, DISABLED_batch_gt_4) { - for (size_t batch = 5; batch < 8; batch++) { - VLogMicrokernelTester() - .batch(batch) - .Test(xnn_u32_vlog_ukernel__scalar_x4); - } -} - -TEST(U32_VLOG__SCALAR_X4, DISABLED_input_lshift) { - for (uint32_t input_lshift = 0; input_lshift < 32; input_lshift++) { - VLogMicrokernelTester() - .batch(4) - .input_lshift(input_lshift) - .Test(xnn_u32_vlog_ukernel__scalar_x4); - } -} - -TEST(U32_VLOG__SCALAR_X4, DISABLED_output_scale) { - for (uint32_t output_scale = 0; output_scale < 65536; output_scale += 7) { - VLogMicrokernelTester() - .batch(4) - .output_scale(output_scale) - .Test(xnn_u32_vlog_ukernel__scalar_x4); - } -} - -TEST(U32_VLOG__SCALAR_X4, DISABLED_inplace) { - for (size_t batch = 5; batch < 8; batch++) { - VLogMicrokernelTester() - .batch(batch) - .inplace(true) - .Test(xnn_u32_vlog_ukernel__scalar_x4); - } -} diff --git a/test/u32-vlog.yaml b/test/u32-vlog.yaml deleted file mode 100644 index 12222c99298..00000000000 --- a/test/u32-vlog.yaml +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- - -# Scalar -- name: xnn_u32_vlog_ukernel__scalar_x1 -- name: xnn_u32_vlog_ukernel__scalar_x2 -- name: xnn_u32_vlog_ukernel__scalar_x3 -- name: xnn_u32_vlog_ukernel__scalar_x4 diff --git a/test/u64-u32-vsqrtshift.cc b/test/u64-u32-vsqrtshift.cc deleted file mode 100644 index 1ffcc1e8be0..00000000000 --- a/test/u64-u32-vsqrtshift.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Microkernel: u64-u32-vsqrtshift -// Generator: tools/generate-vunary-test.py - - -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/microparams.h" -#include "xnnpack/vunary.h" -#include "next_prime.h" -#include "vunary-microkernel-tester.h" - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\ - \ -XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ - \ -TEST(ukernel, shift) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_end = batch_tile * batch_scale; \ - const size_t batch_step = std::max(1, batch_tile - 1); \ - for (uint32_t shift = 0; shift < 32; shift++) { \ - for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { \ - VUnaryMicrokernelTester() \ - .batch_size(batch_size) \ - .shift(shift) \ - .Test(ukernel, init_params); \ - } \ - } \ -} -#include 
"src/u64-u32-vsqrtshift/u64-u32-vsqrtshift.h" -#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/vlshift-microkernel-tester.h b/test/vlshift-microkernel-tester.h deleted file mode 100644 index 25e136d3940..00000000000 --- a/test/vlshift-microkernel-tester.h +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/microfnptr.h" -#include "replicable_random_device.h" - -class VLShiftMicrokernelTester { - public: - VLShiftMicrokernelTester& batch(size_t batch) { - assert(batch != 0); - this->batch_ = batch; - return *this; - } - - size_t batch() const { - return this->batch_; - } - - VLShiftMicrokernelTester& shift(uint32_t shift) { - assert(shift < 32); - this->shift_ = shift; - return *this; - } - - uint32_t shift() const { - return this->shift_; - } - - VLShiftMicrokernelTester& inplace(bool inplace) { - this->inplace_ = inplace; - return *this; - } - - bool inplace() const { - return this->inplace_; - } - - VLShiftMicrokernelTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { - return this->iterations_; - } - - void Test(xnn_i16_vlshift_ukernel_fn vlshift) const { - xnnpack::ReplicableRandomDevice rng; - auto u16rng = std::bind(std::uniform_int_distribution(), std::ref(rng)); - - std::vector input(batch() + XNN_EXTRA_BYTES / sizeof(uint16_t)); - std::vector output(batch() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0)); - std::vector output_ref(batch()); - const uint16_t* input_data = inplace() ? 
output.data() : input.data(); - - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), std::ref(u16rng)); - std::fill(output.begin(), output.end(), UINT16_C(0xDEAD)); - - // Compute reference results. - for (size_t n = 0; n < batch(); n++) { - uint16_t value = input_data[n]; - value <<= shift(); - output_ref[n] = value; - } - - // Call optimized micro-kernel. - vlshift(batch(), input_data, output.data(), shift()); - - // Verify results. - for (size_t n = 0; n < batch(); n++) { - EXPECT_EQ(output[n], output_ref[n]) - << ", shift " << shift() - << ", batch " << n << " / " << batch(); - } - } - } - - private: - size_t batch_{1}; - uint32_t shift_{12}; - bool inplace_{false}; - size_t iterations_{15}; -}; diff --git a/test/vsquareabs-microkernel-tester.h b/test/vsquareabs-microkernel-tester.h deleted file mode 100644 index bd03ea02da2..00000000000 --- a/test/vsquareabs-microkernel-tester.h +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/microfnptr.h" -#include "replicable_random_device.h" - -class VSquareAbsMicrokernelTester { - public: - VSquareAbsMicrokernelTester& batch(size_t batch) { - assert(batch != 0); - this->batch_ = batch; - return *this; - } - - size_t batch() const { - return this->batch_; - } - - VSquareAbsMicrokernelTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { - return this->iterations_; - } - - void Test(xnn_cs16_vsquareabs_ukernel_fn vsquareabs) const { - xnnpack::ReplicableRandomDevice rng; - auto i16rng = std::bind(std::uniform_int_distribution(), std::ref(rng)); - - std::vector input(batch() * 2 + XNN_EXTRA_BYTES / sizeof(int16_t)); - std::vector output(batch()); - std::vector output_ref(batch()); - - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), std::ref(i16rng)); - std::fill(output.begin(), output.end(), UINT32_C(0xDEADBEEF)); - - // Compute reference results. - for (size_t n = 0; n < batch(); n++) { - const int32_t r = static_cast(input[n * 2]); - const int32_t i = static_cast(input[n * 2 + 1]); - output_ref[n] = static_cast(r * r + i * i); - } - - // Call optimized micro-kernel. - vsquareabs(batch() * sizeof(int16_t) * 2, input.data(), output.data()); - - // Verify results. 
- for (size_t n = 0; n < batch(); n++) { - EXPECT_EQ(output[n], output_ref[n]) - << ", batch " << n << " / " << batch(); - } - } - } - - private: - size_t batch_{1}; - size_t iterations_{15}; -}; diff --git a/test/window-microkernel-tester.h b/test/window-microkernel-tester.h deleted file mode 100644 index c38b1e7fbe0..00000000000 --- a/test/window-microkernel-tester.h +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/microfnptr.h" -#include "replicable_random_device.h" - -class WindowMicrokernelTester { - public: - WindowMicrokernelTester& rows(size_t rows) { - assert(rows != 0); - this->rows_ = rows; - return *this; - } - - size_t rows() const { - return this->rows_; - } - - WindowMicrokernelTester& channels(size_t channels) { - assert(channels != 0); - this->channels_ = channels; - return *this; - } - - size_t channels() const { - return this->channels_; - } - - WindowMicrokernelTester& shift(uint32_t shift) { - assert(shift < 32); - this->shift_ = shift; - return *this; - } - - uint32_t shift() const { - return this->shift_; - } - - WindowMicrokernelTester& inplace(bool inplace) { - this->inplace_ = inplace; - return *this; - } - - bool inplace() const { - return this->inplace_; - } - - WindowMicrokernelTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { - return this->iterations_; - } - - void Test(xnn_s16_window_ukernel_fn window) const { - xnnpack::ReplicableRandomDevice rng; - auto i16rng = std::bind(std::uniform_int_distribution(), std::ref(rng)); - - std::vector input(channels() * rows() + XNN_EXTRA_BYTES / sizeof(int16_t)); - 
std::vector> weights(channels() + XNN_EXTRA_BYTES / sizeof(int16_t)); - std::vector output(channels() * rows() + (inplace() ? XNN_EXTRA_BYTES / sizeof(int16_t) : 0)); - std::vector output_ref(channels() * rows()); - const int16_t* x_data = inplace() ? output.data() : input.data(); - - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), std::ref(i16rng)); - std::generate(weights.begin(), weights.end(), std::ref(i16rng)); - std::fill(output.begin(), output.end(), INT16_C(0xDEAD)); - - // Compute reference results. - for (size_t m = 0; m < rows(); m++) { - for (size_t n = 0; n < channels(); n++) { - const int16_t x_value = x_data[m * channels() + n]; - int32_t value = (int32_t(x_value) * int32_t(weights[n])) >> shift(); - value = std::min(value, std::numeric_limits::max()); - value = std::max(value, std::numeric_limits::min()); - output_ref[m * channels() + n] = static_cast(value); - } - } - - // Call optimized micro-kernel. - window(rows(), channels() * sizeof(int16_t), x_data, weights.data(), output.data(), shift()); - - // Verify results. - for (size_t i = 0; i < rows(); i++) { - for (size_t c = 0; c < channels(); c++) { - EXPECT_EQ(output[i * channels() + c], output_ref[i * channels() + c]) - << "at row " << i << " / " << rows() - << ", shift " << shift() - << ", channel " << c << " / " << channels(); - } - } - } - } - - private: - size_t rows_{1}; - size_t channels_{1}; - uint32_t shift_{12}; - bool inplace_{false}; - size_t iterations_{15}; -}; diff --git a/tools/generate-bfly4-test.py b/tools/generate-bfly4-test.py deleted file mode 100755 index e5b245d6b1f..00000000000 --- a/tools/generate-bfly4-test.py +++ /dev/null @@ -1,188 +0,0 @@ -#!/usr/bin/env python -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import codecs -import math -import os -import re -import sys -import yaml - -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from primes import next_prime -import xngen -import xnncommon - - -parser = argparse.ArgumentParser(description='BFly4 microkernel test generator') -parser.add_argument("-s", "--spec", metavar="FILE", required=True, - help="Specification (YAML) file") -parser.add_argument("-o", "--output", metavar="FILE", required=True, - help='Output (C++ source) file') -parser.set_defaults(defines=list()) - - -def split_ukernel_name(name): - match = re.fullmatch(r"xnn_cs16_bfly4(_samples(\d+))?_ukernel__(.+)(_x(\d+))?", name) - assert match is not None, name - if match.group(2): - samples = int(match.group(2)) - else: - samples = 0 - if match.group(5): - samples_tile = int(match.group(5)) - else: - samples_tile = 1 - - arch, isa, assembly = xnncommon.parse_target_name(target_name=match.group(3)) - return samples, samples_tile, arch, isa, assembly - - -BFLY4_TEST_TEMPLATE = """\ - -$if SAMPLES == 1: - TEST(${TEST_NAME}, samples_eq_1) { - $if ISA_CHECK: - ${ISA_CHECK}; - BFly4MicrokernelTester() - .batch(1) - .samples(1) - .stride(64) - .Test(${", ".join(TEST_ARGS)}); - } - - TEST(${TEST_NAME}, batch_eq_4) { - $if ISA_CHECK: - ${ISA_CHECK}; - BFly4MicrokernelTester() - .batch(4) - .samples(1) - .stride(64) - .Test(${", ".join(TEST_ARGS)}); - } - - TEST(${TEST_NAME}, batch_gt_1) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = 2; batch <= 16; batch++) { - BFly4MicrokernelTester() - .batch(batch) - .samples(1) - .stride(64) - .Test(${", ".join(TEST_ARGS)}); - } - } - -$if SAMPLES == 0 or SAMPLES == 4: - TEST(${TEST_NAME}, samples_eq_4) { - $if ISA_CHECK: - ${ISA_CHECK}; - BFly4MicrokernelTester() - .samples(4) - .stride(16) - .Test(${", ".join(TEST_ARGS)}); - } - - TEST(${TEST_NAME}, samples_eq_4_batch_gt_1) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = 2; batch <= 4; batch++) { - BFly4MicrokernelTester() - 
.batch(batch) - .samples(4) - .stride(16) - .Test(${", ".join(TEST_ARGS)}); - } - } - -$if SAMPLES == 0: - TEST(${TEST_NAME}, samples_eq_16) { - $if ISA_CHECK: - ${ISA_CHECK}; - BFly4MicrokernelTester() - .samples(16) - .stride(4) - .Test(${", ".join(TEST_ARGS)}); - } - - TEST(${TEST_NAME}, samples_eq_64) { - $if ISA_CHECK: - ${ISA_CHECK}; - BFly4MicrokernelTester() - .samples(64) - .stride(1) - .Test(${", ".join(TEST_ARGS)}); - } - -""" - - -def generate_test_cases(ukernel, samples, samples_tile, isa): - """Generates all tests cases for a BFly4 micro-kernel. - - Args: - ukernel: C name of the micro-kernel function. - samples: fixed number of samples for specialized samples1 microkernel. - samples_tile: Number of samples processed per one iteration of the inner - loop of the micro-kernel. - isa: instruction set required to run the micro-kernel. Generated unit test - will skip execution if the host processor doesn't support this ISA. - - Returns: - Code for the test case. - """ - _, test_name = ukernel.split("_", 1) - _, datatype, ukernel_type, _ = ukernel.split("_", 3) - return xngen.preprocess(BFLY4_TEST_TEMPLATE, { - "TEST_NAME": test_name.upper().replace("UKERNEL_", ""), - "TEST_ARGS": [ukernel], - "DATATYPE": datatype, - "SAMPLES": samples, - "SAMPLE_TILE": samples_tile, - "ISA_CHECK": xnncommon.generate_isa_check_macro(isa), - "next_prime": next_prime, - }) - - -def main(args): - options = parser.parse_args(args) - - with codecs.open(options.spec, "r", encoding="utf-8") as spec_file: - spec_yaml = yaml.safe_load(spec_file) - if not isinstance(spec_yaml, list): - raise ValueError("expected a list of micro-kernels in the spec") - - tests = """\ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: {specification} -// Generator: {generator} - - -#include -#include "xnnpack/common.h" -#include "xnnpack/fft.h" -#include "xnnpack/isa-checks.h" -#include "bfly4-microkernel-tester.h" -""".format(specification=options.spec, generator=sys.argv[0]) - - for ukernel_spec in spec_yaml: - name = ukernel_spec["name"] - samples, samples_tile, arch, isa, assembly = split_ukernel_name(name) - - test_case = generate_test_cases(name, samples, samples_tile, isa) - tests += "\n\n" + xnncommon.postprocess_test_case(test_case, arch, isa, assembly) - - xnncommon.overwrite_if_changed(options.output, tests) - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/tools/generate-fftr-test.py b/tools/generate-fftr-test.py deleted file mode 100755 index 8d9a7626132..00000000000 --- a/tools/generate-fftr-test.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import codecs -import math -import os -import re -import sys -import yaml - -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from primes import next_prime -import xngen -import xnncommon - - -parser = argparse.ArgumentParser(description='Fftr microkernel test generator') -parser.add_argument("-s", "--spec", metavar="FILE", required=True, - help="Specification (YAML) file") -parser.add_argument("-o", "--output", metavar="FILE", required=True, - help='Output (C++ source) file') -parser.set_defaults(defines=list()) - - -def split_ukernel_name(name): - match = re.fullmatch(r"xnn_cs16_fftr_ukernel__(.+)_x(\d+)", name) - assert match is not None - sample_tile = int(match.group(2)) - - arch, isa, assembly = xnncommon.parse_target_name(target_name=match.group(1)) - return sample_tile, arch, isa, assembly - - -FFTR_TEST_TEMPLATE = """\ -TEST(${TEST_NAME}, samples_eq_256) { - $if ISA_CHECK: - ${ISA_CHECK}; - FftrMicrokernelTester() - .samples(256) - .Test(${", ".join(TEST_ARGS)}); -} - -""" - - -def generate_test_cases(ukernel, sample_tile, isa): - """Generates all tests cases for a Fftr micro-kernel. - - Args: - ukernel: C name of the micro-kernel function. - sample_tile: Number of samples processed per one iteration of the inner - loop of the micro-kernel. - isa: instruction set required to run the micro-kernel. Generated unit test - will skip execution if the host processor doesn't support this ISA. - - Returns: - Code for the test case. 
- """ - _, test_name = ukernel.split("_", 1) - _, datatype, ukernel_type, _ = ukernel.split("_", 3) - return xngen.preprocess(FFTR_TEST_TEMPLATE, { - "TEST_NAME": test_name.upper().replace("UKERNEL_", ""), - "TEST_ARGS": [ukernel], - "DATATYPE": datatype, - "SAMPLE_TILE": sample_tile, - "ISA_CHECK": xnncommon.generate_isa_check_macro(isa), - "next_prime": next_prime, - }) - - -def main(args): - options = parser.parse_args(args) - - with codecs.open(options.spec, "r", encoding="utf-8") as spec_file: - spec_yaml = yaml.safe_load(spec_file) - if not isinstance(spec_yaml, list): - raise ValueError("expected a list of micro-kernels in the spec") - - tests = """\ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Specification: {specification} -// Generator: {generator} - - -#include -#include "xnnpack/common.h" -#include "xnnpack/fft.h" -#include "xnnpack/isa-checks.h" -#include "fftr-microkernel-tester.h" -""".format(specification=options.spec, generator=sys.argv[0]) - - for ukernel_spec in spec_yaml: - name = ukernel_spec["name"] - sample_tile, arch, isa, assembly = split_ukernel_name(name) - - test_case = generate_test_cases(name, sample_tile, isa) - tests += "\n\n" + xnncommon.postprocess_test_case(test_case, arch, isa, assembly) - - xnncommon.overwrite_if_changed(options.output, tests) - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/tools/generate-filterbank-accumulate-test.py b/tools/generate-filterbank-accumulate-test.py deleted file mode 100755 index 95519f98ff1..00000000000 --- a/tools/generate-filterbank-accumulate-test.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env python -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import codecs -import math -import os -import re -import sys -import yaml - -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from primes import next_prime -import xngen -import xnncommon - - -parser = argparse.ArgumentParser(description='Filterbank Accumulate microkernel test generator') -parser.add_argument("-s", "--spec", metavar="FILE", required=True, - help="Specification (YAML) file") -parser.add_argument("-o", "--output", metavar="FILE", required=True, - help='Output (C++ source) file') -parser.set_defaults(defines=list()) - - -def split_ukernel_name(name): - match = re.fullmatch(r"xnn_u32_filterbank_accumulate_ukernel__(.+)(_x(\d+))?", name) - assert match is not None - row_tile = 1 - batch_tile = 1 - if match.group(3): - batch_tile = int(match.group(3)) - - arch, isa, assembly = xnncommon.parse_target_name(target_name=match.group(1)) - return row_tile, batch_tile, arch, isa, assembly - - -FILTERBANK_ACCUMULATE_TEST_TEMPLATE = """\ -TEST(${TEST_NAME}, rows_eq_1) { - $if ISA_CHECK: - ${ISA_CHECK}; - FilterbankAccumulateMicrokernelTester() - .rows(1) - .Test(${", ".join(TEST_ARGS)}); -} - -TEST(${TEST_NAME}, rows_gt_1) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t rows = 2; rows < 10; rows++) { - FilterbankAccumulateMicrokernelTester() - .rows(rows) - .Test(${", ".join(TEST_ARGS)}); - } -} -""" - - -def generate_test_cases(ukernel, row_tile, batch_tile, isa): - """Generates all tests cases for a Filterbank Accumulate micro-kernel. - - Args: - ukernel: C name of the micro-kernel function. - row_tile: Number of rows (pixels) processed per one iteration of the outer - loop of the micro-kernel. - batch_tile: Number of batch processed per one iteration of the inner - loop of the micro-kernel. - isa: instruction set required to run the micro-kernel. Generated unit test - will skip execution if the host processor doesn't support this ISA. - - Returns: - Code for the test case. 
- """ - _, test_name = ukernel.split("_", 1) - _, datatype, ukernel_type, _ = ukernel.split("_", 3) - return xngen.preprocess(FILTERBANK_ACCUMULATE_TEST_TEMPLATE, { - "TEST_NAME": test_name.upper().replace("UKERNEL_", ""), - "TEST_ARGS": [ukernel], - "DATATYPE": datatype, - "ROW_TILE": row_tile, - "BATCH_TILE": batch_tile, - "ISA_CHECK": xnncommon.generate_isa_check_macro(isa), - "next_prime": next_prime, - }) - - -def main(args): - options = parser.parse_args(args) - - with codecs.open(options.spec, "r", encoding="utf-8") as spec_file: - spec_yaml = yaml.safe_load(spec_file) - if not isinstance(spec_yaml, list): - raise ValueError("expected a list of micro-kernels in the spec") - - tests = """\ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Specification: {specification} -// Generator: {generator} - - -#include -#include "xnnpack/common.h" -#include "xnnpack/filterbank.h" -#include "xnnpack/isa-checks.h" -#include "filterbank-accumulate-microkernel-tester.h" -""".format(specification=options.spec, generator=sys.argv[0]) - - for ukernel_spec in spec_yaml: - name = ukernel_spec["name"] - row_tile, batch_tile, arch, isa, assembly = split_ukernel_name(name) - - test_case = generate_test_cases(name, row_tile, batch_tile, isa) - tests += "\n\n" + xnncommon.postprocess_test_case(test_case, arch, isa, assembly) - - xnncommon.overwrite_if_changed(options.output, tests) - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/tools/generate-filterbank-subtract-test.py b/tools/generate-filterbank-subtract-test.py deleted file mode 100755 index 41015ab5366..00000000000 --- a/tools/generate-filterbank-subtract-test.py +++ /dev/null @@ -1,153 +0,0 @@ -#!/usr/bin/env python -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root 
directory of this source tree. - -import argparse -import codecs -import math -import os -import re -import sys -import yaml - -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from primes import next_prime -import xngen -import xnncommon - - -parser = argparse.ArgumentParser(description='Filterbank Subtract microkernel test generator') -parser.add_argument("-s", "--spec", metavar="FILE", required=True, - help="Specification (YAML) file") -parser.add_argument("-o", "--output", metavar="FILE", required=True, - help='Output (C++ source) file') -parser.set_defaults(defines=list()) - - -def split_ukernel_name(name): - match = re.fullmatch(r"xnn_u32_filterbank_subtract_ukernel__(.+)_x(\d+)", name) - assert match is not None - batch_tile = int(match.group(2)) - - arch, isa, assembly = xnncommon.parse_target_name(target_name=match.group(1)) - return batch_tile, arch, isa - - -FILTERBANK_SUBTRACT_TEST_TEMPLATE = """\ -TEST(${TEST_NAME}, batch_eq_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - FilterbankSubtractMicrokernelTester() - .batch(${BATCH_TILE}) - .Test(${", ".join(TEST_ARGS)}); -} - -$if BATCH_TILE > 1: - TEST(${TEST_NAME}, batch_div_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = ${BATCH_TILE*2}; batch < ${BATCH_TILE*10}; batch += ${BATCH_TILE}) { - FilterbankSubtractMicrokernelTester() - .batch(batch) - .Test(${", ".join(TEST_ARGS)}); - } - } - - TEST(${TEST_NAME}, batch_lt_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = 2; batch < ${BATCH_TILE}; batch += 2) { - FilterbankSubtractMicrokernelTester() - .batch(batch) - .Test(${", ".join(TEST_ARGS)}); - } - } - -TEST(${TEST_NAME}, batch_gt_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = ${BATCH_TILE+2}; batch < ${10 if BATCH_TILE == 1 else BATCH_TILE*2}; batch += 2) { - FilterbankSubtractMicrokernelTester() - .batch(batch) - .Test(${", ".join(TEST_ARGS)}); - } -} - -TEST(${TEST_NAME}, inplace) { - $if ISA_CHECK: - 
${ISA_CHECK}; - for (size_t batch = ${BATCH_TILE+2}; batch < ${10 if BATCH_TILE == 1 else BATCH_TILE*2}; batch += 2) { - FilterbankSubtractMicrokernelTester() - .batch(batch) - .inplace(true) - .Test(${", ".join(TEST_ARGS)}); - } -} -""" - - -def generate_test_cases(ukernel, batch_tile, isa): - """Generates all tests cases for a Filterbank Subtract micro-kernel. - - Args: - ukernel: C name of the micro-kernel function. - batch_tile: Number of batch processed per one iteration of the inner - loop of the micro-kernel. - isa: instruction set required to run the micro-kernel. Generated unit test - will skip execution if the host processor doesn't support this ISA. - - Returns: - Code for the test case. - """ - _, test_name = ukernel.split("_", 1) - _, datatype, ukernel_type, _ = ukernel.split("_", 3) - return xngen.preprocess(FILTERBANK_SUBTRACT_TEST_TEMPLATE, { - "TEST_NAME": test_name.upper().replace("UKERNEL_", ""), - "TEST_ARGS": [ukernel], - "DATATYPE": datatype, - "BATCH_TILE": batch_tile, - "ISA_CHECK": xnncommon.generate_isa_check_macro(isa), - "next_prime": next_prime, - }) - - -def main(args): - options = parser.parse_args(args) - - with codecs.open(options.spec, "r", encoding="utf-8") as spec_file: - spec_yaml = yaml.safe_load(spec_file) - if not isinstance(spec_yaml, list): - raise ValueError("expected a list of micro-kernels in the spec") - - tests = """\ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: {specification} -// Generator: {generator} - - -#include -#include "xnnpack/common.h" -#include "xnnpack/filterbank.h" -#include "xnnpack/isa-checks.h" -#include "filterbank-subtract-microkernel-tester.h" -""".format(specification=options.spec, generator=sys.argv[0]) - - for ukernel_spec in spec_yaml: - name = ukernel_spec["name"] - batch_tile, arch, isa = split_ukernel_name(name) - - test_case = generate_test_cases(name, batch_tile, isa) - tests += "\n\n" + xnncommon.postprocess_test_case(test_case, arch, isa) - - xnncommon.overwrite_if_changed(options.output, tests) - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/tools/generate-rmaxabs-test.py b/tools/generate-rmaxabs-test.py deleted file mode 100755 index f28eae0d683..00000000000 --- a/tools/generate-rmaxabs-test.py +++ /dev/null @@ -1,143 +0,0 @@ -#!/usr/bin/env python -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import codecs -import math -import os -import re -import sys -import yaml - -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from primes import next_prime -import xngen -import xnncommon - - -parser = argparse.ArgumentParser(description='RMaxAbs microkernel test generator') -parser.add_argument("-s", "--spec", metavar="FILE", required=True, - help="Specification (YAML) file") -parser.add_argument("-o", "--output", metavar="FILE", required=True, - help='Output (C++ source) file') -parser.set_defaults(defines=list()) - - -def split_ukernel_name(name): - match = re.fullmatch(r"xnn_s16_rmaxabs_ukernel__(.+)_x(\d+)", name) - assert match is not None - batch_tile = int(match.group(2)) - - arch, isa, assembly = xnncommon.parse_target_name(target_name=match.group(1)) - return batch_tile, arch, isa - - -RMAXABS_TEST_TEMPLATE = """\ -TEST(${TEST_NAME}, batch_eq_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - RMaxAbsMicrokernelTester() - .batch(${BATCH_TILE}) - .Test(${", ".join(TEST_ARGS)}); -} - -$if BATCH_TILE > 1: - TEST(${TEST_NAME}, batch_div_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = ${BATCH_TILE*2}; batch < ${BATCH_TILE*10}; batch += ${BATCH_TILE}) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(${", ".join(TEST_ARGS)}); - } - } - - TEST(${TEST_NAME}, batch_lt_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = 1; batch < ${BATCH_TILE}; batch++) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(${", ".join(TEST_ARGS)}); - } - } - -TEST(${TEST_NAME}, batch_gt_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = ${BATCH_TILE+1}; batch < ${10 if BATCH_TILE == 1 else BATCH_TILE*2}; batch++) { - RMaxAbsMicrokernelTester() - .batch(batch) - .Test(${", ".join(TEST_ARGS)}); - } -} - -""" - - -def generate_test_cases(ukernel, batch_tile, isa): - """Generates all tests cases for a RMaxAbs micro-kernel. - - Args: - ukernel: C name of the micro-kernel function. 
- batch_tile: Number of batch processed per one iteration of the inner - loop of the micro-kernel. - isa: instruction set required to run the micro-kernel. Generated unit test - will skip execution if the host processor doesn't support this ISA. - - Returns: - Code for the test case. - """ - _, test_name = ukernel.split("_", 1) - _, datatype, ukernel_type, _ = ukernel.split("_", 3) - return xngen.preprocess(RMAXABS_TEST_TEMPLATE, { - "TEST_NAME": test_name.upper().replace("UKERNEL_", ""), - "TEST_ARGS": [ukernel], - "DATATYPE": datatype, - "BATCH_TILE": batch_tile, - "ISA_CHECK": xnncommon.generate_isa_check_macro(isa), - "next_prime": next_prime, - }) - - -def main(args): - options = parser.parse_args(args) - - with codecs.open(options.spec, "r", encoding="utf-8") as spec_file: - spec_yaml = yaml.safe_load(spec_file) - if not isinstance(spec_yaml, list): - raise ValueError("expected a list of micro-kernels in the spec") - - tests = """\ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: {specification} -// Generator: {generator} - - -#include -#include "xnnpack/common.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/rmaxabs.h" -#include "rmaxabs-microkernel-tester.h" -""".format(specification=options.spec, generator=sys.argv[0]) - - for ukernel_spec in spec_yaml: - name = ukernel_spec["name"] - batch_tile, arch, isa = split_ukernel_name(name) - - test_case = generate_test_cases(name, batch_tile, isa) - tests += "\n\n" + xnncommon.postprocess_test_case(test_case, arch, isa) - - xnncommon.overwrite_if_changed(options.output, tests) - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/tools/generate-vlshift-test.py b/tools/generate-vlshift-test.py deleted file mode 100755 index 639f7aba75f..00000000000 --- a/tools/generate-vlshift-test.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import codecs -import math -import os -import re -import sys -import yaml - -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from primes import next_prime -import xngen -import xnncommon - - -parser = argparse.ArgumentParser(description='VLShift microkernel test generator') -parser.add_argument("-s", "--spec", metavar="FILE", required=True, - help="Specification (YAML) file") -parser.add_argument("-o", "--output", metavar="FILE", required=True, - help='Output (C++ source) file') -parser.set_defaults(defines=list()) - - -def split_ukernel_name(name): - match = re.fullmatch(r"xnn_i16_vlshift_ukernel__(.+)_u(\d+)(v)?", name) - assert match is not None - batch_tile = int(match.group(2)) - - arch, isa, assembly = xnncommon.parse_target_name(target_name=match.group(1)) - return batch_tile, arch, isa - - -VLSHIFT_TEST_TEMPLATE = """\ -TEST(${TEST_NAME}, batch_eq_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - VLShiftMicrokernelTester() - .batch(${BATCH_TILE}) - .Test(${", ".join(TEST_ARGS)}); -} - -$if BATCH_TILE > 1: - TEST(${TEST_NAME}, batch_div_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = ${BATCH_TILE*2}; batch < ${BATCH_TILE*10}; batch += ${BATCH_TILE}) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(${", ".join(TEST_ARGS)}); - } - } - - TEST(${TEST_NAME}, batch_lt_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = 1; batch < ${BATCH_TILE}; batch++) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(${", ".join(TEST_ARGS)}); - } - } - -TEST(${TEST_NAME}, batch_gt_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = ${BATCH_TILE+1}; batch < ${10 if BATCH_TILE == 1 else BATCH_TILE*2}; batch++) { - VLShiftMicrokernelTester() - .batch(batch) - .Test(${", ".join(TEST_ARGS)}); - } -} - -TEST(${TEST_NAME}, inplace) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = 1; batch <= ${BATCH_TILE*5}; batch += ${max(1, BATCH_TILE-1)}) { - 
VLShiftMicrokernelTester() - .batch(batch) - .inplace(true) - .iterations(1) - .Test(${", ".join(TEST_ARGS)}); - } -} - -TEST(${TEST_NAME}, shift) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (uint32_t shift = 0; shift < 16; shift++) { - VLShiftMicrokernelTester() - .batch(${BATCH_TILE}) - .shift(shift) - .Test(${", ".join(TEST_ARGS)}); - } -} - -""" - - -def generate_test_cases(ukernel, batch_tile, isa): - """Generates all tests cases for a VLShift micro-kernel. - - Args: - ukernel: C name of the micro-kernel function. - batch_tile: Number of batch processed per one iteration of the inner - loop of the micro-kernel. - isa: instruction set required to run the micro-kernel. Generated unit test - will skip execution if the host processor doesn't support this ISA. - - Returns: - Code for the test case. - """ - _, test_name = ukernel.split("_", 1) - _, datatype, ukernel_type, _ = ukernel.split("_", 3) - return xngen.preprocess(VLSHIFT_TEST_TEMPLATE, { - "TEST_NAME": test_name.upper().replace("UKERNEL_", ""), - "TEST_ARGS": [ukernel], - "DATATYPE": datatype, - "BATCH_TILE": batch_tile, - "ISA_CHECK": xnncommon.generate_isa_check_macro(isa), - "next_prime": next_prime, - }) - - -def main(args): - options = parser.parse_args(args) - - with codecs.open(options.spec, "r", encoding="utf-8") as spec_file: - spec_yaml = yaml.safe_load(spec_file) - if not isinstance(spec_yaml, list): - raise ValueError("expected a list of micro-kernels in the spec") - - tests = """\ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: {specification} -// Generator: {generator} - - -#include -#include "xnnpack/common.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/vlshift.h" -#include "vlshift-microkernel-tester.h" -""".format(specification=options.spec, generator=sys.argv[0]) - - for ukernel_spec in spec_yaml: - name = ukernel_spec["name"] - batch_tile, arch, isa = split_ukernel_name(name) - - test_case = generate_test_cases(name, batch_tile, isa) - tests += "\n\n" + xnncommon.postprocess_test_case(test_case, arch, isa) - - xnncommon.overwrite_if_changed(options.output, tests) - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/tools/generate-vsquareabs-test.py b/tools/generate-vsquareabs-test.py deleted file mode 100755 index f624c1f9010..00000000000 --- a/tools/generate-vsquareabs-test.py +++ /dev/null @@ -1,143 +0,0 @@ -#!/usr/bin/env python -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import codecs -import math -import os -import re -import sys -import yaml - -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from primes import next_prime -import xngen -import xnncommon - - -parser = argparse.ArgumentParser(description='VSquareAbs microkernel test generator') -parser.add_argument("-s", "--spec", metavar="FILE", required=True, - help="Specification (YAML) file") -parser.add_argument("-o", "--output", metavar="FILE", required=True, - help='Output (C++ source) file') -parser.set_defaults(defines=list()) - - -def split_ukernel_name(name): - match = re.fullmatch(r"xnn_cs16_vsquareabs_ukernel__(.+)_x(\d+)", name) - assert match is not None - batch_tile = int(match.group(2)) - - arch, isa, assembly = xnncommon.parse_target_name(target_name=match.group(1)) - return batch_tile, arch, isa - - -VSQUAREABS_TEST_TEMPLATE = """\ -TEST(${TEST_NAME}, batch_eq_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - VSquareAbsMicrokernelTester() - .batch(${BATCH_TILE}) - .Test(${", ".join(TEST_ARGS)}); -} - -$if BATCH_TILE > 1: - TEST(${TEST_NAME}, batch_div_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = ${BATCH_TILE*2}; batch < ${BATCH_TILE*10}; batch += ${BATCH_TILE}) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(${", ".join(TEST_ARGS)}); - } - } - - TEST(${TEST_NAME}, batch_lt_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = 1; batch < ${BATCH_TILE}; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(${", ".join(TEST_ARGS)}); - } - } - -TEST(${TEST_NAME}, batch_gt_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = ${BATCH_TILE+1}; batch < ${10 if BATCH_TILE == 1 else BATCH_TILE*2}; batch++) { - VSquareAbsMicrokernelTester() - .batch(batch) - .Test(${", ".join(TEST_ARGS)}); - } -} - -""" - - -def generate_test_cases(ukernel, batch_tile, isa): - """Generates all tests cases for a VSquareAbs micro-kernel. 
- - Args: - ukernel: C name of the micro-kernel function. - batch_tile: Number of batch processed per one iteration of the inner - loop of the micro-kernel. - isa: instruction set required to run the micro-kernel. Generated unit test - will skip execution if the host processor doesn't support this ISA. - - Returns: - Code for the test case. - """ - _, test_name = ukernel.split("_", 1) - _, datatype, ukernel_type, _ = ukernel.split("_", 3) - return xngen.preprocess(VSQUAREABS_TEST_TEMPLATE, { - "TEST_NAME": test_name.upper().replace("UKERNEL_", ""), - "TEST_ARGS": [ukernel], - "DATATYPE": datatype, - "BATCH_TILE": batch_tile, - "ISA_CHECK": xnncommon.generate_isa_check_macro(isa), - "next_prime": next_prime, - }) - - -def main(args): - options = parser.parse_args(args) - - with codecs.open(options.spec, "r", encoding="utf-8") as spec_file: - spec_yaml = yaml.safe_load(spec_file) - if not isinstance(spec_yaml, list): - raise ValueError("expected a list of micro-kernels in the spec") - - tests = """\ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: {specification} -// Generator: {generator} - - -#include -#include "xnnpack/common.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/vsquareabs.h" -#include "vsquareabs-microkernel-tester.h" -""".format(specification=options.spec, generator=sys.argv[0]) - - for ukernel_spec in spec_yaml: - name = ukernel_spec["name"] - batch_tile, arch, isa = split_ukernel_name(name) - - test_case = generate_test_cases(name, batch_tile, isa) - tests += "\n\n" + xnncommon.postprocess_test_case(test_case, arch, isa) - - xnncommon.overwrite_if_changed(options.output, tests) - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/tools/generate-window-test.py b/tools/generate-window-test.py deleted file mode 100755 index a8ddc00c88a..00000000000 --- a/tools/generate-window-test.py +++ /dev/null @@ -1,228 +0,0 @@ -#!/usr/bin/env python -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import codecs -import math -import os -import re -import sys -import yaml - -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from primes import next_prime -import xngen -import xnncommon - - -parser = argparse.ArgumentParser(description='Window microkernel test generator') -parser.add_argument("-s", "--spec", metavar="FILE", required=True, - help="Specification (YAML) file") -parser.add_argument("-o", "--output", metavar="FILE", required=True, - help='Output (C++ source) file') -parser.set_defaults(defines=list()) - - -def split_ukernel_name(name): - shift = 0 - row_tile = 1 - match = re.fullmatch(r"xnn_s16_window(_shift(\d+))?_ukernel__(.+)_u(\d+)(v)?", name) - assert match is not None - if match.group(2): - shift = int(match.group(2)) - channels_tile = int(match.group(4)) - - arch, isa, assembly = xnncommon.parse_target_name(target_name=match.group(3)) - return shift, row_tile, channels_tile, arch, isa - - -WINDOW_TEST_TEMPLATE = """\ -TEST(${TEST_NAME}, channels_eq_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - WindowMicrokernelTester() - .rows(1) - .channels(${BATCH_TILE}) - .shift(${SHIFT}) - .Test(${", ".join(TEST_ARGS)}); -} - -$if BATCH_TILE > 1: - TEST(${TEST_NAME}, channels_div_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t channels = ${BATCH_TILE*2}; channels < ${BATCH_TILE*10}; channels += ${BATCH_TILE}) { - WindowMicrokernelTester() - .channels(channels) - .shift(${SHIFT}) - .Test(${", ".join(TEST_ARGS)}); - } - } - - TEST(${TEST_NAME}, channels_lt_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t channels = 1; channels < ${BATCH_TILE}; channels++) { - WindowMicrokernelTester() - .channels(channels) - .shift(${SHIFT}) - .Test(${", ".join(TEST_ARGS)}); - } - } - -TEST(${TEST_NAME}, channels_gt_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t channels = ${BATCH_TILE+1}; channels < ${10 if BATCH_TILE == 1 else BATCH_TILE*2}; channels++) { - WindowMicrokernelTester() - 
.channels(channels) - .shift(${SHIFT}) - .Test(${", ".join(TEST_ARGS)}); - } -} - -$if ROW_TILE > 1: - TEST(${TEST_NAME}, rows_lt_${ROW_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t rows = 1; rows < ${ROW_TILE}; rows++) { - for (size_t channels = 1; channels <= ${BATCH_TILE*5}; channels += ${max(1, BATCH_TILE-1)}) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(${SHIFT}) - .Test(${", ".join(TEST_ARGS)}); - } - } - } - - TEST(${TEST_NAME}, rows_div_${ROW_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t rows = ${ROW_TILE*2}; rows <= ${ROW_TILE*4}; rows += ${ROW_TILE}) { - for (size_t channels = 1; channels <= ${BATCH_TILE*5}; channels += ${max(1, BATCH_TILE-1)}) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(${SHIFT}) - .Test(${", ".join(TEST_ARGS)}); - } - } - } - -TEST(${TEST_NAME}, rows_gt_${ROW_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t rows = ${ROW_TILE+1}; rows < ${ROW_TILE*2}; rows++) { - for (size_t channels = 1; channels <= ${BATCH_TILE*5}; channels += ${max(1, BATCH_TILE-1)}) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(${SHIFT}) - .Test(${", ".join(TEST_ARGS)}); - } - } -} - -TEST(${TEST_NAME}, inplace) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t rows = 1; rows <= ${ROW_TILE*3}; rows += ${max(1, ROW_TILE-1)}) { - for (size_t channels = 1; channels <= ${BATCH_TILE*5}; channels += ${max(1, BATCH_TILE-1)}) { - WindowMicrokernelTester() - .rows(rows) - .channels(channels) - .shift(${SHIFT}) - .inplace(true) - .iterations(1) - .Test(${", ".join(TEST_ARGS)}); - } - } -} - -$if SHIFT == 0: - TEST(${TEST_NAME}, shift) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (uint32_t shift = 0; shift < 32; shift++) { - WindowMicrokernelTester() - .rows(${ROW_TILE}) - .channels(${BATCH_TILE}) - .shift(shift) - .Test(${", ".join(TEST_ARGS)}); - } - } -""" - - -def generate_test_cases(ukernel, shift, row_tile, channels_tile, isa): - """Generates all tests cases for 
a Window micro-kernel. - - Args: - ukernel: C name of the micro-kernel function. - shift: Shift by constant value. - row_tile: Number of rows (pixels) processed per one iteration of the outer - loop of the micro-kernel. - channels_tile: Number of channels processed per one iteration of the inner - loop of the micro-kernel. - isa: instruction set required to run the micro-kernel. Generated unit test - will skip execution if the host processor doesn't support this ISA. - - Returns: - Code for the test case. - """ - _, test_name = ukernel.split("_", 1) - _, datatype, ukernel_type, _ = ukernel.split("_", 3) - return xngen.preprocess(WINDOW_TEST_TEMPLATE, { - "TEST_NAME": test_name.upper().replace("UKERNEL_", ""), - "TEST_ARGS": [ukernel], - "DATATYPE": datatype, - "SHIFT": shift, - "ROW_TILE": row_tile, - "BATCH_TILE": channels_tile, - "ISA_CHECK": xnncommon.generate_isa_check_macro(isa), - "next_prime": next_prime, - }) - - -def main(args): - options = parser.parse_args(args) - - with codecs.open(options.spec, "r", encoding="utf-8") as spec_file: - spec_yaml = yaml.safe_load(spec_file) - if not isinstance(spec_yaml, list): - raise ValueError("expected a list of micro-kernels in the spec") - - tests = """\ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: {specification} -// Generator: {generator} - - -#include -#include "xnnpack/common.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/window.h" -#include "window-microkernel-tester.h" -""".format(specification=options.spec, generator=sys.argv[0]) - - for ukernel_spec in spec_yaml: - name = ukernel_spec["name"] - shift, row_tile, channels_tile, arch, isa = split_ukernel_name(name) - - test_case = generate_test_cases(name, shift, row_tile, channels_tile, isa) - tests += "\n\n" + xnncommon.postprocess_test_case(test_case, arch, isa) - - xnncommon.overwrite_if_changed(options.output, tests) - - -if __name__ == "__main__": - main(sys.argv[1:]) From 77179ffc82ae80185939b5fcdc1590cd8f034b2a Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Thu, 19 Sep 2024 12:22:20 -0700 Subject: [PATCH 06/50] Fix compiler detected overflow PiperOrigin-RevId: 676515795 --- test/vcvt-microkernel-tester.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/vcvt-microkernel-tester.h b/test/vcvt-microkernel-tester.h index c1624b5d083..f1f7689b27c 100644 --- a/test/vcvt-microkernel-tester.h +++ b/test/vcvt-microkernel-tester.h @@ -126,10 +126,8 @@ VCvtMicrokernelTester make_vcvt_tester() { return VCvtMicrokernelTester() .qmin(std::numeric_limits::min()) .qmax(std::numeric_limits::max()) - .output_zero_point((static_cast(std::numeric_limits::min()) + - static_cast(std::numeric_limits::max()) + - 1) / - 2); + .output_zero_point(std::numeric_limits::min() / 2 + + std::numeric_limits::max() / 2 + 1); } else { return VCvtMicrokernelTester(); } From 6500f4cd74eb61fe8326bf95269aafb1e6a02d0f Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Thu, 19 Sep 2024 12:24:51 -0700 Subject: [PATCH 07/50] Fix warnings due to reinterpreting xnn_float16* as uint16_t* PiperOrigin-RevId: 676516812 --- src/f16-ibilinear/gen/f16-ibilinear-neonfp16arith-c16.c | 4 ++-- src/f16-ibilinear/gen/f16-ibilinear-neonfp16arith-c8.c | 4 ++-- src/f16-ibilinear/neonfp16arith.c.in 
| 4 ++-- ...f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32-acc2.c | 2 +- ...f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32-acc4.c | 2 +- .../gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32.c | 2 +- ...f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40-acc2.c | 2 +- ...f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40-acc5.c | 2 +- .../gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40.c | 2 +- ...f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48-acc2.c | 2 +- ...f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48-acc3.c | 2 +- .../gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48.c | 2 +- ...f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u64-acc2.c | 2 +- ...f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u64-acc4.c | 2 +- .../gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u64.c | 2 +- ...f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u72-acc3.c | 2 +- .../gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u72.c | 2 +- ...f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u80-acc2.c | 2 +- ...f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u80-acc5.c | 2 +- .../gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u80.c | 2 +- ...f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u96-acc2.c | 2 +- ...f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u96-acc3.c | 2 +- ...f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u96-acc6.c | 2 +- .../gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u96.c | 2 +- src/f16-raddstoreexpminusmax/neonfp16arith-rr2-p2.c.in | 2 +- src/f16-velu/gen/f16-velu-neonfp16arith-rr1-p3-u16.c | 6 +++--- src/f16-velu/gen/f16-velu-neonfp16arith-rr1-p3-u8.c | 6 +++--- src/f16-velu/neonfp16arith-rr1-p3.c.in | 6 +++--- src/f16-vlrelu/gen/f16-vlrelu-neonfp16arith-u16.c | 2 +- src/f16-vlrelu/gen/f16-vlrelu-neonfp16arith-u8.c | 2 +- src/f16-vlrelu/neonfp16arith.c.in | 2 +- 31 files changed, 40 insertions(+), 40 deletions(-) diff --git a/src/f16-ibilinear/gen/f16-ibilinear-neonfp16arith-c16.c b/src/f16-ibilinear/gen/f16-ibilinear-neonfp16arith-c16.c index 
68908529f09..3db2bf3c283 100644 --- a/src/f16-ibilinear/gen/f16-ibilinear-neonfp16arith-c16.c +++ b/src/f16-ibilinear/gen/f16-ibilinear-neonfp16arith-c16.c @@ -37,8 +37,8 @@ void xnn_f16_ibilinear_ukernel__neonfp16arith_c16( const uint16_t* i3 = (const uint16_t*) ((uintptr_t) input[3] + input_offset); input += 4; - const float16x8_t valphah = vreinterpretq_f16_u16(vld1q_dup_u16(weights)); weights = (const xnn_float16*) weights + 1; - const float16x8_t valphav = vreinterpretq_f16_u16(vld1q_dup_u16(weights)); weights = (const xnn_float16*) weights + 1; + const float16x8_t valphah = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)weights)); weights = weights + 1; + const float16x8_t valphav = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)weights)); weights = weights + 1; size_t c = channels; for (; c >= 16 * sizeof(uint16_t); c -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-ibilinear/gen/f16-ibilinear-neonfp16arith-c8.c b/src/f16-ibilinear/gen/f16-ibilinear-neonfp16arith-c8.c index e3f6b254e71..8879c2f94f6 100644 --- a/src/f16-ibilinear/gen/f16-ibilinear-neonfp16arith-c8.c +++ b/src/f16-ibilinear/gen/f16-ibilinear-neonfp16arith-c8.c @@ -37,8 +37,8 @@ void xnn_f16_ibilinear_ukernel__neonfp16arith_c8( const uint16_t* i3 = (const uint16_t*) ((uintptr_t) input[3] + input_offset); input += 4; - const float16x8_t valphah = vreinterpretq_f16_u16(vld1q_dup_u16(weights)); weights = (const xnn_float16*) weights + 1; - const float16x8_t valphav = vreinterpretq_f16_u16(vld1q_dup_u16(weights)); weights = (const xnn_float16*) weights + 1; + const float16x8_t valphah = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)weights)); weights = weights + 1; + const float16x8_t valphav = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)weights)); weights = weights + 1; size_t c = channels; for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-ibilinear/neonfp16arith.c.in b/src/f16-ibilinear/neonfp16arith.c.in index 47bb3a1785d..2c7f1876eb1 
100644 --- a/src/f16-ibilinear/neonfp16arith.c.in +++ b/src/f16-ibilinear/neonfp16arith.c.in @@ -37,8 +37,8 @@ void xnn_f16_ibilinear_ukernel__neonfp16arith_c${CHANNEL_TILE}${"" if PIXEL_TILE const uint16_t* i3 = (const uint16_t*) ((uintptr_t) input[3] + input_offset); input += 4; - const float16x8_t valphah = vreinterpretq_f16_u16(vld1q_dup_u16(weights)); weights = (const xnn_float16*) weights + 1; - const float16x8_t valphav = vreinterpretq_f16_u16(vld1q_dup_u16(weights)); weights = (const xnn_float16*) weights + 1; + const float16x8_t valphah = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)weights)); weights = weights + 1; + const float16x8_t valphav = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)weights)); weights = weights + 1; size_t c = channels; $if CHANNEL_TILE > 8: diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32-acc2.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32-acc2.c index c3be0d34d90..d088b550356 100644 --- a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32-acc2.c +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32-acc2.c @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u32_acc2( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32-acc4.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32-acc4.c index 9e2535a5af8..d649436963a 100644 --- a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32-acc4.c +++ 
b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32-acc4.c @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u32_acc4( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32.c index 8f19b870374..315c0fddbec 100644 --- a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32.c +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32.c @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u32( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40-acc2.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40-acc2.c index 0e0dcdb491e..fbc71cf0300 100644 --- a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40-acc2.c +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40-acc2.c @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u40_acc2( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = 
vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40-acc5.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40-acc5.c index 34fd425e23b..2bcf34c870e 100644 --- a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40-acc5.c +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40-acc5.c @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u40_acc5( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40.c index 1d308661c71..d160f4a197e 100644 --- a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40.c +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40.c @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u40( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48-acc2.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48-acc2.c index e33a6c8373b..bf68c6d5552 100644 --- 
a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48-acc2.c +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48-acc2.c @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u48_acc2( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48-acc3.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48-acc3.c index e83c152c707..918064dabed 100644 --- a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48-acc3.c +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48-acc3.c @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u48_acc3( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48.c index 8d2c2180485..cf46977f106 100644 --- a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48.c +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48.c @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u48( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = 
vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u64-acc2.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u64-acc2.c index 132e57e4666..0a1202921be 100644 --- a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u64-acc2.c +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u64-acc2.c @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u64_acc2( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u64-acc4.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u64-acc4.c index a505828a36b..b96f4690e9b 100644 --- a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u64-acc4.c +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u64-acc4.c @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u64_acc4( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u64.c 
b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u64.c index 63987e994d6..4208190dd2c 100644 --- a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u64.c +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u64.c @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u64( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u72-acc3.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u72-acc3.c index 77d178de0a8..545f2928dae 100644 --- a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u72-acc3.c +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u72-acc3.c @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u72_acc3( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u72.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u72.c index bc0ab4c2fb6..03d45f5242f 100644 --- a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u72.c +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u72.c @@ -46,7 +46,7 @@ void 
xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u72( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u80-acc2.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u80-acc2.c index 063ca579f90..e39e1c08ac7 100644 --- a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u80-acc2.c +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u80-acc2.c @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u80_acc2( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u80-acc5.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u80-acc5.c index 2d859dd09d6..77fce68a5f9 100644 --- a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u80-acc5.c +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u80-acc5.c @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u80_acc5( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = 
(uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u80.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u80.c index bad1a9fc288..68b09281a77 100644 --- a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u80.c +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u80.c @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u80( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u96-acc2.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u96-acc2.c index c3d902eec43..12cc377e912 100644 --- a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u96-acc2.c +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u96-acc2.c @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u96_acc2( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u96-acc3.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u96-acc3.c index f72fca68147..0346fcf1fd4 100644 --- a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u96-acc3.c +++ 
b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u96-acc3.c @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u96_acc3( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u96-acc6.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u96-acc6.c index d8c072a0a80..93f12e36e1d 100644 --- a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u96-acc6.c +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u96-acc6.c @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u96_acc6( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u96.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u96.c index ddd592ca45d..806d3b7d2e1 100644 --- a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u96.c +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u96.c @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u96( XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = 
vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-raddstoreexpminusmax/neonfp16arith-rr2-p2.c.in b/src/f16-raddstoreexpminusmax/neonfp16arith-rr2-p2.c.in index df836e96bec..909117a734d 100644 --- a/src/f16-raddstoreexpminusmax/neonfp16arith-rr2-p2.c.in +++ b/src/f16-raddstoreexpminusmax/neonfp16arith-rr2-p2.c.in @@ -46,7 +46,7 @@ void xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u${BATCH_TILE}${ XNN_FORCE_REALIZATION(vc1); XNN_FORCE_REALIZATION(vdenorm_cutoff); - const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16(max)); + const float16x8_t vi_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*)max)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-velu/gen/f16-velu-neonfp16arith-rr1-p3-u16.c b/src/f16-velu/gen/f16-velu-neonfp16arith-rr1-p3-u16.c index ffb42f3ae14..ea5ea771581 100644 --- a/src/f16-velu/gen/f16-velu-neonfp16arith-rr1-p3-u16.c +++ b/src/f16-velu/gen/f16-velu-neonfp16arith-rr1-p3-u16.c @@ -40,9 +40,9 @@ void xnn_f16_velu_ukernel__neonfp16arith_rr1_p3_u16( XNN_FORCE_REALIZATION(vc3); XNN_FORCE_REALIZATION(vc2); - const float16x8_t vprescale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->scalar.prescale)); - const float16x8_t vminus_alpha = vnegq_f16(vreinterpretq_f16_u16(vld1q_dup_u16(&params->scalar.alpha))); - const float16x8_t vbeta = vreinterpretq_f16_u16(vld1q_dup_u16(&params->scalar.beta)); + const float16x8_t vprescale = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.prescale)); + const float16x8_t vminus_alpha = vnegq_f16(vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.alpha))); + const float16x8_t vbeta = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.beta)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-velu/gen/f16-velu-neonfp16arith-rr1-p3-u8.c
b/src/f16-velu/gen/f16-velu-neonfp16arith-rr1-p3-u8.c index e87eb8b8387..7fb4305dee8 100644 --- a/src/f16-velu/gen/f16-velu-neonfp16arith-rr1-p3-u8.c +++ b/src/f16-velu/gen/f16-velu-neonfp16arith-rr1-p3-u8.c @@ -40,9 +40,9 @@ void xnn_f16_velu_ukernel__neonfp16arith_rr1_p3_u8( XNN_FORCE_REALIZATION(vc3); XNN_FORCE_REALIZATION(vc2); - const float16x8_t vprescale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->scalar.prescale)); - const float16x8_t vminus_alpha = vnegq_f16(vreinterpretq_f16_u16(vld1q_dup_u16(&params->scalar.alpha))); - const float16x8_t vbeta = vreinterpretq_f16_u16(vld1q_dup_u16(&params->scalar.beta)); + const float16x8_t vprescale = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.prescale)); + const float16x8_t vminus_alpha = vnegq_f16(vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.alpha))); + const float16x8_t vbeta = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.beta)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-velu/neonfp16arith-rr1-p3.c.in b/src/f16-velu/neonfp16arith-rr1-p3.c.in index 6d253231826..6460812b341 100644 --- a/src/f16-velu/neonfp16arith-rr1-p3.c.in +++ b/src/f16-velu/neonfp16arith-rr1-p3.c.in @@ -39,9 +39,9 @@ void xnn_f16_velu_ukernel__neonfp16arith_rr1_p3_u${BATCH_TILE}( XNN_FORCE_REALIZATION(vc3); XNN_FORCE_REALIZATION(vc2); - const float16x8_t vprescale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->scalar.prescale)); - const float16x8_t vminus_alpha = vnegq_f16(vreinterpretq_f16_u16(vld1q_dup_u16(&params->scalar.alpha))); - const float16x8_t vbeta = vreinterpretq_f16_u16(vld1q_dup_u16(&params->scalar.beta)); + const float16x8_t vprescale = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.prescale)); + const float16x8_t vminus_alpha = vnegq_f16(vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.alpha))); + const float16x8_t vbeta = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.beta));
const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; diff --git a/src/f16-vlrelu/gen/f16-vlrelu-neonfp16arith-u16.c b/src/f16-vlrelu/gen/f16-vlrelu-neonfp16arith-u16.c index 3f2cd3aea4d..d872060e589 100644 --- a/src/f16-vlrelu/gen/f16-vlrelu-neonfp16arith-u16.c +++ b/src/f16-vlrelu/gen/f16-vlrelu-neonfp16arith-u16.c @@ -26,7 +26,7 @@ void xnn_f16_vlrelu_ukernel__neonfp16arith_u16( assert(input != NULL); assert(output != NULL); - const float16x8_t vslope = vreinterpretq_f16_u16(vld1q_dup_u16(&params->scalar.slope)); + const float16x8_t vslope = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.slope)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-vlrelu/gen/f16-vlrelu-neonfp16arith-u8.c b/src/f16-vlrelu/gen/f16-vlrelu-neonfp16arith-u8.c index d7201be2e3d..9ff2ff4adb6 100644 --- a/src/f16-vlrelu/gen/f16-vlrelu-neonfp16arith-u8.c +++ b/src/f16-vlrelu/gen/f16-vlrelu-neonfp16arith-u8.c @@ -26,7 +26,7 @@ void xnn_f16_vlrelu_ukernel__neonfp16arith_u8( assert(input != NULL); assert(output != NULL); - const float16x8_t vslope = vreinterpretq_f16_u16(vld1q_dup_u16(&params->scalar.slope)); + const float16x8_t vslope = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.slope)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-vlrelu/neonfp16arith.c.in b/src/f16-vlrelu/neonfp16arith.c.in index 24b774ae5b9..e66ce08d0f3 100644 --- a/src/f16-vlrelu/neonfp16arith.c.in +++ b/src/f16-vlrelu/neonfp16arith.c.in @@ -25,7 +25,7 @@ void xnn_f16_vlrelu_ukernel__neonfp16arith_u${BATCH_TILE}( assert(input != NULL); assert(output != NULL); - const float16x8_t vslope = vreinterpretq_f16_u16(vld1q_dup_u16(&params->scalar.slope)); + const float16x8_t vslope = vreinterpretq_f16_u16(vld1q_dup_u16((const
uint16_t*) &params->scalar.slope)); const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; $if BATCH_TILE > 4: From 439a4d816cc25d774f7c1fef19e454612234b2c6 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Thu, 19 Sep 2024 12:25:41 -0700 Subject: [PATCH 08/50] Fix incorrect types leading to warnings on some compilers. PiperOrigin-RevId: 676517065 --- bench/bf16-gemm.cc | 6 ++++-- bench/f16-conv-hwc2chw.cc | 2 +- bench/f16-dwconv.cc | 4 ++-- bench/f16-dwconv2d-chw.cc | 2 +- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/bench/bf16-gemm.cc b/bench/bf16-gemm.cc index daba852b39d..c8aaf61a570 100644 --- a/bench/bf16-gemm.cc +++ b/bench/bf16-gemm.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -66,12 +67,13 @@ static void bf16_gemm(benchmark::State& state, reinterpret_cast(w.data()), /*extra_bytes=*/0, /*params=*/nullptr); std::vector c(c_elements * num_buffers); - std::fill(c.begin(), c.end(), UINT16_C(0x7FC0) /* NaN */); + std::fill(c.begin(), c.end(), std::nanf("")); // Prepare minmax parameters.
xnn_bf16_minmax_params params; init_params(&params, - UINT16_C(0xFF80) /* -inf */, UINT16_C(0x7F80) /* inf */); + -std::numeric_limits::infinity(), + +std::numeric_limits::infinity()); size_t buffer_index = 0; for (auto _ : state) { diff --git a/bench/f16-conv-hwc2chw.cc b/bench/f16-conv-hwc2chw.cc index f97cce8dd52..f17b98729cb 100644 --- a/bench/f16-conv-hwc2chw.cc +++ b/bench/f16-conv-hwc2chw.cc @@ -66,7 +66,7 @@ static void f16_conv_hwc2chw(benchmark::State& state, sizeof(xnn_float16) * (weights_elements + output_elements)); std::vector> packed_weights(weights_elements * num_buffers); - std::fill(packed_weights.begin(), packed_weights.end(), UINT16_C(0)); + std::fill(packed_weights.begin(), packed_weights.end(), 0); xnn_pack_f16_dconv_oki_w( output_channels, input_channels, output_channels_tile, kernel_size /* kernel height */, kernel_size /* kernel width */, diff --git a/bench/f16-dwconv.cc b/bench/f16-dwconv.cc index 6f52c588d8d..cc2adb8d3e1 100644 --- a/bench/f16-dwconv.cc +++ b/bench/f16-dwconv.cc @@ -87,7 +87,7 @@ static void f16_dwconv(benchmark::State& state, sizeof(xnn_float16) * (w_elements + c_elements) + sizeof(void*) * i_elements); std::vector> w(w_elements * num_buffers); - std::fill(w.begin(), w.end(), UINT16_C(0)); + std::fill(w.begin(), w.end(), 0); xnn_pack_f16_dwconv_ghw_w(primary_tile, 0, 0, kernel_height, kernel_width, channels, channel_tile, channel_tile, /*channel_round=*/1, reinterpret_cast(k.data()), @@ -223,7 +223,7 @@ static void f16_dwconv(benchmark::State& state, sizeof(xnn_float16) * (w_elements + c_elements) + sizeof(void*) * i_elements); std::vector> w(w_elements * num_buffers); - std::fill(w.begin(), w.end(), UINT16_C(0)); + std::fill(w.begin(), w.end(), 0); xnn_pack_f16_dwconv_ghw_w( first_pass_tile, middle_pass_tile, last_pass_tile, kernel_height, kernel_width, diff --git a/bench/f16-dwconv2d-chw.cc b/bench/f16-dwconv2d-chw.cc index 15f848d8a29..d5cc12c8395 100644 --- a/bench/f16-dwconv2d-chw.cc +++ b/bench/f16-dwconv2d-chw.cc @@
-98,7 +98,7 @@ static void f16_dwconv2d_chw(benchmark::State& state, sizeof(xnn_float16) * (w_elements + o_elements)); std::vector> packed_weights(w_elements * num_buffers); - std::fill(packed_weights.begin(), packed_weights.end(), UINT16_C(0)); + std::fill(packed_weights.begin(), packed_weights.end(), 0); for (size_t c = 0; c < channels; c++) { packed_weights[c * kernel_size + c] = bias[c]; for (size_t i = 0; i < kernel_size; i++) { From 338483a7981ec3b5be1e6018807881da9776d4bc Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Thu, 19 Sep 2024 12:25:56 -0700 Subject: [PATCH 09/50] Remove remenants of vsqrtshift kernel PiperOrigin-RevId: 676517134 --- src/xnnpack/microfnptr.h | 8 ----- src/xnnpack/vunary.h | 9 ----- test/vunary-microkernel-tester.cc | 55 ------------------------------- test/vunary-microkernel-tester.h | 3 -- tools/generate-vunary-test.py | 1 - 5 files changed, 76 deletions(-) diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h index 4572c74f9ba..9b300dc3920 100644 --- a/src/xnnpack/microfnptr.h +++ b/src/xnnpack/microfnptr.h @@ -2055,14 +2055,6 @@ typedef void (*xnn_f32_vsqrt_ukernel_fn)( float* output, const struct xnn_f32_sqrt_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -// VSQRTSHIFT: Vector SQuare RooT and SHIFT elementwise - -typedef void (*xnn_u64_u32_vsqrtshift_ukernel_fn)( - size_t batch, - const uint64_t* input, - uint32_t* output, - uint32_t shift); - // VRSQRT: Vector Reciprocal SQuare RooT elementwise typedef void (*xnn_f16_vrsqrt_ukernel_fn)( diff --git a/src/xnnpack/vunary.h b/src/xnnpack/vunary.h index 7e7842636a5..5aefeceb748 100644 --- a/src/xnnpack/vunary.h +++ b/src/xnnpack/vunary.h @@ -110,15 +110,6 @@ extern "C" { #undef XNN_UKERNEL #undef XNN_UKERNEL_WITH_PARAMS -#define DECLARE_U64_U32_VSQRTSHIFT_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t n, \ - const uint64_t* x, \ - uint32_t* y, \ - uint32_t shift); - 
-DECLARE_U64_U32_VSQRTSHIFT_UKERNEL_FUNCTION(xnn_u64_u32_vsqrtshift_ukernel__scalar_cvtu32_sqrt_cvtu32f64_u1) - #define DECLARE_XX_VUNARY_UKERNEL_FUNCTION(fn_name) \ XNN_INTERNAL void fn_name( \ diff --git a/test/vunary-microkernel-tester.cc b/test/vunary-microkernel-tester.cc index 5324ffd7b6a..ffa6e29a5fe 100644 --- a/test/vunary-microkernel-tester.cc +++ b/test/vunary-microkernel-tester.cc @@ -438,58 +438,3 @@ void VUnaryMicrokernelTester::Test(xnn_u8_vclamp_ukernel_fn vclamp, } } } - -void VUnaryMicrokernelTester::Test(xnn_u64_u32_vsqrtshift_ukernel_fn vsqrtshift, - uint32_t, Default) const { - ASSERT_FALSE(inplace()); - - xnnpack::ReplicableRandomDevice rng; - auto u64rng = - std::bind(std::uniform_int_distribution(), std::ref(rng)); - - std::vector x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint64_t)); - std::vector y(batch_size()); - std::vector y_ref(batch_size()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(x.begin(), x.end(), std::ref(u64rng)); - std::fill(y.begin(), y.end(), UINT32_C(0xDEADBEEF)); - - // Compute reference results. - for (size_t i = 0; i < batch_size(); i++) { - const uint64_t x_value = x[i]; - uint32_t y_value = 0; - // Match TFLM semantics, including bugs - if (static_cast(x_value) == x_value) { - y_value = - static_cast(std::lrint(std::sqrt(static_cast( - static_cast(static_cast(x_value)))))); - y_value = - std::min(y_value, std::numeric_limits::max()); - } else if (x_value != 0) { - uint64_t y0 = x_value >> 1; - uint64_t y1 = (y0 + x_value / y0) >> 1; - do { - y0 = y1; - y1 = (y0 + x_value / y0) >> 1; - } while (y1 < y0); - - // y0 is sqrt(x_value) rounded down, round up if needed - if (static_cast(y0 * y0 + y0 - x_value) < 0) { - y0 += 1; - } - y_value = static_cast( - std::min(y0, std::numeric_limits::max())); - } - y_ref[i] = y_value >> shift(); - } - - // Call optimized micro-kernel. - vsqrtshift(batch_size() * sizeof(uint64_t), x.data(), y.data(), shift()); - - // Verify results. 
- for (size_t i = 0; i < batch_size(); i++) { - EXPECT_EQ(y_ref[i], y[i]) << "at " << i << " / " << batch_size() << ", x[" - << i << "]: " << x[i] << ", shift: " << shift(); - } - } -} diff --git a/test/vunary-microkernel-tester.h b/test/vunary-microkernel-tester.h index c99d0db8c43..8293d2769c9 100644 --- a/test/vunary-microkernel-tester.h +++ b/test/vunary-microkernel-tester.h @@ -271,9 +271,6 @@ class VUnaryMicrokernelTester { xnn_init_u8_minmax_params_fn init_params, Default = Default()) const; - void Test(xnn_u64_u32_vsqrtshift_ukernel_fn vsqrtshift, uint32_t, - Default = Default()) const; - private: // Generic test function for `vunary` kernels. // diff --git a/tools/generate-vunary-test.py b/tools/generate-vunary-test.py index bb632ac7c95..810b7e53925 100755 --- a/tools/generate-vunary-test.py +++ b/tools/generate-vunary-test.py @@ -55,7 +55,6 @@ "vsigmoid": "Sigmoid", "vsqr": "Square", "vsqrt": "SquareRoot", - "vsqrtshift": "SquareRootShift", "vtanh": "TanH", } From 6819c3284851964b2c1c9648e24e12a2cfe88a57 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Thu, 19 Sep 2024 16:15:02 -0700 Subject: [PATCH 10/50] Use header tables for dwconv kernels This doesn't touch benchmarks yet, because they are not generated currently, and have a lot of issues that need to be resolved first. Some of the dwconv kernels were missing listing in the yaml files, hence not getting test coverage, including some that were used in production configs. This issue was detected automatically by this change, and this change fixes the problem. 
PiperOrigin-RevId: 676602059 --- BUILD.bazel | 16 + CMakeLists.txt | 2 +- scripts/generate-tests.sh | 34 +- src/f16-dwconv/f16-dwconv-minmax-multipass.h | 51 + src/f16-dwconv/f16-dwconv-minmax-unipass.h | 63 + src/f32-dwconv/f32-dwconv-minmax-multipass.h | 128 + src/f32-dwconv/f32-dwconv-minmax-unipass.h | 237 ++ src/f32-dwconv/f32-dwconv-multipass.h | 36 + src/f32-dwconv/f32-dwconv-unipass.h | 49 + .../qs8-dwconv-minmax-multipass-fp32.h | 156 + .../qs8-dwconv-minmax-multipass-rndnu.h | 39 + .../qs8-dwconv-minmax-unipass-fp32.h | 119 + .../qs8-dwconv-minmax-unipass-rndnu.h | 33 + .../qs8-qc8w-dwconv-minmax-multipass-fp32.h | 192 + .../qs8-qc8w-dwconv-minmax-unipass-fp32.h | 172 + .../qu8-dwconv-minmax-multipass-fp32.h | 120 + .../qu8-dwconv-minmax-multipass-rndnu.h | 30 + .../qu8-dwconv-minmax-unipass-fp32.h | 91 + .../qu8-dwconv-minmax-unipass-rndnu.h | 27 + src/xnnpack/dwconv.h | 1726 +------- test/dwconv-microkernel-tester.cc | 4 +- test/dwconv-microkernel-tester.h | 4 +- test/f16-dwconv-minmax-multipass.cc | 766 +--- test/f16-dwconv-minmax-multipass.yaml | 80 - test/f16-dwconv-minmax-unipass.cc | 926 +---- test/f16-dwconv-minmax-unipass.yaml | 104 - test/f32-dwconv-minmax-multipass.cc | 1703 +------- test/f32-dwconv-minmax-multipass.yaml | 190 - test/f32-dwconv-minmax-unipass.cc | 3655 +---------------- test/f32-dwconv-minmax-unipass.yaml | 436 -- test/f32-dwconv-multipass.cc | 234 +- test/f32-dwconv-multipass.yaml | 24 - test/f32-dwconv-unipass.cc | 477 +-- test/f32-dwconv-unipass.yaml | 42 - test/qs8-dwconv-minmax-multipass-fp32.cc | 2655 +----------- test/qs8-dwconv-minmax-multipass-fp32.yaml | 288 -- test/qs8-dwconv-minmax-multipass-rndnu.cc | 613 +-- test/qs8-dwconv-minmax-multipass-rndnu.yaml | 60 - test/qs8-dwconv-minmax-unipass-fp32.cc | 1775 +------- test/qs8-dwconv-minmax-unipass-fp32.yaml | 208 - test/qs8-dwconv-minmax-unipass-rndnu.cc | 442 +- test/qs8-dwconv-minmax-unipass-rndnu.yaml | 49 - test/qs8-qc8w-dwconv-minmax-multipass-fp32.cc | 3434 
+--------------- ...qs8-qc8w-dwconv-minmax-multipass-fp32.yaml | 360 -- test/qs8-qc8w-dwconv-minmax-unipass-fp32.cc | 2622 +----------- test/qs8-qc8w-dwconv-minmax-unipass-fp32.yaml | 296 -- test/qu8-dwconv-minmax-multipass-fp32.cc | 1917 +-------- test/qu8-dwconv-minmax-multipass-fp32.yaml | 216 - test/qu8-dwconv-minmax-multipass-rndnu.cc | 424 +- test/qu8-dwconv-minmax-multipass-rndnu.yaml | 42 - test/qu8-dwconv-minmax-unipass-fp32.cc | 1259 +----- test/qu8-dwconv-minmax-unipass-fp32.yaml | 152 - test/qu8-dwconv-minmax-unipass-rndnu.cc | 332 +- test/qu8-dwconv-minmax-unipass-rndnu.yaml | 37 - tools/generate-dwconv-multipass-test.py | 223 +- tools/generate-dwconv-unipass-test.py | 226 +- 56 files changed, 2409 insertions(+), 27187 deletions(-) create mode 100644 src/f16-dwconv/f16-dwconv-minmax-multipass.h create mode 100644 src/f16-dwconv/f16-dwconv-minmax-unipass.h create mode 100644 src/f32-dwconv/f32-dwconv-minmax-multipass.h create mode 100644 src/f32-dwconv/f32-dwconv-minmax-unipass.h create mode 100644 src/f32-dwconv/f32-dwconv-multipass.h create mode 100644 src/f32-dwconv/f32-dwconv-unipass.h create mode 100644 src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h create mode 100644 src/qs8-dwconv/qs8-dwconv-minmax-multipass-rndnu.h create mode 100644 src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h create mode 100644 src/qs8-dwconv/qs8-dwconv-minmax-unipass-rndnu.h create mode 100644 src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h create mode 100644 src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h create mode 100644 src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h create mode 100644 src/qu8-dwconv/qu8-dwconv-minmax-multipass-rndnu.h create mode 100644 src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h create mode 100644 src/qu8-dwconv/qu8-dwconv-minmax-unipass-rndnu.h delete mode 100644 test/f16-dwconv-minmax-multipass.yaml delete mode 100644 test/f16-dwconv-minmax-unipass.yaml delete mode 100644 test/f32-dwconv-minmax-multipass.yaml delete 
mode 100644 test/f32-dwconv-minmax-unipass.yaml delete mode 100644 test/f32-dwconv-multipass.yaml delete mode 100644 test/f32-dwconv-unipass.yaml delete mode 100644 test/qs8-dwconv-minmax-multipass-fp32.yaml delete mode 100644 test/qs8-dwconv-minmax-multipass-rndnu.yaml delete mode 100644 test/qs8-dwconv-minmax-unipass-fp32.yaml delete mode 100644 test/qs8-dwconv-minmax-unipass-rndnu.yaml delete mode 100644 test/qs8-qc8w-dwconv-minmax-multipass-fp32.yaml delete mode 100644 test/qs8-qc8w-dwconv-minmax-unipass-fp32.yaml delete mode 100644 test/qu8-dwconv-minmax-multipass-fp32.yaml delete mode 100644 test/qu8-dwconv-minmax-multipass-rndnu.yaml delete mode 100644 test/qu8-dwconv-minmax-unipass-fp32.yaml delete mode 100644 test/qu8-dwconv-minmax-unipass-rndnu.yaml diff --git a/BUILD.bazel b/BUILD.bazel index bff2ee97e90..2d41fc143cb 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -302,6 +302,22 @@ MICROKERNEL_DEFS = [ "src/xx-transposev/xx-transposev.h", "src/xx-fill/xx-fill.h", "src/xx-pad/xx-pad.h", + "src/f16-dwconv/f16-dwconv-minmax-unipass.h", + "src/f32-dwconv/f32-dwconv-minmax-unipass.h", + "src/f32-dwconv/f32-dwconv-unipass.h", + "src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h", + "src/qs8-dwconv/qs8-dwconv-minmax-unipass-rndnu.h", + "src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h", + "src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h", + "src/qu8-dwconv/qu8-dwconv-minmax-unipass-rndnu.h", + "src/f16-dwconv/f16-dwconv-minmax-multipass.h", + "src/f32-dwconv/f32-dwconv-minmax-multipass.h", + "src/f32-dwconv/f32-dwconv-multipass.h", + "src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h", + "src/qs8-dwconv/qs8-dwconv-minmax-multipass-rndnu.h", + "src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h", + "src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h", + "src/qu8-dwconv/qu8-dwconv-minmax-multipass-rndnu.h", ] MICROKERNEL_HDRS = [ diff --git a/CMakeLists.txt b/CMakeLists.txt index 2728c26380c..6b0e25c7b27 100644 --- a/CMakeLists.txt +++ 
b/CMakeLists.txt @@ -1571,7 +1571,7 @@ IF(XNNPACK_BUILD_TESTS) qu8-dwconv-minmax-unipass-rndnu) FOREACH(TEST ${MICROKERNEL_DWCONV_UNIT_TESTS}) ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) - TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test) + TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE . include src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE dwconv-microkernel-tester fp16 diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh index 8e44081cac3..5d295936fc3 100755 --- a/scripts/generate-tests.sh +++ b/scripts/generate-tests.sh @@ -190,8 +190,6 @@ tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vta tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel s8-vclamp --output test/s8-vclamp.cc & tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel u8-vclamp --output test/u8-vclamp.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel u64-u32-vsqrtshift --output test/u64-u32-vsqrtshift.cc & - ### Tests for VLRelu micro-kernels tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f16-vlrelu --output test/f16-vlrelu.cc & tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vlrelu --output test/f32-vlrelu.cc & @@ -231,27 +229,27 @@ tools/generate-conv-hwc2chw-test.py --spec test/f16-conv-hwc2chw.yaml --output t tools/generate-conv-hwc2chw-test.py --spec test/f32-conv-hwc2chw.yaml --output test/f32-conv-hwc2chw.cc & ### Tests for DWConv micro-kernels -tools/generate-dwconv-unipass-test.py --spec test/f16-dwconv-minmax-unipass.yaml --output test/f16-dwconv-minmax-unipass.cc & -tools/generate-dwconv-multipass-test.py --spec test/f16-dwconv-minmax-multipass.yaml --output test/f16-dwconv-minmax-multipass.cc & +tools/generate-dwconv-unipass-test.py --ukernel f16-dwconv-minmax-unipass --output test/f16-dwconv-minmax-unipass.cc & +tools/generate-dwconv-multipass-test.py --ukernel f16-dwconv-minmax-multipass 
--output test/f16-dwconv-minmax-multipass.cc & -tools/generate-dwconv-unipass-test.py --spec test/f32-dwconv-unipass.yaml --output test/f32-dwconv-unipass.cc & -tools/generate-dwconv-unipass-test.py --spec test/f32-dwconv-minmax-unipass.yaml --output test/f32-dwconv-minmax-unipass.cc & -tools/generate-dwconv-multipass-test.py --spec test/f32-dwconv-multipass.yaml --output test/f32-dwconv-multipass.cc & -tools/generate-dwconv-multipass-test.py --spec test/f32-dwconv-minmax-multipass.yaml --output test/f32-dwconv-minmax-multipass.cc & +tools/generate-dwconv-unipass-test.py --ukernel f32-dwconv-unipass --output test/f32-dwconv-unipass.cc & +tools/generate-dwconv-unipass-test.py --ukernel f32-dwconv-minmax-unipass --output test/f32-dwconv-minmax-unipass.cc & +tools/generate-dwconv-multipass-test.py --ukernel f32-dwconv-multipass --output test/f32-dwconv-multipass.cc & +tools/generate-dwconv-multipass-test.py --ukernel f32-dwconv-minmax-multipass --output test/f32-dwconv-minmax-multipass.cc & -tools/generate-dwconv-unipass-test.py --spec test/qs8-qc8w-dwconv-minmax-unipass-fp32.yaml --output test/qs8-qc8w-dwconv-minmax-unipass-fp32.cc & -tools/generate-dwconv-unipass-test.py --spec test/qs8-dwconv-minmax-unipass-fp32.yaml --output test/qs8-dwconv-minmax-unipass-fp32.cc & -tools/generate-dwconv-unipass-test.py --spec test/qu8-dwconv-minmax-unipass-fp32.yaml --output test/qu8-dwconv-minmax-unipass-fp32.cc & +tools/generate-dwconv-unipass-test.py --ukernel qs8-qc8w-dwconv-minmax-unipass-fp32 --output test/qs8-qc8w-dwconv-minmax-unipass-fp32.cc & +tools/generate-dwconv-unipass-test.py --ukernel qs8-dwconv-minmax-unipass-fp32 --output test/qs8-dwconv-minmax-unipass-fp32.cc & +tools/generate-dwconv-unipass-test.py --ukernel qu8-dwconv-minmax-unipass-fp32 --output test/qu8-dwconv-minmax-unipass-fp32.cc & -tools/generate-dwconv-unipass-test.py --spec test/qs8-dwconv-minmax-unipass-rndnu.yaml --output test/qs8-dwconv-minmax-unipass-rndnu.cc & 
-tools/generate-dwconv-unipass-test.py --spec test/qu8-dwconv-minmax-unipass-rndnu.yaml --output test/qu8-dwconv-minmax-unipass-rndnu.cc & +tools/generate-dwconv-unipass-test.py --ukernel qs8-dwconv-minmax-unipass-rndnu --output test/qs8-dwconv-minmax-unipass-rndnu.cc & +tools/generate-dwconv-unipass-test.py --ukernel qu8-dwconv-minmax-unipass-rndnu --output test/qu8-dwconv-minmax-unipass-rndnu.cc & -tools/generate-dwconv-multipass-test.py --spec test/qs8-qc8w-dwconv-minmax-multipass-fp32.yaml --output test/qs8-qc8w-dwconv-minmax-multipass-fp32.cc & -tools/generate-dwconv-multipass-test.py --spec test/qs8-dwconv-minmax-multipass-fp32.yaml --output test/qs8-dwconv-minmax-multipass-fp32.cc & -tools/generate-dwconv-multipass-test.py --spec test/qu8-dwconv-minmax-multipass-fp32.yaml --output test/qu8-dwconv-minmax-multipass-fp32.cc & +tools/generate-dwconv-multipass-test.py --ukernel qs8-qc8w-dwconv-minmax-multipass-fp32 --output test/qs8-qc8w-dwconv-minmax-multipass-fp32.cc & +tools/generate-dwconv-multipass-test.py --ukernel qs8-dwconv-minmax-multipass-fp32 --output test/qs8-dwconv-minmax-multipass-fp32.cc & +tools/generate-dwconv-multipass-test.py --ukernel qu8-dwconv-minmax-multipass-fp32 --output test/qu8-dwconv-minmax-multipass-fp32.cc & -tools/generate-dwconv-multipass-test.py --spec test/qs8-dwconv-minmax-multipass-rndnu.yaml --output test/qs8-dwconv-minmax-multipass-rndnu.cc & -tools/generate-dwconv-multipass-test.py --spec test/qu8-dwconv-minmax-multipass-rndnu.yaml --output test/qu8-dwconv-minmax-multipass-rndnu.cc & +tools/generate-dwconv-multipass-test.py --ukernel qs8-dwconv-minmax-multipass-rndnu --output test/qs8-dwconv-minmax-multipass-rndnu.cc & +tools/generate-dwconv-multipass-test.py --ukernel qu8-dwconv-minmax-multipass-rndnu --output test/qu8-dwconv-minmax-multipass-rndnu.cc & ### Tests for DWConv CHW layout micro-kernels tools/generate-dwconv2d-chw-test.py --spec test/f16-dwconv2d-chw.yaml --output test/f16-dwconv2d-chw.cc & diff --git 
a/src/f16-dwconv/f16-dwconv-minmax-multipass.h b/src/f16-dwconv/f16-dwconv-minmax-multipass.h new file mode 100644 index 00000000000..3671651870e --- /dev/null +++ b/src/f16-dwconv/f16-dwconv-minmax-multipass.h @@ -0,0 +1,51 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +// Arguments are: +// XNN_DWCONV_UNIPASS(arch, name, first_pass_tile, middle_pass_tile, last_pass_tile, channel_tile, channel_subtile, channel_round, datatype, weights_type, buffer_type,params_type, init_fn) + +#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith, 5, 5, 5, 8, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith_acc2, 5, 5, 5, 8, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith, 5, 5, 5, 16, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith_acc2, 5, 5, 5, 16, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith, 5, 5, 5, 32, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith_acc2, 5, 5, 5, 32, 8, 4, 
xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith, 6, 6, 7, 8, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith_acc2, 6, 6, 7, 8, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith, 6, 6, 7, 16, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith_acc2, 6, 6, 7, 16, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith, 6, 6, 7, 32, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith_acc2, 6, 6, 7, 32, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith, 8, 8, 9, 8, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith_acc2, 8, 8, 9, 8, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) 
+XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith, 8, 8, 9, 16, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith_acc2, 8, 8, 9, 16, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith, 8, 8, 9, 32, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith_acc2, 8, 8, 9, 32, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3, 5, 5, 5, 8, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3_acc2, 5, 5, 5, 8, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3, 5, 5, 5, 16, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3_acc2, 5, 5, 5, 16, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, 
xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3, 5, 5, 5, 32, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3_acc2, 5, 5, 5, 32, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3, 6, 6, 7, 8, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3_acc2, 6, 6, 7, 8, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3, 6, 6, 7, 16, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3_acc2, 6, 6, 7, 16, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3, 6, 6, 7, 32, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3_acc2, 6, 6, 7, 32, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3, 8, 8, 9, 8, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3_acc2, 8, 
8, 9, 8, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3, 8, 8, 9, 16, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3_acc2, 8, 8, 9, 16, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3, 8, 8, 9, 32, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3_acc2, 8, 8, 9, 32, 8, 4, xnn_float16, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + diff --git a/src/f16-dwconv/f16-dwconv-minmax-unipass.h b/src/f16-dwconv/f16-dwconv-minmax-unipass.h new file mode 100644 index 00000000000..d6d8301e7c6 --- /dev/null +++ b/src/f16-dwconv/f16-dwconv-minmax-unipass.h @@ -0,0 +1,63 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +// Arguments are: +// XNN_DWCONV_UNIPASS(arch, name, c_block, pipelined, cr, kr, datatype, weights_type,params_type, init_fn) + +#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_3p8c__neonfp16arith, 8, false, 8, 3, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_3p8c__neonfp16arith_acc2, 8, false, 8, 3, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_3p16c__neonfp16arith, 16, false, 16, 3, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_3p16c__neonfp16arith_acc2, 16, false, 16, 3, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_3p32c__neonfp16arith, 32, false, 32, 3, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_3p32c__neonfp16arith_acc2, 32, false, 32, 3, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith, 8, false, 8, 4, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith_acc2, 8, false, 8, 4, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith, 16, false, 
16, 4, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith_acc2, 16, false, 16, 4, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith, 32, false, 32, 4, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith_acc2, 32, false, 32, 4, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith, 8, false, 8, 9, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith_acc2, 8, false, 8, 9, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith, 16, false, 16, 9, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith_acc2, 16, false, 16, 9, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith, 32, false, 32, 9, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith_acc2, 32, false, 32, 9, xnn_float16, xnn_float16, union xnn_f16_minmax_params, 
xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith, 8, false, 8, 25, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith_acc2, 8, false, 8, 25, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith, 16, false, 16, 25, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith_acc2, 16, false, 16, 25, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith, 32, false, 32, 25, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fp16_arith, xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith_acc2, 32, false, 32, 25, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_3p8c__fma3, 8, false, 8, 3, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_3p8c__fma3_acc2, 8, false, 8, 3, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_3p16c__fma3, 16, false, 16, 3, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) 
+XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_3p16c__fma3_acc2, 16, false, 16, 3, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_3p32c__fma3, 32, false, 32, 3, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_3p32c__fma3_acc2, 32, false, 32, 3, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_4p8c__fma3, 8, false, 8, 4, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_4p8c__fma3_acc2, 8, false, 8, 4, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_4p16c__fma3, 16, false, 16, 4, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_4p16c__fma3_acc2, 16, false, 16, 4, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_4p32c__fma3, 32, false, 32, 4, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_4p32c__fma3_acc2, 32, false, 32, 4, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_9p8c__fma3, 8, false, 8, 9, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, 
xnn_f16_dwconv_minmax_ukernel_9p8c__fma3_acc2, 8, false, 8, 9, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_9p16c__fma3, 16, false, 16, 9, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_9p16c__fma3_acc2, 16, false, 16, 9, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_9p32c__fma3, 32, false, 32, 9, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_9p32c__fma3_acc2, 32, false, 32, 9, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_25p8c__fma3, 8, false, 8, 25, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_25p8c__fma3_acc2, 8, false, 8, 25, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_25p16c__fma3, 16, false, 16, 25, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_25p16c__fma3_acc2, 16, false, 16, 25, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_25p32c__fma3, 32, false, 32, 25, xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f16_dwconv_minmax_ukernel_25p32c__fma3_acc2, 32, false, 32, 25, 
xnn_float16, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + diff --git a/src/f32-dwconv/f32-dwconv-minmax-multipass.h b/src/f32-dwconv/f32-dwconv-minmax-multipass.h new file mode 100644 index 00000000000..659f4aa6f7c --- /dev/null +++ b/src/f32-dwconv/f32-dwconv-minmax-multipass.h @@ -0,0 +1,128 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +// Arguments are: +// XNN_DWCONV_UNIPASS(arch, name, first_pass_tile, middle_pass_tile, last_pass_tile, channel_tile, channel_subtile, channel_round, datatype, weights_type, buffer_type,params_type, init_fn) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__neon, 5, 5, 5, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__neon_acc2, 5, 5, 5, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__neonfma, 5, 5, 5, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__neonfma_acc2, 5, 5, 5, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neon, 5, 5, 5, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neon_acc2, 5, 5, 5, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) 
+XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neonfma, 5, 5, 5, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neonfma_acc2, 5, 5, 5, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__neon, 6, 6, 7, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__neon_acc2, 6, 6, 7, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__neonfma, 6, 6, 7, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__neonfma_acc2, 6, 6, 7, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__neon, 6, 6, 7, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__neon_acc2, 6, 6, 7, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__neonfma, 6, 6, 7, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__neonfma_acc2, 6, 6, 7, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, 
xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neon, 8, 8, 9, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neon_acc2, 8, 8, 9, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neonfma, 8, 8, 9, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neonfma_acc2, 8, 8, 9, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__neon, 8, 8, 9, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__neon_acc2, 8, 8, 9, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__neonfma, 8, 8, 9, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__neonfma_acc2, 8, 8, 9, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__sse, 5, 5, 5, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__sse_acc2, 5, 5, 5, 4, 4, 4, float, 
float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__sse, 5, 5, 5, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__sse_acc2, 5, 5, 5, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_5f5m5l16c4s4r__sse, 5, 5, 5, 16, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_5f5m5l16c4s4r__sse_acc2, 5, 5, 5, 16, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__sse, 6, 6, 7, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__sse_acc2, 6, 6, 7, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__sse, 6, 6, 7, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__sse_acc2, 6, 6, 7, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_6f6m7l16c4s4r__sse, 6, 6, 7, 16, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_6f6m7l16c4s4r__sse_acc2, 6, 6, 7, 16, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__sse, 8, 8, 9, 4, 4, 4, float, float, float, union 
xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__sse_acc2, 8, 8, 9, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__sse, 8, 8, 9, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__sse_acc2, 8, 8, 9, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_8f8m9l16c4s4r__sse, 8, 8, 9, 16, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_8f8m9l16c4s4r__sse_acc2, 8, 8, 9, 16, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_5f5m5l8c8s4r__avx, 5, 5, 5, 8, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_5f5m5l8c8s4r__avx_acc2, 5, 5, 5, 8, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_5f5m5l16c8s4r__avx, 5, 5, 5, 16, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_5f5m5l16c8s4r__avx_acc2, 5, 5, 5, 16, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_6f6m7l8c8s4r__avx, 6, 6, 7, 8, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, 
xnn_f32_dwconv_minmax_ukernel_6f6m7l8c8s4r__avx_acc2, 6, 6, 7, 8, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_6f6m7l16c8s4r__avx, 6, 6, 7, 16, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_6f6m7l16c8s4r__avx_acc2, 6, 6, 7, 16, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_8f8m9l8c8s4r__avx, 8, 8, 9, 8, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_8f8m9l8c8s4r__avx_acc2, 8, 8, 9, 8, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_8f8m9l16c8s4r__avx, 8, 8, 9, 16, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_8f8m9l16c8s4r__avx_acc2, 8, 8, 9, 16, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3, 5, 5, 5, 8, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3_acc2, 5, 5, 5, 8, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3, 5, 5, 5, 16, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, 
xnn_f32_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3_acc2, 5, 5, 5, 16, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3, 5, 5, 5, 32, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3_acc2, 5, 5, 5, 32, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_7f6m6l8c8s4r__fma3, 7, 6, 6, 8, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_7f6m6l8c8s4r__fma3_acc2, 7, 6, 6, 8, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_7f6m6l16c8s4r__fma3, 7, 6, 6, 16, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_7f6m6l16c8s4r__fma3_acc2, 7, 6, 6, 16, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_7f6m6l32c8s4r__fma3, 7, 6, 6, 32, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_7f6m6l32c8s4r__fma3_acc2, 7, 6, 6, 32, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f, 5, 5, 5, 16, 16, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512f, 
xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f_acc2, 5, 5, 5, 16, 16, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f, 5, 5, 5, 32, 16, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f_acc2, 5, 5, 5, 32, 16, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_3f3m3l4c4s4r__wasmsimd_arm, 3, 3, 3, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_3f3m3l4c4s4r__wasmsimd_arm_acc2, 3, 3, 3, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_3f3m3l8c4s4r__wasmsimd_arm, 3, 3, 3, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_3f3m3l8c4s4r__wasmsimd_arm_acc2, 3, 3, 3, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_3f3m3l4c4s4r__wasmsimd_x86, 3, 3, 3, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_3f3m3l4c4s4r__wasmsimd_x86_acc2, 3, 3, 3, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_3f3m3l8c4s4r__wasmsimd_x86, 3, 3, 3, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) 
+XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_3f3m3l8c4s4r__wasmsimd_x86_acc2, 3, 3, 3, 8, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_arm, 5, 5, 5, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_arm_acc2, 5, 5, 5, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_x86, 5, 5, 5, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_x86_acc2, 5, 5, 5, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd, 5, 5, 5, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_acc2, 5, 5, 5, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma, 5, 5, 5, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma_acc2, 5, 5, 5, 4, 4, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +#endif // XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_3f3m3l1c1s1r__wasm, 3, 3, 3, 
1, 1, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_3f3m3l1c1s1r__wasm_acc2, 3, 3, 3, 1, 1, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__wasm, 5, 5, 5, 1, 1, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__wasm_acc2, 5, 5, 5, 1, 1, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_6f6m7l1c1s1r__wasm, 6, 6, 7, 1, 1, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_6f6m7l1c1s1r__wasm_acc2, 6, 6, 7, 1, 1, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_8f8m9l1c1s1r__wasm, 8, 8, 9, 1, 1, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_8f8m9l1c1s1r__wasm_acc2, 8, 8, 9, 1, 1, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_2f2m2l1c1s1r__scalar, 2, 2, 2, 1, 1, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_2f2m2l1c1s1r__scalar_acc2, 2, 2, 2, 1, 1, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_2f2m2l4c1s1r__scalar, 2, 2, 2, 4, 1, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) 
+XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_2f2m2l4c1s1r__scalar_acc2, 2, 2, 2, 4, 1, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__scalar, 5, 5, 5, 1, 1, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__scalar_acc2, 5, 5, 5, 1, 1, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_6f6m7l1c1s1r__scalar, 6, 6, 7, 1, 1, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_6f6m7l1c1s1r__scalar_acc2, 6, 6, 7, 1, 1, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_8f8m9l1c1s1r__scalar, 8, 8, 9, 1, 1, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_minmax_ukernel_8f8m9l1c1s1r__scalar_acc2, 8, 8, 9, 1, 1, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) + diff --git a/src/f32-dwconv/f32-dwconv-minmax-unipass.h b/src/f32-dwconv/f32-dwconv-minmax-unipass.h new file mode 100644 index 00000000000..9a5630b1b1c --- /dev/null +++ b/src/f32-dwconv/f32-dwconv-minmax-unipass.h @@ -0,0 +1,237 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +// Arguments are: +// XNN_DWCONV_UNIPASS(arch, name, c_block, pipelined, cr, kr, datatype, weights_type,params_type, init_fn) + +#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_9p4c__asm_aarch64_neonfma, 4, false, 4, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_9p4c__asm_aarch64_neonfma_cortex_a55, 4, true, 4, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_3p4c__neon, 4, false, 4, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_3p4c__neon_acc2, 4, false, 4, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_3p4c__neonfma, 4, false, 4, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_3p4c__neonfma_acc2, 4, false, 4, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_3p8c__neon, 8, false, 8, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_3p8c__neon_acc2, 8, false, 8, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_3p8c__neonfma, 8, false, 8, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_3p8c__neonfma_acc2, 8, false, 
8, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_3p16c__neon, 16, false, 16, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_3p16c__neon_acc2, 16, false, 16, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_3p16c__neonfma, 16, false, 16, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_3p16c__neonfma_acc2, 16, false, 16, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_4p4c__neon, 4, false, 4, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_4p4c__neon_acc2, 4, false, 4, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_4p4c__neonfma, 4, false, 4, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_4p4c__neonfma_acc2, 4, false, 4, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_4p8c__neon, 8, false, 8, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_4p8c__neon_acc2, 8, false, 8, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_4p8c__neonfma, 8, false, 8, 
4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_4p8c__neonfma_acc2, 8, false, 8, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_4p16c__neon, 16, false, 16, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_4p16c__neon_acc2, 16, false, 16, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_4p16c__neonfma, 16, false, 16, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_4p16c__neonfma_acc2, 16, false, 16, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_9p4c__neon, 4, false, 4, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_9p4c__neon_acc2, 4, false, 4, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_9p4c__neonfma, 4, false, 4, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_9p4c__neonfma_acc2, 4, false, 4, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_9p8c__neon, 8, false, 8, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_9p8c__neon_acc2, 8, false, 
8, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_9p8c__neonfma, 8, false, 8, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_9p8c__neonfma_acc2, 8, false, 8, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_9p16c__neon, 16, false, 16, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_9p16c__neon_acc2, 16, false, 16, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_9p16c__neonfma, 16, false, 16, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_9p16c__neonfma_acc2, 16, false, 16, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_25p4c__neon, 4, false, 4, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_25p4c__neon_acc2, 4, false, 4, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_25p4c__neonfma, 4, false, 4, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_25p4c__neonfma_acc2, 4, false, 4, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, 
xnn_f32_dwconv_minmax_ukernel_25p8c__neon, 8, false, 8, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_25p8c__neon_acc2, 8, false, 8, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_25p8c__neonfma, 8, false, 8, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_25p8c__neonfma_acc2, 8, false, 8, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_25p16c__neon, 16, false, 16, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_f32_dwconv_minmax_ukernel_25p16c__neon_acc2, 16, false, 16, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_25p16c__neonfma, 16, false, 16, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_fma, xnn_f32_dwconv_minmax_ukernel_25p16c__neonfma_acc2, 16, false, 16, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p4c__sse, 4, false, 4, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p4c__sse_acc2, 4, false, 4, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p8c__sse, 8, false, 8, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) 
+XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p8c__sse_acc2, 8, false, 8, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p4c__sse, 4, false, 4, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p4c__sse_acc2, 4, false, 4, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p8c__sse, 8, false, 8, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p8c__sse_acc2, 8, false, 8, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p4c__sse, 4, false, 4, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p4c__sse_acc2, 4, false, 4, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p8c__sse, 8, false, 8, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p8c__sse_acc2, 8, false, 8, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p4c__sse, 4, false, 4, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p4c__sse_acc2, 4, false, 4, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p8c__sse, 8, false, 8, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, 
xnn_f32_dwconv_minmax_ukernel_25p8c__sse_acc2, 8, false, 8, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_3p8c__avx, 8, false, 8, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_3p8c__avx_acc2, 8, false, 8, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_3p16c__avx, 16, false, 16, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_3p16c__avx_acc2, 16, false, 16, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_4p8c__avx, 8, false, 8, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_4p8c__avx_acc2, 8, false, 8, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_4p16c__avx, 16, false, 16, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_4p16c__avx_acc2, 16, false, 16, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_9p8c__avx, 8, false, 8, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_9p8c__avx_acc2, 8, false, 8, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_9p16c__avx, 16, false, 
16, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_9p16c__avx_acc2, 16, false, 16, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_25p8c__avx, 8, false, 8, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_25p8c__avx_acc2, 8, false, 8, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_25p16c__avx, 16, false, 16, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_f32_dwconv_minmax_ukernel_25p16c__avx_acc2, 16, false, 16, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_3p8c__fma3, 8, false, 8, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_3p8c__fma3_acc2, 8, false, 8, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_3p16c__fma3, 16, false, 16, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_3p16c__fma3_acc2, 16, false, 16, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_4p8c__fma3, 8, false, 8, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_4p8c__fma3_acc2, 8, false, 8, 4, float, float, union 
xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_4p16c__fma3, 16, false, 16, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_4p16c__fma3_acc2, 16, false, 16, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_9p8c__fma3, 8, false, 8, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_9p8c__fma3_acc2, 8, false, 8, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_9p16c__fma3, 16, false, 16, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_9p16c__fma3_acc2, 16, false, 16, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_25p8c__fma3, 8, false, 8, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_25p8c__fma3_acc2, 8, false, 8, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_25p16c__fma3, 16, false, 16, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_25p16c__fma3_acc2, 16, false, 16, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_3p16c__avx512f, 16, false, 16, 3, float, float, union 
xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_3p16c__avx512f_acc2, 16, false, 16, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_3p32c__avx512f, 32, false, 32, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_3p32c__avx512f_acc2, 32, false, 32, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_4p16c__avx512f, 16, false, 16, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_4p16c__avx512f_acc2, 16, false, 16, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_4p32c__avx512f, 32, false, 32, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_4p32c__avx512f_acc2, 32, false, 32, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_9p16c__avx512f, 16, false, 16, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_9p16c__avx512f_acc2, 16, false, 16, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_9p32c__avx512f, 32, false, 32, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, 
xnn_f32_dwconv_minmax_ukernel_9p32c__avx512f_acc2, 32, false, 32, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f, 16, false, 16, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f_acc2, 16, false, 16, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_25p32c__avx512f, 32, false, 32, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_25p32c__avx512f_acc2, 32, false, 32, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p4c__wasmsimd_arm, 4, false, 4, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p4c__wasmsimd_arm_acc2, 4, false, 4, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p4c__wasmsimd_x86, 4, false, 4, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p4c__wasmsimd_x86_acc2, 4, false, 4, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p8c__wasmsimd_arm, 8, false, 8, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p8c__wasmsimd_arm_acc2, 8, false, 8, 3, float, float, union xnn_f32_minmax_params, 
xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p8c__wasmsimd_x86, 8, false, 8, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p8c__wasmsimd_x86_acc2, 8, false, 8, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p4c__wasmsimd_arm, 4, false, 4, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p4c__wasmsimd_arm_acc2, 4, false, 4, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p4c__wasmsimd_x86, 4, false, 4, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p4c__wasmsimd_x86_acc2, 4, false, 4, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p8c__wasmsimd_arm, 8, false, 8, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p8c__wasmsimd_arm_acc2, 8, false, 8, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p8c__wasmsimd_x86, 8, false, 8, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p8c__wasmsimd_x86_acc2, 8, false, 8, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_arm, 4, false, 4, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_arm_acc2, 4, false, 4, 9, float, 
float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_x86, 4, false, 4, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_x86_acc2, 4, false, 4, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_arm, 8, false, 8, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_arm_acc2, 8, false, 8, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_x86, 8, false, 8, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_x86_acc2, 8, false, 8, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_arm, 4, false, 4, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_arm_acc2, 4, false, 4, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_x86, 4, false, 4, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_x86_acc2, 4, false, 4, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p8c__wasmsimd_arm, 8, false, 8, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, 
xnn_f32_dwconv_minmax_ukernel_25p8c__wasmsimd_arm_acc2, 8, false, 8, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p8c__wasmsimd_x86, 8, false, 8, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p8c__wasmsimd_x86_acc2, 8, false, 8, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p4c__wasmrelaxedsimd, 4, false, 4, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p4c__wasmrelaxedsimd_acc2, 4, false, 4, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p4c__wasmrelaxedsimd_fma, 4, false, 4, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p4c__wasmrelaxedsimd_fma_acc2, 4, false, 4, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p8c__wasmrelaxedsimd, 8, false, 8, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p8c__wasmrelaxedsimd_acc2, 8, false, 8, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p8c__wasmrelaxedsimd_fma, 8, false, 8, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p8c__wasmrelaxedsimd_fma_acc2, 8, false, 8, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) 
+XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p4c__wasmrelaxedsimd, 4, false, 4, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p4c__wasmrelaxedsimd_acc2, 4, false, 4, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p4c__wasmrelaxedsimd_fma, 4, false, 4, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p4c__wasmrelaxedsimd_fma_acc2, 4, false, 4, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p8c__wasmrelaxedsimd, 8, false, 8, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p8c__wasmrelaxedsimd_acc2, 8, false, 8, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p8c__wasmrelaxedsimd_fma, 8, false, 8, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p8c__wasmrelaxedsimd_fma_acc2, 8, false, 8, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p4c__wasmrelaxedsimd, 4, false, 4, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p4c__wasmrelaxedsimd_acc2, 4, false, 4, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p4c__wasmrelaxedsimd_fma, 4, false, 4, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p4c__wasmrelaxedsimd_fma_acc2, 
4, false, 4, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p8c__wasmrelaxedsimd, 8, false, 8, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p8c__wasmrelaxedsimd_acc2, 8, false, 8, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p8c__wasmrelaxedsimd_fma, 8, false, 8, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p8c__wasmrelaxedsimd_fma_acc2, 8, false, 8, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p4c__wasmrelaxedsimd, 4, false, 4, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p4c__wasmrelaxedsimd_acc2, 4, false, 4, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p4c__wasmrelaxedsimd_fma, 4, false, 4, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p4c__wasmrelaxedsimd_fma_acc2, 4, false, 4, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd, 8, false, 8, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd_acc2, 8, false, 8, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd_fma, 8, false, 8, 25, float, float, union xnn_f32_minmax_params, 
xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd_fma_acc2, 8, false, 8, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +#endif // XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p1c__wasm, 1, false, 1, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p1c__wasm_acc2, 1, false, 1, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p2c__wasm, 2, false, 2, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p2c__wasm_acc2, 2, false, 2, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p1c__wasm, 1, false, 1, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p1c__wasm_acc2, 1, false, 1, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p2c__wasm, 2, false, 2, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p2c__wasm_acc2, 2, false, 2, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p1c__wasm, 1, false, 1, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p1c__wasm_acc2, 1, false, 1, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, 
xnn_f32_dwconv_minmax_ukernel_9p2c__wasm, 2, false, 2, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p2c__wasm_acc2, 2, false, 2, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p1c__wasm, 1, false, 1, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p1c__wasm_acc2, 1, false, 1, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p2c__wasm, 2, false, 2, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p2c__wasm_acc2, 2, false, 2, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p1c__scalar, 1, false, 1, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p1c__scalar_acc2, 1, false, 1, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p2c__scalar, 2, false, 2, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p2c__scalar_acc2, 2, false, 2, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p1c__scalar, 1, false, 1, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p1c__scalar_acc2, 1, false, 1, 4, float, float, union xnn_f32_minmax_params, 
xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p2c__scalar, 2, false, 2, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_4p2c__scalar_acc2, 2, false, 2, 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p1c__scalar, 1, false, 1, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p1c__scalar_acc2, 1, false, 1, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p2c__scalar, 2, false, 2, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_9p2c__scalar_acc2, 2, false, 2, 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p1c__scalar, 1, false, 1, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p1c__scalar_acc2, 1, false, 1, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p2c__scalar, 2, false, 2, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p2c__scalar_acc2, 2, false, 2, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) + diff --git a/src/f32-dwconv/f32-dwconv-multipass.h b/src/f32-dwconv/f32-dwconv-multipass.h new file mode 100644 index 00000000000..bab9de0477b --- /dev/null +++ b/src/f32-dwconv/f32-dwconv-multipass.h @@ -0,0 +1,36 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found 
in the +// LICENSE file in the root directory of this source tree. + +// Arguments are: +// XNN_DWCONV_UNIPASS(arch, name, first_pass_tile, middle_pass_tile, last_pass_tile, channel_tile, channel_subtile, channel_round, datatype, weights_type, buffer_type,params_type, init_fn) + +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_ukernel_2f2m2l1c1s1r__scalar, 2, 2, 2, 1, 1, 1, float, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_ukernel_2f2m2l1c1s1r__scalar_acc2, 2, 2, 2, 1, 1, 1, float, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_ukernel_2f2m2l4c1s1r__scalar, 2, 2, 2, 4, 1, 1, float, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_ukernel_2f2m2l4c1s1r__scalar_acc2, 2, 2, 2, 4, 1, 1, float, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_ukernel_3f3m3l1c1s1r__scalar, 3, 3, 3, 1, 1, 1, float, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_ukernel_3f3m3l1c1s1r__scalar_acc2, 3, 3, 3, 1, 1, 1, float, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_ukernel_5f5m5l1c1s1r__scalar, 5, 5, 5, 1, 1, 1, float, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_ukernel_5f5m5l1c1s1r__scalar_acc2, 5, 5, 5, 1, 1, 1, float, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_ukernel_6f6m7l1c1s1r__scalar, 6, 6, 7, 1, 1, 1, float, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_ukernel_6f6m7l1c1s1r__scalar_acc2, 6, 6, 7, 1, 1, 1, float, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_ukernel_8f8m9l1c1s1r__scalar, 8, 8, 9, 1, 1, 1, float, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_ukernel_8f8m9l1c1s1r__scalar_acc2, 8, 8, 9, 1, 
1, 1, float, float, float, struct xnn_f32_default_params, NULL) + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_ukernel_3f3m3l4c4s4r__wasmsimd, 3, 3, 3, 4, 4, 4, float, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_ukernel_3f3m3l4c4s4r__wasmsimd_acc2, 3, 3, 3, 4, 4, 4, float, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_ukernel_3f3m3l8c4s4r__wasmsimd, 3, 3, 3, 8, 4, 4, float, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_ukernel_3f3m3l8c4s4r__wasmsimd_acc2, 3, 3, 3, 8, 4, 4, float, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmsimd, 5, 5, 5, 4, 4, 4, float, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmsimd_acc2, 5, 5, 5, 4, 4, 4, float, float, float, struct xnn_f32_default_params, NULL) +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma, 5, 5, 5, 4, 4, 4, float, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_MULTIPASS(0, xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma_acc2, 5, 5, 5, 4, 4, 4, float, float, float, struct xnn_f32_default_params, NULL) +#endif // XNN_ARCH_WASMRELAXEDSIMD + + diff --git a/src/f32-dwconv/f32-dwconv-unipass.h b/src/f32-dwconv/f32-dwconv-unipass.h new file mode 100644 index 00000000000..f52415fdb7d --- /dev/null +++ b/src/f32-dwconv/f32-dwconv-unipass.h @@ -0,0 +1,49 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +// Arguments are: +// XNN_DWCONV_UNIPASS(arch, name, c_block, pipelined, cr, kr, datatype, weights_type,params_type, init_fn) + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_3p4c__wasmsimd, 4, false, 4, 3, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_3p8c__wasmsimd, 8, false, 8, 3, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_4p4c__wasmsimd, 4, false, 4, 4, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_4p8c__wasmsimd, 8, false, 8, 4, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_9p4c__wasmsimd, 4, false, 4, 9, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_9p4c__wasmsimd_acc2, 4, false, 4, 9, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_9p8c__wasmsimd, 8, false, 8, 9, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_9p8c__wasmsimd_acc2, 8, false, 8, 9, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_25p4c__wasmsimd, 4, false, 4, 25, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_25p8c__wasmsimd, 8, false, 8, 25, float, float, struct xnn_f32_default_params, NULL) +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_3p4c__wasmrelaxedsimd_fma, 4, false, 4, 3, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_3p8c__wasmrelaxedsimd_fma, 8, false, 8, 3, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_4p4c__wasmrelaxedsimd_fma, 4, false, 4, 4, float, float, struct xnn_f32_default_params, NULL) 
+XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_4p8c__wasmrelaxedsimd_fma, 8, false, 8, 4, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_9p4c__wasmrelaxedsimd_fma, 4, false, 4, 9, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_9p8c__wasmrelaxedsimd_fma, 8, false, 8, 9, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_25p4c__wasmrelaxedsimd_fma, 4, false, 4, 25, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_25p8c__wasmrelaxedsimd_fma, 8, false, 8, 25, float, float, struct xnn_f32_default_params, NULL) +#endif // XNN_ARCH_WASMRELAXEDSIMD + +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_3p1c__scalar, 1, false, 1, 3, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_3p1c__scalar_acc2, 1, false, 1, 3, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_3p2c__scalar, 2, false, 2, 3, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_3p2c__scalar_acc2, 2, false, 2, 3, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_4p1c__scalar, 1, false, 1, 4, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_4p1c__scalar_acc2, 1, false, 1, 4, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_4p2c__scalar, 2, false, 2, 4, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_4p2c__scalar_acc2, 2, false, 2, 4, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_9p1c__scalar, 1, false, 1, 9, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_9p1c__scalar_acc2, 1, false, 1, 9, float, float, 
struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_9p2c__scalar, 2, false, 2, 9, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_9p2c__scalar_acc2, 2, false, 2, 9, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_25p1c__scalar, 1, false, 1, 25, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_25p1c__scalar_acc2, 1, false, 1, 25, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_25p2c__scalar, 2, false, 2, 25, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_25p2c__scalar_acc2, 2, false, 2, 25, float, float, struct xnn_f32_default_params, NULL) + diff --git a/src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h b/src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h new file mode 100644 index 00000000000..e595ae13676 --- /dev/null +++ b/src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h @@ -0,0 +1,156 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +// Arguments are: +// XNN_DWCONV_UNIPASS(arch, name, first_pass_tile, middle_pass_tile, last_pass_tile, channel_tile, channel_subtile, channel_round, datatype, weights_type, buffer_type,params_type, init_fn) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neon_mul16, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neonv8_mul16, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mul16, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mul16, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neon_mul16, 5, 5, 5, 32, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neonv8_mul16, 5, 5, 5, 32, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neon_mul16, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neonv8_mul16, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, 
xnn_init_qs8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mul16, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mul16, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neon_mul16, 6, 6, 7, 32, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neonv8_mul16, 6, 6, 7, 32, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neon_mul16, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neonv8_mul16, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mul16, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mul16, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neon_mul16, 8, 8, 9, 32, 8, 8, int8_t, void, int32_t, union 
xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neonv8_mul16, 8, 8, 9, 32, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neonv8_params) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__sse41_mul32, 5, 5, 5, 8, 4, 4, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16_add16, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16_add16, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__sse41_mul32, 5, 5, 5, 16, 4, 4, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, 
xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16_add16, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16_add16, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__sse41_mul32, 6, 6, 7, 8, 4, 4, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16_add16, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16_add16, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__sse41_mul32, 6, 6, 7, 16, 4, 4, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, 
xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16_add16, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16_add16, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__sse41_mul32, 8, 8, 9, 8, 4, 4, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16_add16, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16_add16, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, 
xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__sse41_mul32, 8, 8, 9, 16, 4, 4, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16_add16, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16_add16, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__avx_mul32, 5, 5, 5, 8, 4, 4, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__avx2_mul32, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__avx_mul32, 5, 5, 5, 16, 4, 4, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__avx2_mul32, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, 
xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_add16_vpunpck, 5, 5, 5, 16, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_vpmovsx, 5, 5, 5, 16, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_vpunpck, 5, 5, 5, 16, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__avx2_mul32, 5, 5, 5, 32, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_add16_vpunpck, 5, 5, 5, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_vpmovsx, 5, 5, 5, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_vpunpck, 5, 5, 5, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__avx_mul32, 6, 6, 7, 8, 4, 4, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__avx2_mul32, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union 
xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__avx_mul32, 6, 6, 7, 16, 4, 4, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__avx2_mul32, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_add16_vpunpck, 6, 6, 7, 16, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_vpmovsx, 6, 6, 7, 16, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_vpunpck, 6, 6, 7, 16, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__avx2_mul32, 6, 6, 7, 32, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_add16_vpunpck, 6, 6, 7, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_vpmovsx, 6, 6, 7, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, 
xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_vpunpck, 6, 6, 7, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__avx_mul32, 8, 8, 9, 8, 4, 4, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__avx2_mul32, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__avx_mul32, 8, 8, 9, 16, 4, 4, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__avx2_mul32, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_add16_vpunpck, 8, 8, 9, 16, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_vpmovsx, 8, 8, 9, 16, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_vpunpck, 8, 8, 9, 16, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__avx2_mul32, 8, 8, 9, 32, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, 
xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_add16_vpunpck, 8, 8, 9, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpmovsx, 8, 8, 9, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpunpck, 8, 8, 9, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s1r__avx512skx_mul32, 5, 5, 5, 16, 16, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s1r__avx512skx_mul32, 5, 5, 5, 32, 16, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s1r__avx512skx_mul32, 6, 6, 7, 16, 16, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s1r__avx512skx_mul32, 6, 6, 7, 32, 16, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s1r__avx512skx_mul32, 8, 8, 9, 16, 16, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, 
xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s1r__avx512skx_mul32, 8, 8, 9, 32, 16, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16_add16, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16_add16, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16_add16, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16_add16, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) 
+XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16_add16, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16_add16, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__wasm_fmagic, 5, 5, 5, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__wasm_fmagic, 5, 5, 5, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__wasm_fmagic, 5, 5, 5, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__wasm_fmagic, 6, 6, 7, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__wasm_fmagic, 6, 6, 7, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, 
xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__wasm_fmagic, 6, 6, 7, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__wasm_fmagic, 8, 8, 9, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__wasm_fmagic, 8, 8, 9, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__wasm_fmagic, 8, 8, 9, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_fmagic, 5, 5, 5, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_imagic, 5, 5, 5, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_lrintf, 5, 5, 5, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_fmagic, 5, 5, 5, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_imagic, 5, 5, 5, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) 
+XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_lrintf, 5, 5, 5, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_fmagic, 5, 5, 5, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_imagic, 5, 5, 5, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_lrintf, 5, 5, 5, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_fmagic, 6, 6, 7, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_imagic, 6, 6, 7, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_lrintf, 6, 6, 7, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_fmagic, 6, 6, 7, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_imagic, 6, 6, 7, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_lrintf, 6, 6, 7, 2, 1, 1, int8_t, 
void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_fmagic, 6, 6, 7, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_imagic, 6, 6, 7, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_lrintf, 6, 6, 7, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_fmagic, 8, 8, 9, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_imagic, 8, 8, 9, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_lrintf, 8, 8, 9, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_fmagic, 8, 8, 9, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_imagic, 8, 8, 9, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_lrintf, 8, 8, 9, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, 
xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_fmagic, 8, 8, 9, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_imagic, 8, 8, 9, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_lrintf, 8, 8, 9, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) + diff --git a/src/qs8-dwconv/qs8-dwconv-minmax-multipass-rndnu.h b/src/qs8-dwconv/qs8-dwconv-minmax-multipass-rndnu.h new file mode 100644 index 00000000000..0b5104b683d --- /dev/null +++ b/src/qs8-dwconv/qs8-dwconv-minmax-multipass-rndnu.h @@ -0,0 +1,39 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +// Arguments are: +// XNN_DWCONV_UNIPASS(arch, name, first_pass_tile, middle_pass_tile, last_pass_tile, channel_tile, channel_subtile, channel_round, datatype, weights_type, buffer_type,params_type, init_fn) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mla8_ld64, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mul8_ld64, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mul16, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mla8_ld64, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mla8_ld128, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul8_ld64, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul8_ld128, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul16, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union 
xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l32c8s8r__neon_mul16, 5, 5, 5, 32, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mla8_ld64, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mul8_ld64, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mul16, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mla8_ld64, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mla8_ld128, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul8_ld64, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul8_ld128, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul16, 6, 6, 7, 
16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l32c8s8r__neon_mul16, 6, 6, 7, 32, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mla8_ld64, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mul8_ld64, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mul16, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mla8_ld64, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mla8_ld128, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul8_ld64, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul8_ld128, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, 
xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul16, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l32c8s8r__neon_mul16, 8, 8, 9, 32, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + + diff --git a/src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h b/src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h new file mode 100644 index 00000000000..f3773aad7bf --- /dev/null +++ b/src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h @@ -0,0 +1,119 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +// Arguments are: +// XNN_DWCONV_UNIPASS(arch, name, c_block, pipelined, cr, kr, datatype, weights_type,params_type, init_fn) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__neon_mul16, 8, false, 8, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__neonv8_mul16, 8, false, 8, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__neon_mul16, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mul16, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__neon_mul16, 32, false, 32, 9, int8_t, void, union 
xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__neonv8_mul16, 32, false, 32, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__neon_mul16, 8, false, 8, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mul16, 8, false, 8, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__neon_mul16, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mul16, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__neon_mul16, 32, false, 32, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__neonv8_mul16, 32, false, 32, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_neonv8_params) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16, 8, false, 8, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16_add16, 8, false, 8, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) 
+XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16, 8, false, 8, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16_add16, 8, false, 8, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul32, 8, false, 8, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16_add16, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16_add16, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul32, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16, 8, false, 8, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16_add16, 8, false, 8, 25, int8_t, void, union xnn_qs8_conv_minmax_params, 
xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16, 8, false, 8, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16_add16, 8, false, 8, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul32, 8, false, 8, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16_add16, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16_add16, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul32, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16, 8, false, 8, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16_add16, 8, false, 8, 9, 
int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul32, 8, false, 8, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__avx2_mul32, 8, false, 8, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16_add16, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul32, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_add16_vpunpck, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_vpmovsx, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_vpunpck, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, 
xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_add16_vpunpck, 32, false, 32, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_vpmovsx, 32, false, 32, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_vpunpck, 32, false, 32, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul32, 32, false, 32, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__avx_mul16, 8, false, 8, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__avx_mul16_add16, 8, false, 8, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__avx_mul32, 8, false, 8, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__avx2_mul32, 8, false, 8, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16_add16, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, 
xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul32, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_add16_vpunpck, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_vpmovsx, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_vpunpck, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul32, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_add16_vpunpck, 32, false, 32, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpmovsx, 32, false, 32, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpunpck, 32, false, 32, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul32, 32, false, 32, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) 
+XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32, 32, false, 32, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx512skx_mul32, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32, 32, false, 32, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16, 8, false, 8, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16_add16, 8, false, 8, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16_add16, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16, 8, false, 8, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, 
xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16_add16, 8, false, 8, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16_add16, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__wasm_fmagic, 1, false, 1, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic, 2, false, 2, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__wasm_fmagic, 4, false, 4, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__wasm_fmagic, 1, false, 1, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic, 2, false, 2, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__wasm_fmagic, 4, false, 4, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic, 1, false, 1, 9, int8_t, 
void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_imagic, 1, false, 1, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_lrintf, 1, false, 1, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_fmagic, 2, false, 2, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic, 2, false, 2, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf, 2, false, 2, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_fmagic, 4, false, 4, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_imagic, 4, false, 4, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_lrintf, 4, false, 4, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic, 1, false, 1, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic, 1, false, 1, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, 
xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_lrintf, 1, false, 1, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_fmagic, 2, false, 2, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_imagic, 2, false, 2, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf, 2, false, 2, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_fmagic, 4, false, 4, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_imagic, 4, false, 4, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_lrintf, 4, false, 4, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) + diff --git a/src/qs8-dwconv/qs8-dwconv-minmax-unipass-rndnu.h b/src/qs8-dwconv/qs8-dwconv-minmax-unipass-rndnu.h new file mode 100644 index 00000000000..96a1387fe45 --- /dev/null +++ b/src/qs8-dwconv/qs8-dwconv-minmax-unipass-rndnu.h @@ -0,0 +1,33 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +// Arguments are: +// XNN_DWCONV_UNIPASS(arch, name, c_block, pipelined, cr, kr, datatype, weights_type,params_type, init_fn) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mla8_ld64, 8, false, 8, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mul8_ld64, 8, false, 8, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mul16, 8, false, 8, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mla8_ld64, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mla8_ld128, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8_ld64, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8_ld128, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul16, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_9p32c__neon_mul16, 32, false, 32, 9, int8_t, void, union xnn_qs8_conv_minmax_params, 
xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mla8_ld64, 8, false, 8, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul8_ld64, 8, false, 8, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul16, 8, false, 8, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mla8_ld64, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mla8_ld128, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul8_ld64, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul8_ld128, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul16, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_dwconv_minmax_rndnu_ukernel_25p32c__neon_mul16, 32, false, 32, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_neon_params) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +XNN_DWCONV_UNIPASS(0, 
xnn_qs8_dwconv_minmax_rndnu_ukernel_9p1c__scalar, 1, false, 1, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_rndnu_ukernel_9p2c__scalar, 2, false, 2, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_dwconv_minmax_rndnu_ukernel_9p4c__scalar, 4, false, 4, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_rndnu_scalar_params) + diff --git a/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h b/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h new file mode 100644 index 00000000000..2787de0b4a4 --- /dev/null +++ b/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h @@ -0,0 +1,192 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +// Arguments are: +// XNN_DWCONV_UNIPASS(arch, name, first_pass_tile, middle_pass_tile, last_pass_tile, channel_tile, channel_subtile, channel_round, datatype, weights_type, buffer_type,params_type, init_fn) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neon_mla8_ld64, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neon_mul8_ld64, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neon_mul16, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neonv8_mla8_ld64, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neonv8_mul8_ld64, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neonv8_mul16, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mla8_ld64, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mla8_ld128, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mul8_ld64, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mul8_ld128, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mul16, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mla8_ld64, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mla8_ld128, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mul8_ld64, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mul8_ld128, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mul16, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neon_mul16, 5, 5, 5, 32, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neonv8_mul16, 5, 5, 5, 32, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neon_mla8_ld64, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neon_mul8_ld64, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neon_mul16, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neonv8_mla8_ld64, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neonv8_mul8_ld64, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neonv8_mul16, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mla8_ld64, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mla8_ld128, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mul8_ld64, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mul8_ld128, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mul16, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mla8_ld64, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mla8_ld128, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mul8_ld64, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mul8_ld128, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mul16, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neon_mul16, 6, 6, 7, 32, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neonv8_mul16, 6, 6, 7, 32, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neon_mla8_ld64, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neon_mul8_ld64, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neon_mul16, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neonv8_mla8_ld64, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neonv8_mul8_ld64, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neonv8_mul16, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mla8_ld64, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mla8_ld128, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mul8_ld64, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mul8_ld128, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mul16, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mla8_ld64, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mla8_ld128, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mul8_ld64, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mul8_ld128, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mul16, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neon_mul16, 8, 8, 9, 32, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neonv8_mul16, 8, 8, 9, 32, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__sse41_mul32, 5, 5, 5, 8, 4, 4, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16_add16, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16_add16, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__sse41_mul32, 5, 5, 5, 16, 4, 4, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16_add16, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16_add16, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__sse41_mul32, 6, 6, 7, 8, 4, 4, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16_add16, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union 
xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16_add16, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__sse41_mul32, 6, 6, 7, 16, 4, 4, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16_add16, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16_add16, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__sse41_mul32, 8, 8, 9, 8, 4, 4, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16_add16, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16_add16, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__sse41_mul32, 8, 8, 9, 16, 4, 4, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16_add16, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16_add16, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__avx_mul32, 5, 5, 5, 8, 4, 4, 
int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__avx2_mul32, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__avx_mul32, 5, 5, 5, 16, 4, 4, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__avx2_mul32, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_add16_vpunpck, 5, 5, 5, 16, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_vpmovsx, 5, 5, 5, 16, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_vpunpck, 5, 5, 5, 16, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__avx2_mul32, 5, 5, 5, 32, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_add16_vpunpck, 5, 5, 5, 32, 16, 16, int8_t, void, int32_t, union 
xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_vpmovsx, 5, 5, 5, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_vpunpck, 5, 5, 5, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__avx_mul32, 6, 6, 7, 8, 4, 4, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__avx2_mul32, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__avx_mul32, 6, 6, 7, 16, 4, 4, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__avx2_mul32, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_add16_vpunpck, 6, 6, 7, 16, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_vpmovsx, 6, 6, 7, 16, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_vpunpck, 6, 6, 7, 16, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__avx2_mul32, 6, 6, 7, 32, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_add16_vpunpck, 6, 6, 7, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_vpmovsx, 6, 6, 7, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_vpunpck, 6, 6, 7, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__avx_mul32, 8, 8, 9, 8, 4, 4, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__avx2_mul32, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__avx_mul32, 8, 8, 9, 16, 4, 4, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__avx2_mul32, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_add16_vpunpck, 8, 8, 9, 16, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_vpmovsx, 8, 8, 9, 16, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_vpunpck, 8, 8, 9, 16, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__avx2_mul32, 8, 8, 9, 32, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_add16_vpunpck, 8, 8, 9, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpmovsx, 8, 8, 9, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpunpck, 8, 8, 9, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c16s1r__avx512skx_mul32, 5, 5, 5, 16, 16, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c16s1r__avx512skx_mul32, 5, 5, 5, 32, 16, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c16s1r__avx512skx_mul32, 6, 6, 7, 16, 16, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c16s1r__avx512skx_mul32, 6, 6, 7, 32, 16, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c16s1r__avx512skx_mul32, 8, 8, 9, 16, 16, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s1r__avx512skx_mul32, 8, 8, 9, 32, 16, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16_add16, 5, 5, 5, 8, 8, 8, 
int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16_add16, 5, 5, 5, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16_add16, 6, 6, 7, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16_add16, 6, 6, 7, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16_add16, 8, 8, 9, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16_add16, 8, 8, 9, 16, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__wasm_fmagic, 5, 5, 5, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__wasm_fmagic, 5, 5, 5, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__wasm_fmagic, 5, 5, 5, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__wasm_fmagic, 6, 6, 7, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__wasm_fmagic, 6, 6, 7, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__wasm_fmagic, 6, 6, 7, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__wasm_fmagic, 8, 8, 9, 1, 
1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__wasm_fmagic, 8, 8, 9, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__wasm_fmagic, 8, 8, 9, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_fmagic, 5, 5, 5, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_imagic, 5, 5, 5, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_lrintf, 5, 5, 5, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_fmagic, 5, 5, 5, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_imagic, 5, 5, 5, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_lrintf, 5, 5, 5, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) 
+XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_fmagic, 5, 5, 5, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_imagic, 5, 5, 5, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_lrintf, 5, 5, 5, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_fmagic, 6, 6, 7, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_imagic, 6, 6, 7, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_lrintf, 6, 6, 7, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_fmagic, 6, 6, 7, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_imagic, 6, 6, 7, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_lrintf, 6, 6, 7, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_fmagic, 6, 6, 7, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_imagic, 6, 6, 7, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_lrintf, 6, 6, 7, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_fmagic, 8, 8, 9, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_imagic, 8, 8, 9, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_lrintf, 8, 8, 9, 1, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_fmagic, 8, 8, 9, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_imagic, 8, 8, 9, 2, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_lrintf, 8, 8, 9, 2, 1, 1, int8_t, void, int32_t, union 
xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_fmagic, 8, 8, 9, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_imagic, 8, 8, 9, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_lrintf, 8, 8, 9, 4, 1, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) + diff --git a/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h b/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h new file mode 100644 index 00000000000..5281e1b2243 --- /dev/null +++ b/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h @@ -0,0 +1,172 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +// Arguments are: +// XNN_DWCONV_UNIPASS(arch, name, c_block, pipelined, cr, kr, datatype, weights_type,params_type, init_fn) + +#if XNN_ARCH_ARM +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__asm_aarch32_neonv8_mla8_cortex_a35, 8, false, 8, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +#endif // XNN_ARCH_ARM + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__neon_mla8_ld64, 8, false, 8, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__neonv8_mla8_ld64, 8, false, 8, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__asm_aarch32_neonv8_mla8_cortex_a35, 16, false, 16, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +#endif // XNN_ARCH_ARM + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neon_mla8_ld64, 16, false, 16, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neon_mla8_ld128, 16, false, 16, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neonv8_mla8_ld64, 16, false, 16, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neonv8_mla8_ld128, 16, false, 16, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_4p8c__neon_mla8_ld64, 8, false, 8, 4, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neon_mla8_ld64, 8, false, 8, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neon_mul8_ld64, 8, false, 8, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neon_mul16, 8, false, 8, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neonv8_mla8_ld64, 8, false, 8, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neonv8_mul8_ld64, 8, false, 8, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neonv8_mul16, 8, false, 8, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mla8_ld64, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mla8_ld128, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mul8_ld64, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mul8_ld128, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mul16, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mla8_ld64, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mla8_ld128, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mul8_ld64, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mul8_ld128, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mul16, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) 
+XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__neon_mul16, 32, false, 32, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__neonv8_mul16, 32, false, 32, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neon_mla8_ld64, 8, false, 8, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neon_mul8_ld64, 8, false, 8, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neon_mul16, 8, false, 8, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mla8_ld64, 8, false, 8, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mul8_ld64, 8, false, 8, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mul16, 8, false, 8, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mla8_ld64, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) 
+XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mla8_ld128, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mul8_ld64, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mul8_ld128, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mul16, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mla8_ld64, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mla8_ld128, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mul8_ld64, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mul8_ld128, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mul16, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, 
xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__neon_mul16, 32, false, 32, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__neonv8_mul16, 32, false, 32, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__sse2_mul16, 8, false, 8, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__sse41_mul16, 8, false, 8, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16, 8, false, 8, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16_add16, 8, false, 8, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16, 8, false, 8, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16_add16, 8, false, 8, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul32, 8, false, 8, 9, int8_t, void, union 
xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16_add16, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16_add16, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul32, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16, 8, false, 8, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16_add16, 8, false, 8, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16, 8, false, 8, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16_add16, 8, false, 8, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul32, 8, false, 8, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16_add16, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16_add16, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul32, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__avx_mul16_add16, 16, false, 16, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__avx2_mul32, 16, false, 16, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16, 8, false, 8, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16_add16, 8, false, 8, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__avx_mul32, 8, false, 8, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__avx2_mul32, 8, false, 8, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16_add16, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx_mul32, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_add16_vpunpck, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_vpmovsx, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_vpunpck, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_add16_vpunpck, 32, false, 32, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_vpmovsx, 32, false, 32, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_vpunpck, 32, false, 32, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul32, 32, false, 32, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__avx_mul16, 8, false, 8, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__avx_mul16_add16, 8, false, 8, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__avx_mul32, 8, false, 8, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__avx2_mul32, 8, false, 8, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16_add16, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx_mul32, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_add16_vpunpck, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_vpmovsx, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_vpunpck, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul32, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_add16_vpunpck, 32, false, 32, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpmovsx, 32, false, 32, 25, int8_t, 
void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpunpck, 32, false, 32, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul32, 32, false, 32, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p32c__avx512skx_mul32, 32, false, 32, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32, 32, false, 32, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx512skx_mul32, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32, 32, false, 32, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__wasmsimd_mul16_add16, 16, false, 16, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16, 8, false, 8, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16_add16, 8, false, 8, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16_add16, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16, 8, false, 8, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16_add16, 8, false, 8, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16_add16, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__wasm_fmagic, 2, false, 2, 3, int8_t, void, union 
xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__wasm_fmagic, 1, false, 1, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic, 2, false, 2, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__wasm_fmagic, 4, false, 4, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__wasm_fmagic, 1, false, 1, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic, 2, false, 2, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__wasm_fmagic, 4, false, 4, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p1c__scalar_fmagic, 1, false, 1, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__scalar_imagic, 2, false, 2, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__scalar_lrintf, 2, false, 2, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) 
+XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_4p2c__scalar_imagic, 2, false, 2, 4, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic, 1, false, 1, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__scalar_imagic, 1, false, 1, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__scalar_lrintf, 1, false, 1, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_fmagic, 2, false, 2, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic, 2, false, 2, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf, 2, false, 2, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__scalar_fmagic, 4, false, 4, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__scalar_imagic, 4, false, 4, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__scalar_lrintf, 4, false, 4, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic, 1, false, 1, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic, 1, false, 1, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_lrintf, 1, false, 1, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_fmagic, 2, false, 2, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_imagic, 2, false, 2, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf, 2, false, 2, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__scalar_fmagic, 4, false, 4, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__scalar_imagic, 4, false, 4, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__scalar_lrintf, 4, false, 4, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) + diff --git a/src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h 
b/src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h new file mode 100644 index 00000000000..667780c8ef7 --- /dev/null +++ b/src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h @@ -0,0 +1,120 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +// Arguments are: +// XNN_DWCONV_UNIPASS(arch, name, first_pass_tile, middle_pass_tile, last_pass_tile, channel_tile, channel_subtile, channel_round, datatype, weights_type, buffer_type,params_type, init_fn) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neon_mul16, 5, 5, 5, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neonv8_mul16, 5, 5, 5, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mul16, 5, 5, 5, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mul16, 5, 5, 5, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neon_mul16, 5, 5, 5, 32, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neonv8_mul16, 5, 5, 5, 32, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neonv8_params) 
+XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neon_mul16, 6, 6, 7, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neonv8_mul16, 6, 6, 7, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mul16, 6, 6, 7, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mul16, 6, 6, 7, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neon_mul16, 6, 6, 7, 32, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neonv8_mul16, 6, 6, 7, 32, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neon_mul16, 8, 8, 9, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neonv8_mul16, 8, 8, 9, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mul16, 8, 8, 9, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, 
xnn_init_qu8_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mul16, 8, 8, 9, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neon_mul16, 8, 8, 9, 32, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon_v8, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neonv8_mul16, 8, 8, 9, 32, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neonv8_params) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__sse41_mul32, 5, 5, 5, 8, 4, 4, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16, 5, 5, 5, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16, 5, 5, 5, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__sse41_mul32, 5, 5, 5, 16, 4, 4, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16, 5, 5, 5, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, 
xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16, 5, 5, 5, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__sse41_mul32, 6, 6, 7, 8, 4, 4, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16, 6, 6, 7, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16, 6, 6, 7, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__sse41_mul32, 6, 6, 7, 16, 4, 4, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16, 6, 6, 7, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16, 6, 6, 7, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__sse41_mul32, 8, 8, 9, 8, 4, 4, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16, 8, 8, 9, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, 
xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16, 8, 8, 9, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__sse41_mul32, 8, 8, 9, 16, 4, 4, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16, 8, 8, 9, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_sse4_1, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16, 8, 8, 9, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__avx_mul32, 5, 5, 5, 8, 4, 4, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__avx2_mul32, 5, 5, 5, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__avx_mul32, 5, 5, 5, 16, 4, 4, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__avx2_mul32, 5, 5, 5, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__avx2_mul32, 5, 5, 5, 32, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) 
+XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__avx_mul32, 6, 6, 7, 8, 4, 4, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__avx2_mul32, 6, 6, 7, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__avx_mul32, 6, 6, 7, 16, 4, 4, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__avx2_mul32, 6, 6, 7, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__avx2_mul32, 6, 6, 7, 32, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__avx_mul32, 8, 8, 9, 8, 4, 4, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__avx2_mul32, 8, 8, 9, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__avx_mul32, 8, 8, 9, 16, 4, 4, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__avx2_mul32, 8, 8, 9, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, 
xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__avx2_mul32, 8, 8, 9, 32, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s1r__avx512skx_mul32, 5, 5, 5, 16, 16, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s1r__avx512skx_mul32, 5, 5, 5, 32, 16, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s1r__avx512skx_mul32, 6, 6, 7, 16, 16, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s1r__avx512skx_mul32, 6, 6, 7, 32, 16, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s1r__avx512skx_mul32, 8, 8, 9, 16, 16, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s1r__avx512skx_mul32, 8, 8, 9, 32, 16, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16, 5, 5, 5, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) 
+XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16, 5, 5, 5, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16, 6, 6, 7, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16, 6, 6, 7, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16, 8, 8, 9, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16, 8, 8, 9, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__wasm_fmagic, 5, 5, 5, 1, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__wasm_fmagic, 5, 5, 5, 2, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__wasm_fmagic, 5, 5, 5, 4, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__wasm_fmagic, 6, 6, 7, 1, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, 
xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__wasm_fmagic, 6, 6, 7, 2, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__wasm_fmagic, 6, 6, 7, 4, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__wasm_fmagic, 8, 8, 9, 1, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__wasm_fmagic, 8, 8, 9, 2, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__wasm_fmagic, 8, 8, 9, 4, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_fmagic, 5, 5, 5, 1, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_imagic, 5, 5, 5, 1, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_lrintf, 5, 5, 5, 1, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_fmagic, 5, 5, 5, 2, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, 
xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_imagic, 5, 5, 5, 2, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_lrintf, 5, 5, 5, 2, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_fmagic, 5, 5, 5, 4, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_imagic, 5, 5, 5, 4, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_lrintf, 5, 5, 5, 4, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_fmagic, 6, 6, 7, 1, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_imagic, 6, 6, 7, 1, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_lrintf, 6, 6, 7, 1, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_fmagic, 6, 6, 7, 2, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, 
xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_imagic, 6, 6, 7, 2, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_lrintf, 6, 6, 7, 2, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_fmagic, 6, 6, 7, 4, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_imagic, 6, 6, 7, 4, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_lrintf, 6, 6, 7, 4, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_fmagic, 8, 8, 9, 1, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_imagic, 8, 8, 9, 1, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_lrintf, 8, 8, 9, 1, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_fmagic, 8, 8, 9, 2, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_imagic, 8, 8, 9, 2, 1, 1, uint8_t, void, int32_t, 
union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_lrintf, 8, 8, 9, 2, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_fmagic, 8, 8, 9, 4, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_imagic, 8, 8, 9, 4, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_lrintf, 8, 8, 9, 4, 1, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) + diff --git a/src/qu8-dwconv/qu8-dwconv-minmax-multipass-rndnu.h b/src/qu8-dwconv/qu8-dwconv-minmax-multipass-rndnu.h new file mode 100644 index 00000000000..88c773f77bc --- /dev/null +++ b/src/qu8-dwconv/qu8-dwconv-minmax-multipass-rndnu.h @@ -0,0 +1,30 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +// Arguments are: +// XNN_DWCONV_UNIPASS(arch, name, first_pass_tile, middle_pass_tile, last_pass_tile, channel_tile, channel_subtile, channel_round, datatype, weights_type, buffer_type,params_type, init_fn) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mul8, 5, 5, 5, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mul16, 5, 5, 5, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul8, 5, 5, 5, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul16, 5, 5, 5, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l32c8s8r__neon_mul8, 5, 5, 5, 32, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l32c8s8r__neon_mul16, 5, 5, 5, 32, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mul8, 6, 6, 7, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mul16, 6, 6, 7, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, 
xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul8, 6, 6, 7, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul16, 6, 6, 7, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l32c8s8r__neon_mul8, 6, 6, 7, 32, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l32c8s8r__neon_mul16, 6, 6, 7, 32, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mul8, 8, 8, 9, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mul16, 8, 8, 9, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul8, 8, 8, 9, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul16, 8, 8, 9, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l32c8s8r__neon_mul8, 8, 8, 9, 32, 8, 8, uint8_t, void, int32_t, union 
xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_MULTIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l32c8s8r__neon_mul16, 8, 8, 9, 32, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + + diff --git a/src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h b/src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h new file mode 100644 index 00000000000..0a0cd7e224c --- /dev/null +++ b/src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h @@ -0,0 +1,91 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +// Arguments are: +// XNN_DWCONV_UNIPASS(arch, name, c_block, pipelined, cr, kr, datatype, weights_type,params_type, init_fn) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__neon_mul16, 8, false, 8, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__neonv8_mul16, 8, false, 8, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__neon_mul16, 16, false, 16, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mul16, 16, false, 16, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__neon_mul16, 32, false, 32, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, 
xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__neonv8_mul16, 32, false, 32, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__neon_mul16, 8, false, 8, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mul16, 8, false, 8, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__neon_mul16, 16, false, 16, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mul16, 16, false, 16, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neonv8_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__neon_mul16, 32, false, 32, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon_v8, xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__neonv8_mul16, 32, false, 32, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_neonv8_params) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16, 8, false, 8, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16, 8, false, 8, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul32, 8, false, 8, 9, 
uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16, 16, false, 16, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16, 16, false, 16, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul32, 16, false, 16, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16, 8, false, 8, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16, 8, false, 8, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul32, 8, false, 8, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16, 16, false, 16, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16, 16, false, 16, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_sse4_1, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul32, 16, false, 16, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, 
xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16, 8, false, 8, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul32, 8, false, 8, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__avx2_mul32, 8, false, 8, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16, 16, false, 16, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul32, 16, false, 16, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32, 16, false, 16, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul32, 32, false, 32, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__avx_mul16, 8, false, 8, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__avx_mul32, 8, false, 8, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__avx2_mul32, 8, false, 8, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) 
+XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16, 16, false, 16, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul32, 16, false, 16, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul32, 16, false, 16, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul32, 32, false, 32, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32, 16, false, 16, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32, 32, false, 32, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx512skx_mul32, 16, false, 16, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32, 32, false, 32, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16, 8, false, 8, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, 
xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16, 16, false, 16, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16, 8, false, 8, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16, 16, false, 16, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__wasm_fmagic, 1, false, 1, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic, 2, false, 2, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__wasm_fmagic, 4, false, 4, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__wasm_fmagic, 1, false, 1, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic, 2, false, 2, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__wasm_fmagic, 4, false, 4, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic, 1, false, 1, 9, uint8_t, void, 
union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_imagic, 1, false, 1, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_lrintf, 1, false, 1, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_fmagic, 2, false, 2, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic, 2, false, 2, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf, 2, false, 2, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_fmagic, 4, false, 4, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_imagic, 4, false, 4, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_lrintf, 4, false, 4, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic, 1, false, 1, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic, 1, false, 1, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, 
xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_lrintf, 1, false, 1, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_fmagic, 2, false, 2, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_imagic, 2, false, 2, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf, 2, false, 2, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_fmagic, 4, false, 4, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_imagic, 4, false, 4, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_lrintf, 4, false, 4, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) + diff --git a/src/qu8-dwconv/qu8-dwconv-minmax-unipass-rndnu.h b/src/qu8-dwconv/qu8-dwconv-minmax-unipass-rndnu.h new file mode 100644 index 00000000000..f7d1a623023 --- /dev/null +++ b/src/qu8-dwconv/qu8-dwconv-minmax-unipass-rndnu.h @@ -0,0 +1,27 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +// Arguments are: +// XNN_DWCONV_UNIPASS(arch, name, c_block, pipelined, cr, kr, datatype, weights_type,params_type, init_fn) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mul8, 8, false, 8, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mul16, 8, false, 8, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8, 16, false, 16, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul16, 16, false, 16, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_9p32c__neon_mul8, 32, false, 32, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_9p32c__neon_mul16, 32, false, 32, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul8, 8, false, 8, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul16, 8, false, 8, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul8, 16, false, 16, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, 
xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul16, 16, false, 16, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_25p32c__neon_mul8, 32, false, 32, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +XNN_DWCONV_UNIPASS(xnn_arch_arm_neon, xnn_qu8_dwconv_minmax_rndnu_ukernel_25p32c__neon_mul16, 32, false, 32, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_neon_params) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_rndnu_ukernel_9p1c__scalar, 1, false, 1, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_rndnu_ukernel_9p2c__scalar, 2, false, 2, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_scalar_params) +XNN_DWCONV_UNIPASS(0, xnn_qu8_dwconv_minmax_rndnu_ukernel_9p4c__scalar, 4, false, 4, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_rndnu_scalar_params) + diff --git a/src/xnnpack/dwconv.h b/src/xnnpack/dwconv.h index 6d96286d6ec..a4c23e345cb 100644 --- a/src/xnnpack/dwconv.h +++ b/src/xnnpack/dwconv.h @@ -19,1712 +19,52 @@ extern "C" { #endif -#define DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(fn_name) \ +#define XNN_DWCONV_UNIPASS(arch_flags, fn_name, c_block, adj_c_block, cr, kr, datatype, weights_type, params_type, init_fn) \ XNN_INTERNAL void fn_name( \ size_t channels, \ size_t output_width, \ - const float** input, \ - const float* weights, \ - float* output, \ + const datatype** input, \ + const weights_type* weights, \ + datatype* output, \ intptr_t input_stride, \ size_t output_increment, \ size_t input_offset, \ - const float* zero, \ - const struct 
xnn_f32_default_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -#define DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t channels, \ - size_t output_width, \ - const float** input, \ - const float* weights, \ - float* output, \ - intptr_t input_stride, \ - size_t output_increment, \ - size_t input_offset, \ - const float* zero, \ - const union xnn_f32_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -#define DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(fn_name) \ + const datatype* zero, \ + const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); +#include "src/f16-dwconv/f16-dwconv-minmax-unipass.h" +#include "src/f32-dwconv/f32-dwconv-minmax-unipass.h" +#include "src/f32-dwconv/f32-dwconv-unipass.h" +#include "src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h" +#include "src/qs8-dwconv/qs8-dwconv-minmax-unipass-rndnu.h" +#include "src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h" +#include "src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h" +#include "src/qu8-dwconv/qu8-dwconv-minmax-unipass-rndnu.h" +#undef XNN_DWCONV_UNIPASS + + +#define XNN_DWCONV_MULTIPASS(arch_flags, fn_name, first_pass_tile, middle_pass_tile, last_pass_tile, channel_tile, channel_subtile, channel_round, datatype, weights_type, buffer_type, params_type, init_fn) \ XNN_INTERNAL void fn_name( \ size_t channels, \ size_t output_width, \ - const float** input, \ - const float* weights, \ - float* output, \ + const datatype** input, \ + const weights_type* weights, \ + datatype* output, \ intptr_t input_stride, \ size_t output_increment, \ size_t input_offset, \ - const float* zero, \ + const datatype* zero, \ size_t kernel_size, \ - float* buffer, \ - const struct xnn_f32_default_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -#define DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t channels, \ - size_t output_width, \ - const float** input, \ - const float* 
weights, \ - float* output, \ - intptr_t input_stride, \ - size_t output_increment, \ - size_t input_offset, \ - const float* zero, \ - size_t kernel_size, \ - float* buffer, \ - const union xnn_f32_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__sse) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__sse_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__sse) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__sse_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l16c4s4r__sse) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l16c4s4r__sse_acc2) - -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__sse) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__sse_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__sse) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__sse_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l16c4s4r__sse) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l16c4s4r__sse_acc2) - -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_7f6m6l8c4s4r__sse_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__sse) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__sse_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__sse) 
-DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__sse_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l16c4s4r__sse) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l16c4s4r__sse_acc2) - -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l8c8s4r__avx) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l8c8s4r__avx_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l16c8s4r__avx) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l16c8s4r__avx_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l8c8s4r__avx) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l8c8s4r__avx_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l16c8s4r__avx) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l16c8s4r__avx_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l8c8s4r__avx) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l8c8s4r__avx_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l16c8s4r__avx) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l16c8s4r__avx_acc2) - -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3) 
-DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_7f6m6l8c8s4r__fma3) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_7f6m6l8c8s4r__fma3_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_7f6m6l16c8s4r__fma3) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_7f6m6l16c8s4r__fma3_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_7f6m6l32c8s4r__fma3) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_7f6m6l32c8s4r__fma3_acc2) - -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f_acc2) - -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__neonfma) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__neonfma_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neonfma) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neonfma_acc2) - -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__neonfma) 
-DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__neonfma_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__neonfma) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__neonfma_acc2) - -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neonfma) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neonfma_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__neonfma) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__neonfma_acc2) - -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__neon) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__neon_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neon) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neon_acc2) - -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__neon) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__neon_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__neon) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__neon_acc2) - -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neon) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neon_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__neon) 
-DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__neon_acc2) - -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3f3m3l1c1s1r__wasm) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3f3m3l1c1s1r__wasm_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__wasm) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__wasm_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l1c1s1r__wasm) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l1c1s1r__wasm_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l1c1s1r__wasm) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l1c1s1r__wasm_acc2) - -DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_3f3m3l4c4s4r__wasmsimd) -DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_3f3m3l4c4s4r__wasmsimd_acc2) -DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_3f3m3l8c4s4r__wasmsimd) -DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_3f3m3l8c4s4r__wasmsimd_acc2) - -DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmsimd) -DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmsimd_acc2) - -DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma) -DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma_acc2) - -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3f3m3l4c4s4r__wasmsimd_arm) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3f3m3l4c4s4r__wasmsimd_arm_acc2) 
-DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3f3m3l8c4s4r__wasmsimd_arm) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3f3m3l8c4s4r__wasmsimd_arm_acc2) - -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_arm) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_arm_acc2) - -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3f3m3l4c4s4r__wasmsimd_x86) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3f3m3l4c4s4r__wasmsimd_x86_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3f3m3l8c4s4r__wasmsimd_x86) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3f3m3l8c4s4r__wasmsimd_x86_acc2) - -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_x86) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_x86_acc2) - -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma_acc2) - -DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_2f2m2l1c1s1r__scalar) -DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_2f2m2l1c1s1r__scalar_acc2) -DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_2f2m2l4c1s1r__scalar) 
-DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_2f2m2l4c1s1r__scalar_acc2) -DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_3f3m3l1c1s1r__scalar) -DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_3f3m3l1c1s1r__scalar_acc2) -DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_5f5m5l1c1s1r__scalar) -DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_5f5m5l1c1s1r__scalar_acc2) -DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_6f6m7l1c1s1r__scalar) -DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_6f6m7l1c1s1r__scalar_acc2) -DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_8f8m9l1c1s1r__scalar) -DECLARE_F32_DWCONV_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_8f8m9l1c1s1r__scalar_acc2) - -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_2f2m2l1c1s1r__scalar) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_2f2m2l1c1s1r__scalar_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_2f2m2l4c1s1r__scalar) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_2f2m2l4c1s1r__scalar_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__scalar) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__scalar_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l1c1s1r__scalar) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_6f6m7l1c1s1r__scalar_acc2) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l1c1s1r__scalar) -DECLARE_F32_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_8f8m9l1c1s1r__scalar_acc2) - 
-DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p4c__neon) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p4c__neon_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p4c__neonfma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p4c__neonfma_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p4c__sse) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p4c__sse_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p8c__avx) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p8c__avx_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p8c__fma3) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p8c__fma3_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p8c__neon) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p8c__neon_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p8c__neonfma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p8c__neonfma_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p8c__sse) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p8c__sse_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p16c__avx) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p16c__avx_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p16c__avx512f) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p16c__avx512f_acc2) 
-DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p16c__fma3) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p16c__fma3_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p16c__neon) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p16c__neon_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p16c__neonfma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p16c__neonfma_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p32c__avx512f) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p32c__avx512f_acc2) - -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p4c__neon) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p4c__neon_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p4c__neonfma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p4c__neonfma_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p4c__sse) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p4c__sse_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p8c__avx) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p8c__avx_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p8c__fma3) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p8c__fma3_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p8c__neon) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p8c__neon_acc2) 
-DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p8c__neonfma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p8c__neonfma_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p8c__sse) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p8c__sse_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p16c__avx) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p16c__avx_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p16c__avx512f) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p16c__avx512f_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p16c__fma3) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p16c__fma3_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p16c__neon) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p16c__neon_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p16c__neonfma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p16c__neonfma_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p32c__avx512f) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p32c__avx512f_acc2) - -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p4c__asm_aarch64_neonfma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p4c__asm_aarch64_neonfma_cortex_a55) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p4c__neon) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p4c__neon_acc2) 
-DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p4c__neonfma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p4c__neonfma_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p4c__sse) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p4c__sse_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p8c__avx) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p8c__avx_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p8c__fma3) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p8c__fma3_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p8c__neon) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p8c__neon_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p8c__neonfma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p8c__neonfma_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p8c__sse) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p8c__sse_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p16c__avx) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p16c__avx_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p16c__avx512f) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p16c__avx512f_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p16c__fma3) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p16c__fma3_acc2) 
-DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p16c__neon) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p16c__neon_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p16c__neonfma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p16c__neonfma_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p32c__avx512f) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p32c__avx512f_acc2) - -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p4c__neon) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p4c__neon_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p4c__neonfma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p4c__neonfma_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p4c__sse) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p4c__sse_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p8c__avx) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p8c__avx_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p8c__fma3) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p8c__fma3_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p8c__neon) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p8c__neon_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p8c__neonfma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p8c__neonfma_acc2) 
-DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p8c__sse) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p8c__sse_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p16c__avx) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p16c__avx_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p16c__fma3) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p16c__fma3_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p16c__neon) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p16c__neon_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p16c__neonfma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p16c__neonfma_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p32c__avx512f) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p32c__avx512f_acc2) - -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_3p4c__wasmrelaxedsimd_fma) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_3p4c__wasmsimd) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_3p8c__wasmrelaxedsimd_fma) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_3p8c__wasmsimd) - -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p4c__wasmrelaxedsimd) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p4c__wasmrelaxedsimd_acc2) 
-DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p4c__wasmrelaxedsimd_fma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p4c__wasmrelaxedsimd_fma_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p4c__wasmsimd_arm) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p4c__wasmsimd_arm_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p4c__wasmsimd_x86) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p4c__wasmsimd_x86_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p8c__wasmrelaxedsimd) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p8c__wasmrelaxedsimd_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p8c__wasmrelaxedsimd_fma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p8c__wasmrelaxedsimd_fma_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p8c__wasmsimd_arm) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p8c__wasmsimd_arm_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p8c__wasmsimd_x86) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p8c__wasmsimd_x86_acc2) - -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_4p4c__wasmrelaxedsimd_fma) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_4p4c__wasmsimd) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_4p8c__wasmrelaxedsimd_fma) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_4p8c__wasmsimd) - -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p4c__wasmrelaxedsimd) 
-DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p4c__wasmrelaxedsimd_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p4c__wasmrelaxedsimd_fma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p4c__wasmrelaxedsimd_fma_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p4c__wasmsimd_arm) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p4c__wasmsimd_arm_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p4c__wasmsimd_x86) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p4c__wasmsimd_x86_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p8c__wasmrelaxedsimd) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p8c__wasmrelaxedsimd_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p8c__wasmrelaxedsimd_fma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p8c__wasmrelaxedsimd_fma_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p8c__wasmsimd_arm) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p8c__wasmsimd_arm_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p8c__wasmsimd_x86) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p8c__wasmsimd_x86_acc2) - -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_9p4c__wasmrelaxedsimd_fma) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_9p4c__wasmsimd) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_9p4c__wasmsimd_acc2) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_9p8c__wasmrelaxedsimd_fma) 
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_9p8c__wasmsimd) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_9p8c__wasmsimd_acc2) - -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p4c__wasmrelaxedsimd) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p4c__wasmrelaxedsimd_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p4c__wasmrelaxedsimd_fma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p4c__wasmrelaxedsimd_fma_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_arm) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_arm_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_x86) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_x86_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p8c__wasmrelaxedsimd) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p8c__wasmrelaxedsimd_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p8c__wasmrelaxedsimd_fma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p8c__wasmrelaxedsimd_fma_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_arm) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_arm_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_x86) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_x86_acc2) - -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_25p4c__wasmrelaxedsimd_fma) 
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_25p4c__wasmsimd) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_25p8c__wasmrelaxedsimd_fma) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_25p8c__wasmsimd) - -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p4c__wasmrelaxedsimd) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p4c__wasmrelaxedsimd_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p4c__wasmrelaxedsimd_fma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p4c__wasmrelaxedsimd_fma_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_arm) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_arm_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_x86) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_x86_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd_fma) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd_fma_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p8c__wasmsimd_arm) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p8c__wasmsimd_arm_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p8c__wasmsimd_x86) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p8c__wasmsimd_x86_acc2) - 
-DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p1c__wasm) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p1c__wasm_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p2c__wasm) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p2c__wasm_acc2) - -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p1c__wasm) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p1c__wasm_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p2c__wasm) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p2c__wasm_acc2) - -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p1c__wasm) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p1c__wasm_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p2c__wasm) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p2c__wasm_acc2) - -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p1c__wasm) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p1c__wasm_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p2c__wasm) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p2c__wasm_acc2) - -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_3p1c__scalar) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_3p1c__scalar_acc2) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_3p2c__scalar) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_3p2c__scalar_acc2) - -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p1c__scalar) 
-DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p1c__scalar_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p2c__scalar) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_3p2c__scalar_acc2) - -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_4p1c__scalar) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_4p1c__scalar_acc2) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_4p2c__scalar) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_4p2c__scalar_acc2) - -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p1c__scalar) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p1c__scalar_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p2c__scalar) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_4p2c__scalar_acc2) - -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_9p1c__scalar) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_9p1c__scalar_acc2) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_9p2c__scalar) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_9p2c__scalar_acc2) - -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p1c__scalar) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p1c__scalar_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p2c__scalar) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_9p2c__scalar_acc2) - -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_25p1c__scalar) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_25p1c__scalar_acc2) 
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_25p2c__scalar) -DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_25p2c__scalar_acc2) - -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p1c__scalar) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p1c__scalar_acc2) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p2c__scalar) -DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_25p2c__scalar_acc2) - - -#define DECLARE_F16_DWCONV_UNIPASS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t channels, \ - size_t output_width, \ - const xnn_float16** input, \ - const xnn_float16* weights, \ - xnn_float16* output, \ - intptr_t input_stride, \ - size_t output_increment, \ - size_t input_offset, \ - const xnn_float16* zero, \ - const struct xnn_f16_default_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -#define DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t channels, \ - size_t output_width, \ - const xnn_float16** input, \ - const xnn_float16* weights, \ - xnn_float16* output, \ - intptr_t input_stride, \ - size_t output_increment, \ - size_t input_offset, \ - const xnn_float16* zero, \ - const union xnn_f16_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -#define DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t channels, \ - size_t output_width, \ - const xnn_float16** input, \ - const xnn_float16* weights, \ - xnn_float16* output, \ - intptr_t input_stride, \ - size_t output_increment, \ - size_t input_offset, \ - const xnn_float16* zero, \ - size_t kernel_size, \ - xnn_float16* buffer, \ - const union xnn_f16_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - 
-DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith_acc2) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith_acc2) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith_acc2) - -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith_acc2) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith_acc2) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith_acc2) - -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith_acc2) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith_acc2) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith) 
-DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith_acc2) - -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3_acc2) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3_acc2) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3_acc2) - -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3_acc2) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3_acc2) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3_acc2) - -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3_acc2) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3_acc2) -DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3) 
-DECLARE_F16_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3_acc2) - -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_3p8c__neonfp16arith) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_3p8c__neonfp16arith_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_3p16c__neonfp16arith) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_3p16c__neonfp16arith_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_3p32c__neonfp16arith) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_3p32c__neonfp16arith_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith) 
-DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith_acc2) - -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_3p8c__fma3) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_3p8c__fma3_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_3p16c__fma3) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_3p16c__fma3_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_3p32c__fma3) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_3p32c__fma3_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_4p8c__fma3) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_4p8c__fma3_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_4p16c__fma3) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_4p16c__fma3_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_4p32c__fma3) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_4p32c__fma3_acc2) 
-DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_9p8c__fma3) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_9p8c__fma3_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_9p16c__fma3) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_9p16c__fma3_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_9p32c__fma3) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_9p32c__fma3_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_25p8c__fma3) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_25p8c__fma3_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_25p16c__fma3) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_25p16c__fma3_acc2) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_25p32c__fma3) -DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_25p32c__fma3_acc2) - - -#define DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t channels, \ - size_t output_width, \ - const uint8_t** input, \ - const void* weights, \ - uint8_t* output, \ - intptr_t input_stride, \ - size_t output_increment, \ - size_t input_offset, \ - const uint8_t* zero, \ - const union xnn_qu8_conv_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -#define DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t channels, \ - size_t output_width, \ - const uint8_t** input, \ - const void* weights, \ - uint8_t* output, \ - intptr_t input_stride, \ - size_t output_increment, \ - size_t input_offset, \ - const uint8_t* zero, \ - size_t kernel_size, \ - int32_t* buffer, \ - const union 
xnn_qu8_conv_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__neon_mul16) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__neonv8_mul16) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mul16) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__neonv8_mul16) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_9p32c__neon_mul16) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mul8) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_9p32c__neon_mul8) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16) - 
-DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__avx_mul16) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul32) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul32) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul32) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul32) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__avx2_mul32) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul32) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__wasm_fmagic) 
-DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__wasm_fmagic) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_9p1c__scalar) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_9p2c__scalar) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_9p4c__scalar) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_fmagic) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_fmagic) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_imagic) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_imagic) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_lrintf) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_lrintf) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__neon_mul16) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mul16) 
-DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mul16) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__neonv8_mul16) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_25p32c__neon_mul16) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul8) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul8) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_25p32c__neon_mul8) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul32) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul32) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__avx_mul32) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul32) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__avx2_mul32) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul32) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul32) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx512skx_mul32) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16) 
-DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__wasm_fmagic) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__wasm_fmagic) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_fmagic) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_fmagic) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_imagic) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_imagic) - -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_lrintf) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf) -DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_lrintf) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neonv8_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mul16) 
-DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neonv8_mul16) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neonv8_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neonv8_mul16) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neonv8_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neonv8_mul16) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mul8) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul8) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l32c8s8r__neon_mul8) - 
-DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mul8) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul8) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l32c8s8r__neon_mul8) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mul8) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul8) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l32c8s8r__neon_mul8) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l32c8s8r__neon_mul16) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l32c8s8r__neon_mul16) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l32c8s8r__neon_mul16) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16) 
-DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__sse41_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__sse41_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__sse41_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__sse41_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__sse41_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__sse41_mul32) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__avx_mul32) 
-DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__avx_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__avx_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__avx_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__avx_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__avx_mul32) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__avx2_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__avx2_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__avx2_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__avx2_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__avx2_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__avx2_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__avx2_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__avx2_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__avx2_mul32) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s1r__avx512skx_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s1r__avx512skx_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s1r__avx512skx_mul32) 
-DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s1r__avx512skx_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s1r__avx512skx_mul32) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s1r__avx512skx_mul32) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_fmagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_imagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_lrintf) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_fmagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_imagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_lrintf) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_fmagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_imagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_lrintf) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_fmagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_imagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_lrintf) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_fmagic) 
-DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_imagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_lrintf) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_fmagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_imagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_lrintf) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_fmagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_imagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_lrintf) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_fmagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_imagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_lrintf) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_fmagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_imagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_lrintf) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__wasm_fmagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__wasm_fmagic) 
-DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__wasm_fmagic) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__wasm_fmagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__wasm_fmagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__wasm_fmagic) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__wasm_fmagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__wasm_fmagic) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__wasm_fmagic) - -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16) -DECLARE_QU8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16) - -#define DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t channels, \ - size_t output_width, \ - const int8_t** input, \ - const void* weights, \ - int8_t* output, \ - intptr_t input_stride, \ - size_t output_increment, \ - size_t input_offset, \ - const int8_t* zero, \ - const union xnn_qs8_conv_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -#define 
DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t channels, \ - size_t output_width, \ - const int8_t** input, \ - const void* weights, \ - int8_t* output, \ - intptr_t input_stride, \ - size_t output_increment, \ - size_t input_offset, \ - const int8_t* zero, \ - size_t kernel_size, \ - int32_t* buffer, \ - const union xnn_qs8_conv_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mul8_ld64) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8_ld64) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8_ld128) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mla8_ld64) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mla8_ld64) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mla8_ld128) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul8_ld64) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul8_ld64) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul8_ld128) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mla8_ld64) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mla8_ld64) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mla8_ld128) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__neon_mul16) 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__neon_mul16) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__neonv8_mul16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mul16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__neonv8_mul16) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p32c__neon_mul16) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16_add16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16_add16) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16_add16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16_add16) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16) - 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16_add16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16_add16) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_vpmovsx) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_vpmovsx) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_vpunpck) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_vpunpck) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_add16_vpunpck) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_add16_vpunpck) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul32) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul32) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul32) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul32) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__avx2_mul32) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul32) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32) - 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16_add16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16_add16) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__wasm_fmagic) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__wasm_fmagic) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p1c__scalar) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p2c__scalar) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p4c__scalar) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_fmagic) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_fmagic) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_imagic) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_imagic) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_lrintf) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf) 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_lrintf) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__neon_mul16) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mul16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mul16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__neonv8_mul16) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_25p32c__neon_mul16) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16_add16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16_add16) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16_add16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16_add16) - 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__avx_mul16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__avx_mul16_add16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16_add16) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_vpmovsx) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpmovsx) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_vpunpck) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpunpck) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_add16_vpunpck) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_add16_vpunpck) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul32) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul32) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__avx_mul32) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul32) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__avx2_mul32) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul32) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul32) - 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx512skx_mul32) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16_add16) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16_add16) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__wasm_fmagic) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__wasm_fmagic) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_fmagic) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_fmagic) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_imagic) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_imagic) - -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_lrintf) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf) -DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_lrintf) - 
-DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neonv8_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neonv8_mul16) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neonv8_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neonv8_mul16) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neonv8_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neon_mul16) 
-DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neonv8_mul16) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mla8_ld64) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mul8_ld64) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mla8_ld64) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mla8_ld128) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul8_ld64) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul8_ld128) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mla8_ld64) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mul8_ld64) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mla8_ld64) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mla8_ld128) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul8_ld64) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul8_ld128) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mla8_ld64) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mul8_ld64) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mla8_ld64) 
-DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mla8_ld128) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul8_ld64) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul8_ld128) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l32c8s8r__neon_mul16) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l32c8s8r__neon_mul16) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l32c8s8r__neon_mul16) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16) 
-DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16_add16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16_add16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16_add16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16_add16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16_add16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16_add16) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16_add16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16_add16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16_add16) 
-DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16_add16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16_add16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16_add16) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__sse41_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__sse41_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__sse41_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__sse41_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__sse41_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__sse41_mul32) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__avx_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__avx_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__avx_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__avx_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__avx_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__avx_mul32) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_vpmovsx) 
-DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_vpmovsx) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_vpmovsx) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_vpmovsx) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_vpmovsx) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpmovsx) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_vpunpck) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_vpunpck) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_vpunpck) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_vpunpck) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_vpunpck) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpunpck) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_add16_vpunpck) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_add16_vpunpck) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_add16_vpunpck) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_add16_vpunpck) 
-DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_add16_vpunpck) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_add16_vpunpck) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__avx2_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__avx2_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__avx2_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__avx2_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__avx2_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__avx2_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__avx2_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__avx2_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__avx2_mul32) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s1r__avx512skx_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s1r__avx512skx_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s1r__avx512skx_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s1r__avx512skx_mul32) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s1r__avx512skx_mul32) 
-DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s1r__avx512skx_mul32) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_fmagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_imagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_lrintf) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_fmagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_imagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_lrintf) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_fmagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_imagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_lrintf) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_fmagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_imagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_lrintf) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_fmagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_imagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_lrintf) 
-DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_fmagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_imagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_lrintf) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_fmagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_imagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_lrintf) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_fmagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_imagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_lrintf) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_fmagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_imagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_lrintf) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__wasm_fmagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__wasm_fmagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__wasm_fmagic) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__wasm_fmagic) 
-DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__wasm_fmagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__wasm_fmagic) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__wasm_fmagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__wasm_fmagic) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__wasm_fmagic) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16_add16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16_add16) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16_add16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16_add16) - -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16_add16) -DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16) 
-DECLARE_QS8_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16_add16) - -#define DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t channels, \ - size_t output_width, \ - const int8_t** input, \ - const void* weights, \ - int8_t* output, \ - intptr_t input_stride, \ - size_t output_increment, \ - size_t input_offset, \ - const int8_t* zero, \ - const union xnn_qs8_qc8w_conv_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -#define DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t channels, \ - size_t output_width, \ - const int8_t** input, \ - const void* weights, \ - int8_t* output, \ - intptr_t input_stride, \ - size_t output_increment, \ - size_t input_offset, \ - const int8_t* zero, \ - size_t kernel_size, \ - int32_t* buffer, \ - const union xnn_qs8_qc8w_conv_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__asm_aarch32_neonv8_mla8_cortex_a35) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__asm_aarch32_neonv8_mla8_cortex_a35) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__neon_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neon_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neon_mla8_ld128) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__neonv8_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neonv8_mla8_ld64) 
-DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neonv8_mla8_ld128) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_4p8c__neon_mla8_ld64) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neon_mul8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mul8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mul8_ld128) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neon_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mla8_ld128) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neonv8_mul8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mul8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mul8_ld128) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neonv8_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mla8_ld128) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neon_mul8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mul8_ld64) 
-DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mul8_ld128) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neon_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mla8_ld128) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mul8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mul8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mul8_ld128) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mla8_ld128) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neon_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__neon_mul16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neonv8_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__neonv8_mul16) - 
-DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neon_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__neon_mul16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__neonv8_mul16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__sse2_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__sse41_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__avx_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__avx2_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p32c__avx512skx_mul32) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__wasmsimd_mul16_add16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p1c__scalar_fmagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__scalar_imagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__scalar_lrintf) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__wasm_fmagic) - 
-DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_4p2c__scalar_imagic) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16_add16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16_add16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16_add16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_vpmovsx) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_vpmovsx) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_vpunpck) 
-DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_vpunpck) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_add16_vpunpck) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_add16_vpunpck) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul32) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__avx_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx_mul32) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__avx2_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul32) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16_add16) - 
-DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__wasm_fmagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__wasm_fmagic) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_fmagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__scalar_fmagic) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__scalar_imagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__scalar_imagic) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__scalar_lrintf) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__scalar_lrintf) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16_add16) - 
-DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16_add16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__avx_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__avx_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16_add16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_vpmovsx) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpmovsx) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_vpunpck) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpunpck) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_add16_vpunpck) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_add16_vpunpck) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul32) - 
-DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__avx_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx_mul32) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__avx2_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul32) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx512skx_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16_add16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__wasm_fmagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__wasm_fmagic) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_fmagic) 
-DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__scalar_fmagic) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_imagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__scalar_imagic) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_lrintf) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf) -DECLARE_QS8_QC8W_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__scalar_lrintf) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neon_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neon_mul8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mla8_ld128) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mul8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mul8_ld128) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neon_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neon_mul8_ld64) 
-DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mla8_ld128) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mul8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mul8_ld128) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neon_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neon_mul8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mla8_ld128) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mul8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mul8_ld128) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neonv8_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neonv8_mul8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mla8_ld128) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mul8_ld64) 
-DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mul8_ld128) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neonv8_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neonv8_mul8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mla8_ld128) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mul8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mul8_ld128) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neonv8_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neonv8_mul8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mla8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mla8_ld128) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mul8_ld64) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mul8_ld128) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neon_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neonv8_mul16) 
-DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neon_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neonv8_mul16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neon_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neonv8_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neon_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neonv8_mul16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neon_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neonv8_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neon_mul16) 
-DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neonv8_mul16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16_add16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16) 
-DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16_add16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__sse41_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__sse41_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__sse41_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__sse41_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__sse41_mul32) 
-DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__sse41_mul32) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__avx_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__avx_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__avx_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__avx_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__avx_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__avx_mul32) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_vpmovsx) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_vpmovsx) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_vpmovsx) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_vpmovsx) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_vpmovsx) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpmovsx) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_vpunpck) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_vpunpck) 
-DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_vpunpck) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_vpunpck) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_vpunpck) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpunpck) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_add16_vpunpck) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_add16_vpunpck) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_add16_vpunpck) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_add16_vpunpck) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_add16_vpunpck) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_add16_vpunpck) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__avx2_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__avx2_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__avx2_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__avx2_mul32) 
-DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__avx2_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__avx2_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__avx2_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__avx2_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__avx2_mul32) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c16s1r__avx512skx_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c16s1r__avx512skx_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c16s1r__avx512skx_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c16s1r__avx512skx_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c16s1r__avx512skx_mul32) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s1r__avx512skx_mul32) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_fmagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_imagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_lrintf) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_fmagic) 
-DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_imagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_lrintf) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_fmagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_imagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_lrintf) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_fmagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_imagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_lrintf) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_fmagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_imagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_lrintf) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_fmagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_imagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_lrintf) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_fmagic) 
-DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_imagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_lrintf) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_fmagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_imagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_lrintf) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_fmagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_imagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_lrintf) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__wasm_fmagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__wasm_fmagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__wasm_fmagic) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__wasm_fmagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__wasm_fmagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__wasm_fmagic) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__wasm_fmagic) 
-DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__wasm_fmagic) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__wasm_fmagic) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16_add16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16_add16) - -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16_add16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16) -DECLARE_QS8_QC8W_DWCONV_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16_add16) + buffer_type* buffer, \ + const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); +#include "src/f16-dwconv/f16-dwconv-minmax-multipass.h" 
+#include "src/f32-dwconv/f32-dwconv-minmax-multipass.h" +#include "src/f32-dwconv/f32-dwconv-multipass.h" +#include "src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h" +#include "src/qs8-dwconv/qs8-dwconv-minmax-multipass-rndnu.h" +#include "src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h" +#include "src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h" +#include "src/qu8-dwconv/qu8-dwconv-minmax-multipass-rndnu.h" +#undef XNN_DWCONV_MULTIPASS #define DECLARE_F32_DWCONV2D_CHW_MINMAX_UKERNEL_FUNCTION(fn_name) \ XNN_INTERNAL void fn_name( \ diff --git a/test/dwconv-microkernel-tester.cc b/test/dwconv-microkernel-tester.cc index ee318d19e8f..d29e22e7572 100644 --- a/test/dwconv-microkernel-tester.cc +++ b/test/dwconv-microkernel-tester.cc @@ -1218,7 +1218,7 @@ void DWConvMicrokernelTester::Test( } void DWConvMicrokernelTester::Test( - xnn_f32_dwconv_unipass_ukernel_fn dwconv) const { + xnn_f32_dwconv_unipass_ukernel_fn dwconv, const void*) const { xnnpack::ReplicableRandomDevice rng; std::uniform_real_distribution f32dist; @@ -1390,7 +1390,7 @@ void DWConvMicrokernelTester::Test( } void DWConvMicrokernelTester::Test( - xnn_f32_dwconv_multipass_ukernel_fn dwconv) const { + xnn_f32_dwconv_multipass_ukernel_fn dwconv, const void*) const { xnnpack::ReplicableRandomDevice rng; std::uniform_real_distribution f32dist; diff --git a/test/dwconv-microkernel-tester.h b/test/dwconv-microkernel-tester.h index 31d47fac1d4..88336d7a1a6 100644 --- a/test/dwconv-microkernel-tester.h +++ b/test/dwconv-microkernel-tester.h @@ -211,12 +211,12 @@ class DWConvMicrokernelTester { void Test(xnn_f16_dwconv_minmax_multipass_ukernel_fn dwconv_minmax, xnn_init_f16_minmax_params_fn init_params) const; - void Test(xnn_f32_dwconv_unipass_ukernel_fn dwconv) const; + void Test(xnn_f32_dwconv_unipass_ukernel_fn dwconv, const void* = nullptr) const; void Test(xnn_f32_dwconv_minmax_unipass_ukernel_fn dwconv_minmax, xnn_init_f32_minmax_params_fn init_params) const; - void 
Test(xnn_f32_dwconv_multipass_ukernel_fn dwconv) const; + void Test(xnn_f32_dwconv_multipass_ukernel_fn dwconv, const void* = nullptr) const; void Test(xnn_f32_dwconv_minmax_multipass_ukernel_fn dwconv_minmax, xnn_init_f32_minmax_params_fn init_params) const; diff --git a/test/f16-dwconv-minmax-multipass.cc b/test/f16-dwconv-minmax-multipass.cc index f6dafa03d03..994598c292f 100644 --- a/test/f16-dwconv-minmax-multipass.cc +++ b/test/f16-dwconv-minmax-multipass.cc @@ -4,7 +4,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! -// Specification: test/f16-dwconv-minmax-multipass.yaml +// Microkernel: f16-dwconv-minmax-multipass // Generator: tools/generate-dwconv-multipass-test.py @@ -25,14 +25,13 @@ namespace { -std::vector CreateTests1( - size_t c_block, size_t adj_c_block, size_t cr, size_t kr, +std::vector CreateTests( + size_t c_block, size_t cr, size_t kr, size_t first_pass_tile, size_t middle_pass_tile, size_t last_pass_tile, size_t channel_subtile, size_t channel_round, std::function test_func, std::function isa_check = nullptr) { const std::string cbs = std::to_string(c_block); - const std::string acbs = std::to_string(adj_c_block); std::vector tests; tests.reserve(17); @@ -90,7 +89,7 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + 1) , test_func, isa_check) - .loop_channels(adj_c_block = c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_first_pass_and_last_pass", @@ -103,7 +102,7 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + last_pass_tile) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_multipass", @@ -115,7 +114,7 @@ std::vector CreateTests1( .channel_subtile(channel_subtile) .channel_round(channel_round) , 
test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3) + .loop_channels(c_block * 2, cr * 16, cr * 3) .loop_kernel_size( first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -132,7 +131,7 @@ std::vector CreateTests1( .kernel_size(first_pass_tile + last_pass_tile) .qmin(128) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_with_qmax", @@ -146,11 +145,11 @@ std::vector CreateTests1( .kernel_size(first_pass_tile + last_pass_tile) .qmax(128) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); } tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_first_pass_plus_one", + "c_gt_" + cbs + "_first_pass_plus_one", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -160,10 +159,10 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + 1) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block)); + .loop_channels(c_block + 1, c_block == 1 ? 10 : c_block * 2)); tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_first_pass_and_last_pass", + "c_gt_" + cbs + "_first_pass_and_last_pass", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -173,10 +172,10 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + last_pass_tile) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block)); + .loop_channels(c_block + 1, c_block == 1 ? 
10 : c_block * 2)); tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_multipass", + "c_gt_" + cbs + "_multipass", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -185,7 +184,7 @@ std::vector CreateTests1( .channel_subtile(channel_subtile) .channel_round(channel_round) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block) + .loop_channels(c_block + 1, c_block == 1 ? 10 : c_block * 2) .loop_kernel_size( first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -278,7 +277,7 @@ std::vector CreateTests1( .channel_round(channel_round) .input_offset(xnnpack::NextPrime(cr + 1) * 16) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3) + .loop_channels(c_block * 2, cr * 16, cr * 3) .loop_kernel_size(first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -287,722 +286,19 @@ std::vector CreateTests1( } // namespace - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_5F5M5L8C8S4R__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_5F5M5L8C8S4R__NEONFP16ARITH_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - 
/*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_5F5M5L16C8S4R__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_5F5M5L16C8S4R__NEONFP16ARITH_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && 
(XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_5F5M5L32C8S4R__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_5F5M5L32C8S4R__NEONFP16ARITH_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_6F6M7L8C8S4R__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith, - 
xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_6F6M7L8C8S4R__NEONFP16ARITH_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_6F6M7L16C8S4R__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_6F6M7L16C8S4R__NEONFP16ARITH_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - 
/*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_6F6M7L32C8S4R__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_6F6M7L32C8S4R__NEONFP16ARITH_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if 
XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_8F8M9L8C8S4R__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_8F8M9L8C8S4R__NEONFP16ARITH_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_8F8M9L16C8S4R__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith, - xnn_init_f16_minmax_scalar_params); - }, - []() 
{ - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_8F8M9L16C8S4R__NEONFP16ARITH_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_8F8M9L32C8S4R__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_8F8M9L32C8S4R__NEONFP16ARITH_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, 
/*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_5F5M5L8C8S4R__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_5F5M5L8C8S4R__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_5F5M5L16C8S4R__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, 
/*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_5F5M5L16C8S4R__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_5F5M5L32C8S4R__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_5F5M5L32C8S4R__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, 
/*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_6F6M7L8C8S4R__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_6F6M7L8C8S4R__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_6F6M7L16C8S4R__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) 
{ - tester.Test(xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_6F6M7L16C8S4R__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_6F6M7L32C8S4R__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_6F6M7L32C8S4R__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_8F8M9L8C8S4R__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_8F8M9L8C8S4R__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_8F8M9L16C8S4R__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_8F8M9L16C8S4R__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_8F8M9L32C8S4R__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_8F8M9L32C8S4R__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#define XNN_DWCONV_MULTIPASS(arch_flags, ukernel, first_pass_tile, middle_pass_tile, last_pass_tile, channel_tile, channel_subtile, channel_round, datatype, weights_type, buffer_type, params_type, init_params)\ +INSTANTIATE_TEST_SUITE_P( \ + ukernel, DWConvTest, \ + testing::ValuesIn(CreateTests( \ + channel_tile, channel_tile, first_pass_tile, \ + first_pass_tile, middle_pass_tile, last_pass_tile, \ + channel_subtile, channel_round, \ + [](DWConvMicrokernelTester& tester) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + tester.Test(ukernel, init_params); \ + })), \ + [](const testing::TestParamInfo& info) { \ + return info.param.test_name; \ + }); +#include "src/f16-dwconv/f16-dwconv-minmax-multipass.h" +#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-dwconv-minmax-multipass.yaml b/test/f16-dwconv-minmax-multipass.yaml deleted file mode 100644 index f21419280b5..00000000000 --- a/test/f16-dwconv-minmax-multipass.yaml +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2023 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# ARM NEON+FP16ARITH -- name: xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: 
xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith_acc2 - init: xnn_init_f16_minmax_scalar_params - -# x86 FMA3 -- name: xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params diff --git 
a/test/f16-dwconv-minmax-unipass.cc b/test/f16-dwconv-minmax-unipass.cc index 732743a0491..fbbce5862f8 100644 --- a/test/f16-dwconv-minmax-unipass.cc +++ b/test/f16-dwconv-minmax-unipass.cc @@ -7,7 +7,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! -// Specification: test/f16-dwconv-minmax-unipass.yaml +// Microkernel: f16-dwconv-minmax-unipass // Generator: tools/generate-dwconv-unipass-test.py @@ -28,10 +28,10 @@ namespace { -std::vector CreateTests1( - size_t c_block, size_t adj_c_block, size_t cr, size_t kr, - std::function test_func, - std::function isa_check = nullptr) { +std::vector CreateTests( + size_t c_block, bool is_pipelined, size_t cr, size_t kr, + std::function test_func) { + size_t adj_c_block = is_pipelined ? c_block * 2 : c_block; const std::string cbs = std::to_string(c_block); const std::string acbs = std::to_string(adj_c_block); @@ -44,8 +44,17 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .channels(c_block) - , test_func, isa_check)); + , test_func)); + if (is_pipelined) { + tests.push_back(DWConvTestParams( + "c_eq_" + std::to_string(c_block * 2), + DWConvMicrokernelTester() + .channel_tile(cr) + .kernel_tile(kr) + .channels(c_block * 2) + , test_func)); + } if (c_block > 1) { tests.push_back(DWConvTestParams( @@ -53,7 +62,7 @@ std::vector CreateTests1( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -62,7 +71,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -71,7 +80,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); 
tests.push_back(DWConvTestParams( @@ -79,7 +88,7 @@ std::vector CreateTests1( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(1, adj_c_block - 1)); } @@ -88,7 +97,7 @@ std::vector CreateTests1( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( @@ -97,7 +106,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( @@ -106,7 +115,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( @@ -115,7 +124,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .width(3) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -124,7 +133,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .width(3) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1)) .loop_step(2, kr)); @@ -135,7 +144,7 @@ std::vector CreateTests1( .kernel_tile(kr) .width(5) .output_stride(xnnpack::NextPrime(cr * 5 + 1)) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -145,7 +154,7 @@ std::vector CreateTests1( .kernel_tile(kr) .width(3) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -155,7 +164,7 @@ std::vector CreateTests1( .kernel_tile(kr) .width(3) 
.qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); @@ -165,7 +174,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .input_offset(xnnpack::NextPrime(cr + 1) * 16) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -174,7 +183,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .input_offset(xnnpack::NextPrime(cr + 1) * 16) - , test_func, isa_check) + , test_func) .loop_zi(0, kr - 1) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); @@ -183,866 +192,17 @@ std::vector CreateTests1( } // namespace - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_3P8C__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_3p8c__neonfp16arith, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_3P8C__NEONFP16ARITH_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_3p8c__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - 
INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_3P16C__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_3p16c__neonfp16arith, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_3P16C__NEONFP16ARITH_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_3p16c__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_3P32C__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_3p32c__neonfp16arith, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_3P32C__NEONFP16ARITH_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/3, - 
[](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_3p32c__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_4P8C__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_4P8C__NEONFP16ARITH_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_4P16C__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith, - xnn_init_f16_minmax_scalar_params); - }, - []() { - 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_4P16C__NEONFP16ARITH_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_4P32C__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_4P32C__NEONFP16ARITH_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && 
(XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_9P8C__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_9P8C__NEONFP16ARITH_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_9P16C__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_9P16C__NEONFP16ARITH_ACC2, 
DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_9P32C__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_9P32C__NEONFP16ARITH_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_25P8C__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_25P8C__NEONFP16ARITH_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_25P16C__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_25P16C__NEONFP16ARITH_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - 
[](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_25P32C__NEONFP16ARITH, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_25P32C__NEONFP16ARITH_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_3P8C__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_3p8c__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_3P8C__FMA3_ACC2, DWConvTest, 
- testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_3p8c__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_3P16C__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_3p16c__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_3P16C__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_3p16c__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_3P32C__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_3p32c__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - 
INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_3P32C__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_3p32c__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_4P8C__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_4p8c__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_4P8C__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_4p8c__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_4P16C__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_4p16c__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || 
XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_4P16C__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_4p16c__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_4P32C__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_4p32c__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_4P32C__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_4p32c__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_9P8C__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_9p8c__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { 
- return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_9P8C__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_9p8c__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_9P16C__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_9p16c__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_9P16C__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_9p16c__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_9P32C__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_9p32c__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - 
TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_9P32C__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_9p32c__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_25P8C__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_25p8c__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_25P8C__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_25p8c__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_25P16C__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_f16_dwconv_minmax_ukernel_25p16c__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_25P16C__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_25p16c__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_25P32C__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_25p32c__fma3, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F16_DWCONV_MINMAX_25P32C__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f16_dwconv_minmax_ukernel_25p32c__fma3_acc2, - xnn_init_f16_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#define XNN_DWCONV_UNIPASS(arch_flags, ukernel, c_block, is_pipelined, cr, kr, datatype, weights_type, params_type, init_params)\ +INSTANTIATE_TEST_SUITE_P( \ + ukernel, 
DWConvTest, \ + testing::ValuesIn(CreateTests( \ + c_block, is_pipelined, cr, kr, \ + [](DWConvMicrokernelTester& tester) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + tester.Test(ukernel, init_params); \ + })), \ + [](const testing::TestParamInfo& info) { \ + return info.param.test_name; \ + }); +#include "src/f16-dwconv/f16-dwconv-minmax-unipass.h" +#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-dwconv-minmax-unipass.yaml b/test/f16-dwconv-minmax-unipass.yaml deleted file mode 100644 index d8edeca1470..00000000000 --- a/test/f16-dwconv-minmax-unipass.yaml +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright 2020 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# ARM NEON+FP16ARITH -- name: xnn_f16_dwconv_minmax_ukernel_3p8c__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_3p8c__neonfp16arith_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_3p16c__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_3p16c__neonfp16arith_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_3p32c__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_3p32c__neonfp16arith_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith_acc2 
- init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith_acc2 - init: xnn_init_f16_minmax_scalar_params - -# x86 FMA3 -- name: xnn_f16_dwconv_minmax_ukernel_3p8c__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_3p8c__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_3p16c__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_3p16c__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_3p32c__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_3p32c__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_4p8c__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: 
xnn_f16_dwconv_minmax_ukernel_4p8c__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_4p16c__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_4p16c__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_4p32c__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_4p32c__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_9p8c__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_9p8c__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_9p16c__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_9p16c__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_9p32c__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_9p32c__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_25p8c__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_25p8c__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_25p16c__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_25p16c__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_25p32c__fma3 - init: xnn_init_f16_minmax_scalar_params -- name: xnn_f16_dwconv_minmax_ukernel_25p32c__fma3_acc2 - init: xnn_init_f16_minmax_scalar_params diff --git a/test/f32-dwconv-minmax-multipass.cc b/test/f32-dwconv-minmax-multipass.cc index 7020e8f61b0..4b1ae109c14 100644 --- a/test/f32-dwconv-minmax-multipass.cc +++ b/test/f32-dwconv-minmax-multipass.cc @@ -4,7 +4,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! 
-// Specification: test/f32-dwconv-minmax-multipass.yaml +// Microkernel: f32-dwconv-minmax-multipass // Generator: tools/generate-dwconv-multipass-test.py @@ -25,14 +25,13 @@ namespace { -std::vector CreateTests1( - size_t c_block, size_t adj_c_block, size_t cr, size_t kr, +std::vector CreateTests( + size_t c_block, size_t cr, size_t kr, size_t first_pass_tile, size_t middle_pass_tile, size_t last_pass_tile, size_t channel_subtile, size_t channel_round, std::function test_func, std::function isa_check = nullptr) { const std::string cbs = std::to_string(c_block); - const std::string acbs = std::to_string(adj_c_block); std::vector tests; tests.reserve(17); @@ -90,7 +89,7 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + 1) , test_func, isa_check) - .loop_channels(adj_c_block = c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_first_pass_and_last_pass", @@ -103,7 +102,7 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + last_pass_tile) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_multipass", @@ -115,7 +114,7 @@ std::vector CreateTests1( .channel_subtile(channel_subtile) .channel_round(channel_round) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3) + .loop_channels(c_block * 2, cr * 16, cr * 3) .loop_kernel_size( first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -132,7 +131,7 @@ std::vector CreateTests1( .kernel_size(first_pass_tile + last_pass_tile) .qmin(128) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_with_qmax", @@ -146,11 +145,11 @@ 
std::vector CreateTests1( .kernel_size(first_pass_tile + last_pass_tile) .qmax(128) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); } tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_first_pass_plus_one", + "c_gt_" + cbs + "_first_pass_plus_one", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -160,10 +159,10 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + 1) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block)); + .loop_channels(c_block + 1, c_block == 1 ? 10 : c_block * 2)); tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_first_pass_and_last_pass", + "c_gt_" + cbs + "_first_pass_and_last_pass", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -173,10 +172,10 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + last_pass_tile) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block)); + .loop_channels(c_block + 1, c_block == 1 ? 10 : c_block * 2)); tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_multipass", + "c_gt_" + cbs + "_multipass", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -185,7 +184,7 @@ std::vector CreateTests1( .channel_subtile(channel_subtile) .channel_round(channel_round) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block) + .loop_channels(c_block + 1, c_block == 1 ? 
10 : c_block * 2) .loop_kernel_size( first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -278,7 +277,7 @@ std::vector CreateTests1( .channel_round(channel_round) .input_offset(xnnpack::NextPrime(cr + 1) * 16) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3) + .loop_channels(c_block * 2, cr * 16, cr * 3) .loop_kernel_size(first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -287,1661 +286,19 @@ std::vector CreateTests1( } // namespace - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L4C4S4R__NEON, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__neon, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L4C4S4R__NEON_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__neon_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L4C4S4R__NEONFMA, 
DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__neonfma, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L4C4S4R__NEONFMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__neonfma_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L8C4S4R__NEON, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neon, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L8C4S4R__NEON_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - 
/*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neon_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L8C4S4R__NEONFMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neonfma, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L8C4S4R__NEONFMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neonfma_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L4C4S4R__NEON, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, 
/*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__neon, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L4C4S4R__NEON_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__neon_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L4C4S4R__NEONFMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__neonfma, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L4C4S4R__NEONFMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/6, - /*first_pass_tile=*/6, 
/*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__neonfma_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L8C4S4R__NEON, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__neon, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L8C4S4R__NEON_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__neon_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L8C4S4R__NEONFMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - 
/*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__neonfma, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L8C4S4R__NEONFMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__neonfma_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L4C4S4R__NEON, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neon, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L4C4S4R__NEON_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - 
[](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neon_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L4C4S4R__NEONFMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neonfma, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L4C4S4R__NEONFMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neonfma_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L8C4S4R__NEON, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__neon, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L8C4S4R__NEON_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__neon_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L8C4S4R__NEONFMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__neonfma, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L8C4S4R__NEONFMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__neonfma_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L4C4S4R__SSE, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__sse, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L4C4S4R__SSE_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__sse_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L8C4S4R__SSE, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__sse, - 
xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L8C4S4R__SSE_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__sse_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L16C4S4R__SSE, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l16c4s4r__sse, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L16C4S4R__SSE_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l16c4s4r__sse_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - 
TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L4C4S4R__SSE, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__sse, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L4C4S4R__SSE_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__sse_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L8C4S4R__SSE, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__sse, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - 
return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L8C4S4R__SSE_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__sse_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L16C4S4R__SSE, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l16c4s4r__sse, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L16C4S4R__SSE_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l16c4s4r__sse_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 
|| XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L4C4S4R__SSE, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__sse, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L4C4S4R__SSE_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__sse_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L8C4S4R__SSE, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__sse, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - 
INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L8C4S4R__SSE_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__sse_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L16C4S4R__SSE, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l16c4s4r__sse, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L16C4S4R__SSE_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l16c4s4r__sse_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - 
F32_DWCONV_MINMAX_5F5M5L8C8S4R__AVX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l8c8s4r__avx, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L8C8S4R__AVX_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l8c8s4r__avx_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L16C8S4R__AVX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l16c8s4r__avx, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L16C8S4R__AVX_ACC2, DWConvTest, - 
testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l16c8s4r__avx_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L8C8S4R__AVX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l8c8s4r__avx, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L8C8S4R__AVX_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l8c8s4r__avx_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L16C8S4R__AVX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, 
/*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l16c8s4r__avx, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L16C8S4R__AVX_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l16c8s4r__avx_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L8C8S4R__AVX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l8c8s4r__avx, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L8C8S4R__AVX_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, 
/*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l8c8s4r__avx_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L16C8S4R__AVX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l16c8s4r__avx, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L16C8S4R__AVX_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l16c8s4r__avx_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L8C8S4R__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - 
/*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L16C8S4R__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L32C8S4R__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_7F6M6L8C8S4R__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/7, - /*first_pass_tile=*/7, /*middle_pass_tile=*/6, /*last_pass_tile=*/6, - /*channel_subtile=*/8, /*channel_round=*/4, - 
[](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_7f6m6l8c8s4r__fma3, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_7F6M6L16C8S4R__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/7, - /*first_pass_tile=*/7, /*middle_pass_tile=*/6, /*last_pass_tile=*/6, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_7f6m6l16c8s4r__fma3, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_7F6M6L32C8S4R__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/7, - /*first_pass_tile=*/7, /*middle_pass_tile=*/6, /*last_pass_tile=*/6, - /*channel_subtile=*/8, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_7f6m6l32c8s4r__fma3, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L16C16S1R__AVX512F, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX512F; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L16C16S1R__AVX512F_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX512F; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L32C16S1R__AVX512F, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX512F; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L32C16S1R__AVX512F_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX512F; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L4C4S4R__WASMSIMD_ARM, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_arm, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L4C4S4R__WASMSIMD_ARM_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_arm_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L4C4S4R__WASMSIMD_X86, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_x86, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L4C4S4R__WASMSIMD_X86_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_x86_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L4C4S4R__WASMRELAXEDSIMD, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L4C4S4R__WASMRELAXEDSIMD_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_acc2, - 
xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L4C4S4R__WASMRELAXEDSIMD_FMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L4C4S4R__WASMRELAXEDSIMD_FMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L1C1S1R__WASM, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__wasm, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // 
XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L1C1S1R__WASM_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__wasm_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L1C1S1R__WASM, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l1c1s1r__wasm, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L1C1S1R__WASM_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l1c1s1r__wasm_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); 
-#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L1C1S1R__WASM, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l1c1s1r__wasm, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L1C1S1R__WASM_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l1c1s1r__wasm_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_2F2M2L1C1S1R__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/2, - /*first_pass_tile=*/2, /*middle_pass_tile=*/2, /*last_pass_tile=*/2, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_2f2m2l1c1s1r__scalar, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - 
F32_DWCONV_MINMAX_2F2M2L1C1S1R__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/2, - /*first_pass_tile=*/2, /*middle_pass_tile=*/2, /*last_pass_tile=*/2, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_2f2m2l1c1s1r__scalar_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; +#define XNN_DWCONV_MULTIPASS(arch_flags, ukernel, first_pass_tile, middle_pass_tile, last_pass_tile, channel_tile, channel_subtile, channel_round, datatype, weights_type, buffer_type, params_type, init_params)\ +INSTANTIATE_TEST_SUITE_P( \ + ukernel, DWConvTest, \ + testing::ValuesIn(CreateTests( \ + channel_tile, channel_tile, first_pass_tile, \ + first_pass_tile, middle_pass_tile, last_pass_tile, \ + channel_subtile, channel_round, \ + [](DWConvMicrokernelTester& tester) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + tester.Test(ukernel, init_params); \ + })), \ + [](const testing::TestParamInfo& info) { \ + return info.param.test_name; \ }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_2F2M2L4C1S1R__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/2, - /*first_pass_tile=*/2, /*middle_pass_tile=*/2, /*last_pass_tile=*/2, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_2f2m2l4c1s1r__scalar, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_2F2M2L4C1S1R__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/2, - /*first_pass_tile=*/2, /*middle_pass_tile=*/2, /*last_pass_tile=*/2, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& 
tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_2f2m2l4c1s1r__scalar_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L1C1S1R__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__scalar, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_5F5M5L1C1S1R__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__scalar_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L1C1S1R__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l1c1s1r__scalar, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_6F6M7L1C1S1R__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/6, - /*first_pass_tile=*/6, 
/*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_6f6m7l1c1s1r__scalar_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L1C1S1R__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l1c1s1r__scalar, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_8F8M9L1C1S1R__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_8f8m9l1c1s1r__scalar_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); \ No newline at end of file +#include "src/f32-dwconv/f32-dwconv-minmax-multipass.h" +#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-dwconv-minmax-multipass.yaml b/test/f32-dwconv-minmax-multipass.yaml deleted file mode 100644 index 60b15db9880..00000000000 --- a/test/f32-dwconv-minmax-multipass.yaml +++ /dev/null @@ -1,190 +0,0 @@ -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# ARM NEON -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__neon - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__neon_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__neonfma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__neonfma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neon - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neon_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neonfma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neonfma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__neon - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__neon_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__neonfma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__neonfma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__neon - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__neon_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__neonfma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__neonfma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neon - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neon_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neonfma - init: xnn_init_f32_minmax_scalar_params -- name: 
xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neonfma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__neon - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__neon_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__neonfma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__neonfma_acc2 - init: xnn_init_f32_minmax_scalar_params -# x86 SSE -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__sse - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__sse_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__sse - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__sse_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l16c4s4r__sse - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l16c4s4r__sse_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__sse - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__sse_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__sse - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__sse_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l16c4s4r__sse - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l16c4s4r__sse_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__sse - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__sse_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: 
xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__sse - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__sse_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_8f8m9l16c4s4r__sse - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_8f8m9l16c4s4r__sse_acc2 - init: xnn_init_f32_minmax_scalar_params -# x86 AVX -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l8c8s4r__avx - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l8c8s4r__avx_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l16c8s4r__avx - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l16c8s4r__avx_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l8c8s4r__avx - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l8c8s4r__avx_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l16c8s4r__avx - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l16c8s4r__avx_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_8f8m9l8c8s4r__avx - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_8f8m9l8c8s4r__avx_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_8f8m9l16c8s4r__avx - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_8f8m9l16c8s4r__avx_acc2 - init: xnn_init_f32_minmax_scalar_params -# x86 FMA3 -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3 - init: xnn_init_f32_minmax_scalar_params -- name: 
xnn_f32_dwconv_minmax_ukernel_7f6m6l8c8s4r__fma3 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_7f6m6l16c8s4r__fma3 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_7f6m6l32c8s4r__fma3 - init: xnn_init_f32_minmax_scalar_params -# x86 AVX512 -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f_acc2 - init: xnn_init_f32_minmax_scalar_params -# Wasm SIMD -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_arm - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_arm_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_x86 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_x86_acc2 - init: xnn_init_f32_minmax_scalar_params -# Wasm Relaxed SIMD -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma_acc2 - init: xnn_init_f32_minmax_scalar_params -# Wasm -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__wasm - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__wasm_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l1c1s1r__wasm - init: xnn_init_f32_minmax_scalar_params -- name: 
xnn_f32_dwconv_minmax_ukernel_6f6m7l1c1s1r__wasm_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_8f8m9l1c1s1r__wasm - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_8f8m9l1c1s1r__wasm_acc2 - init: xnn_init_f32_minmax_scalar_params -# Scalar -- name: xnn_f32_dwconv_minmax_ukernel_2f2m2l1c1s1r__scalar - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_2f2m2l1c1s1r__scalar_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_2f2m2l4c1s1r__scalar - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_2f2m2l4c1s1r__scalar_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__scalar - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__scalar_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l1c1s1r__scalar - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_6f6m7l1c1s1r__scalar_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_8f8m9l1c1s1r__scalar - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_8f8m9l1c1s1r__scalar_acc2 - init: xnn_init_f32_minmax_scalar_params diff --git a/test/f32-dwconv-minmax-unipass.cc b/test/f32-dwconv-minmax-unipass.cc index b939477536d..00cf20ccf08 100644 --- a/test/f32-dwconv-minmax-unipass.cc +++ b/test/f32-dwconv-minmax-unipass.cc @@ -7,7 +7,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! 
-// Specification: test/f32-dwconv-minmax-unipass.yaml +// Microkernel: f32-dwconv-minmax-unipass // Generator: tools/generate-dwconv-unipass-test.py @@ -28,10 +28,10 @@ namespace { -std::vector CreateTests1( - size_t c_block, size_t adj_c_block, size_t cr, size_t kr, - std::function test_func, - std::function isa_check = nullptr) { +std::vector CreateTests( + size_t c_block, bool is_pipelined, size_t cr, size_t kr, + std::function test_func) { + size_t adj_c_block = is_pipelined ? c_block * 2 : c_block; const std::string cbs = std::to_string(c_block); const std::string acbs = std::to_string(adj_c_block); @@ -44,176 +44,25 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .channels(c_block) - , test_func, isa_check)); - - - if (c_block > 1) { - tests.push_back(DWConvTestParams( - "c_div_" + cbs, - DWConvMicrokernelTester() - .channel_tile(cr) - .kernel_tile(kr) - , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); - - tests.push_back(DWConvTestParams( - "c_div_" + cbs + "_with_qmin", - DWConvMicrokernelTester() - .channel_tile(cr) - .kernel_tile(kr) - .qmin(128) - , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); - - tests.push_back(DWConvTestParams( - "c_div_" + cbs + "_with_qmax", - DWConvMicrokernelTester() - .channel_tile(cr) - .kernel_tile(kr) - .qmax(128) - , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); + , test_func)); + if (is_pipelined) { tests.push_back(DWConvTestParams( - "c_lt_" + acbs, + "c_eq_" + std::to_string(c_block * 2), DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) - .loop_channels(1, adj_c_block - 1)); + .channels(c_block * 2) + , test_func)); } - tests.push_back(DWConvTestParams( - "c_gt_" + acbs, - DWConvMicrokernelTester() - .channel_tile(cr) - .kernel_tile(kr) - , test_func, isa_check) - .loop_channels(adj_c_block + 1, (c_block == 1 ? 
10 : adj_c_block + c_block) - 1)); - - tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_with_qmin", - DWConvMicrokernelTester() - .channel_tile(cr) - .kernel_tile(kr) - .qmin(128) - , test_func, isa_check) - .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); - - tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_with_qmax", - DWConvMicrokernelTester() - .channel_tile(cr) - .kernel_tile(kr) - .qmax(128) - , test_func, isa_check) - .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); - - tests.push_back(DWConvTestParams( - "multipixel", - DWConvMicrokernelTester() - .channel_tile(cr) - .kernel_tile(kr) - .width(3) - , test_func, isa_check) - .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); - - tests.push_back(DWConvTestParams( - "multipixel_with_step", - DWConvMicrokernelTester() - .channel_tile(cr) - .kernel_tile(kr) - .width(3) - , test_func, isa_check) - .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1)) - .loop_step(2, kr)); - - tests.push_back(DWConvTestParams( - "multipixel_with_output_stride", - DWConvMicrokernelTester() - .channel_tile(cr) - .kernel_tile(kr) - .width(5) - .output_stride(xnnpack::NextPrime(cr * 5 + 1)) - , test_func, isa_check) - .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); - - tests.push_back(DWConvTestParams( - "multipixel_with_qmin", - DWConvMicrokernelTester() - .channel_tile(cr) - .kernel_tile(kr) - .width(3) - .qmin(128) - , test_func, isa_check) - .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); - - tests.push_back(DWConvTestParams( - "multipixel_with_qmax", - DWConvMicrokernelTester() - .channel_tile(cr) - .kernel_tile(kr) - .width(3) - .qmax(128) - , test_func, isa_check) - .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); - - - tests.push_back(DWConvTestParams( - "input_offset", - DWConvMicrokernelTester() - .channel_tile(cr) - .kernel_tile(kr) - 
.input_offset(xnnpack::NextPrime(cr + 1) * 16) - , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); - - tests.push_back(DWConvTestParams( - "zero", - DWConvMicrokernelTester() - .channel_tile(cr) - .kernel_tile(kr) - .input_offset(xnnpack::NextPrime(cr + 1) * 16) - , test_func, isa_check) - .loop_zi(0, kr - 1) - .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); - - return tests; -} - -std::vector CreateTests2( - size_t c_block, size_t adj_c_block, size_t cr, size_t kr, - std::function test_func, - std::function isa_check = nullptr) { - const std::string cbs = std::to_string(c_block); - const std::string acbs = std::to_string(adj_c_block); - - std::vector tests; - tests.reserve(18); - - tests.push_back(DWConvTestParams( - "c_eq_" + cbs, - DWConvMicrokernelTester() - .channel_tile(cr) - .kernel_tile(kr) - .channels(c_block) - , test_func, isa_check)); - - tests.push_back(DWConvTestParams( - "c_eq_" + std::to_string(c_block * 2), - DWConvMicrokernelTester() - .channel_tile(cr) - .kernel_tile(kr) - .channels(c_block * 2) - , test_func, isa_check)); - if (c_block > 1) { tests.push_back(DWConvTestParams( "c_div_" + cbs, DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -222,7 +71,7 @@ std::vector CreateTests2( .channel_tile(cr) .kernel_tile(kr) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -231,7 +80,7 @@ std::vector CreateTests2( .channel_tile(cr) .kernel_tile(kr) .qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -239,7 +88,7 @@ std::vector CreateTests2( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(1, 
adj_c_block - 1)); } @@ -248,7 +97,7 @@ std::vector CreateTests2( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( @@ -257,7 +106,7 @@ std::vector CreateTests2( .channel_tile(cr) .kernel_tile(kr) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( @@ -266,7 +115,7 @@ std::vector CreateTests2( .channel_tile(cr) .kernel_tile(kr) .qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( @@ -275,7 +124,7 @@ std::vector CreateTests2( .channel_tile(cr) .kernel_tile(kr) .width(3) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -284,7 +133,7 @@ std::vector CreateTests2( .channel_tile(cr) .kernel_tile(kr) .width(3) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1)) .loop_step(2, kr)); @@ -295,7 +144,7 @@ std::vector CreateTests2( .kernel_tile(kr) .width(5) .output_stride(xnnpack::NextPrime(cr * 5 + 1)) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -305,7 +154,7 @@ std::vector CreateTests2( .kernel_tile(kr) .width(3) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -315,7 +164,7 @@ std::vector CreateTests2( .kernel_tile(kr) .width(3) .qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); @@ -325,7 +174,7 @@ std::vector CreateTests2( .channel_tile(cr) .kernel_tile(kr) 
.input_offset(xnnpack::NextPrime(cr + 1) * 16) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -334,7 +183,7 @@ std::vector CreateTests2( .channel_tile(cr) .kernel_tile(kr) .input_offset(xnnpack::NextPrime(cr + 1) * 16) - , test_func, isa_check) + , test_func) .loop_zi(0, kr - 1) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); @@ -343,3447 +192,17 @@ std::vector CreateTests2( } // namespace - -#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P4C__ASM_AARCH64_NEONFMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p4c__asm_aarch64_neonfma, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY - - -#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P4C__ASM_AARCH64_NEONFMA_CORTEX_A55, DWConvTest, - testing::ValuesIn(CreateTests2( - /*c_block=*/4, /*adj_c_block=*/8, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p4c__asm_aarch64_neonfma_cortex_a55, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P4C__NEON, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p4c__neon, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; 
- })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P4C__NEON_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p4c__neon_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P4C__NEONFMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p4c__neonfma, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P4C__NEONFMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p4c__neonfma_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P8C__NEON, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p8c__neon, - 
xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P8C__NEON_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p8c__neon_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P8C__NEONFMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p8c__neonfma, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P8C__NEONFMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p8c__neonfma_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P16C__NEON, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& 
tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p16c__neon, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P16C__NEON_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p16c__neon_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P16C__NEONFMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p16c__neonfma, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P16C__NEONFMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p16c__neonfma_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P4C__NEON, DWConvTest, - testing::ValuesIn(CreateTests1( - 
/*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p4c__neon, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P4C__NEON_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p4c__neon_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P4C__NEONFMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p4c__neonfma, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P4C__NEONFMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p4c__neonfma_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - 
F32_DWCONV_MINMAX_4P8C__NEON, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p8c__neon, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P8C__NEON_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p8c__neon_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P8C__NEONFMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p8c__neonfma, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P8C__NEONFMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p8c__neonfma_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if 
XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P16C__NEON, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p16c__neon, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P16C__NEON_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p16c__neon_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P16C__NEONFMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p16c__neonfma, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P16C__NEONFMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p16c__neonfma_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - 
return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P4C__NEON, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p4c__neon, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P4C__NEON_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p4c__neon_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P4C__NEONFMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p4c__neonfma, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P4C__NEONFMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p4c__neonfma_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - 
TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P8C__NEON, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p8c__neon, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P8C__NEON_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p8c__neon_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P8C__NEONFMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p8c__neonfma, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P8C__NEONFMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_f32_dwconv_minmax_ukernel_9p8c__neonfma_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P16C__NEON, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p16c__neon, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P16C__NEON_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p16c__neon_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P16C__NEONFMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p16c__neonfma, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P16C__NEONFMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, 
/*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p16c__neonfma_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P4C__NEON, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p4c__neon, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P4C__NEON_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p4c__neon_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P4C__NEONFMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p4c__neonfma, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - 
F32_DWCONV_MINMAX_25P4C__NEONFMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p4c__neonfma_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P8C__NEON, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p8c__neon, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P8C__NEON_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p8c__neon_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P8C__NEONFMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p8c__neonfma, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || 
XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P8C__NEONFMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p8c__neonfma_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P16C__NEON, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p16c__neon, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P16C__NEON_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p16c__neon_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P16C__NEONFMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p16c__neonfma, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const 
testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P16C__NEONFMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p16c__neonfma_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_ARM_NEON_FMA; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P4C__SSE, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p4c__sse, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P4C__SSE_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p4c__sse_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P8C__SSE, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p8c__sse, - xnn_init_f32_minmax_scalar_params); - 
}, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P8C__SSE_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p8c__sse_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P4C__SSE, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p4c__sse, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P4C__SSE_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p4c__sse_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P8C__SSE, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p8c__sse, - 
xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P8C__SSE_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p8c__sse_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P4C__SSE, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p4c__sse, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P4C__SSE_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p4c__sse_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P8C__SSE, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_f32_dwconv_minmax_ukernel_9p8c__sse, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P8C__SSE_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p8c__sse_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P4C__SSE, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p4c__sse, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P4C__SSE_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p4c__sse_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P8C__SSE, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, 
/*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p8c__sse, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P8C__SSE_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p8c__sse_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_SSE; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P8C__AVX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p8c__avx, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P8C__AVX_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p8c__avx_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P16C__AVX, DWConvTest, - testing::ValuesIn(CreateTests1( - 
/*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p16c__avx, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P16C__AVX_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p16c__avx_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P8C__AVX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p8c__avx, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P8C__AVX_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p8c__avx_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P16C__AVX, 
DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p16c__avx, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P16C__AVX_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p16c__avx_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P8C__AVX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p8c__avx, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P8C__AVX_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p8c__avx_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - 
INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P16C__AVX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p16c__avx, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P16C__AVX_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p16c__avx_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P8C__AVX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p8c__avx, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P8C__AVX_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p8c__avx_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || 
XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P16C__AVX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p16c__avx, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P16C__AVX_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p16c__avx_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P8C__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p8c__fma3, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P8C__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p8c__fma3_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - 
return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P16C__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p16c__fma3, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P16C__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p16c__fma3_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P8C__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p8c__fma3, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P8C__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p8c__fma3_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - 
TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P16C__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p16c__fma3, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P16C__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p16c__fma3_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P8C__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p8c__fma3, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P8C__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_f32_dwconv_minmax_ukernel_9p8c__fma3_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P16C__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p16c__fma3, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P16C__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p16c__fma3_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P8C__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p8c__fma3, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P8C__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, 
/*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p8c__fma3_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P16C__FMA3, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p16c__fma3, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P16C__FMA3_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p16c__fma3_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_FMA3; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P16C__AVX512F, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p16c__avx512f, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX512F; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - 
F32_DWCONV_MINMAX_3P16C__AVX512F_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p16c__avx512f_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX512F; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P32C__AVX512F, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p32c__avx512f, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX512F; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P32C__AVX512F_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p32c__avx512f_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX512F; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P16C__AVX512F, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p16c__avx512f, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX512F; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif 
// XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P16C__AVX512F_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p16c__avx512f_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX512F; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P32C__AVX512F, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p32c__avx512f, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX512F; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P32C__AVX512F_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p32c__avx512f_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX512F; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P16C__AVX512F, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p16c__avx512f, - xnn_init_f32_minmax_scalar_params); - }, - []() { - 
TEST_REQUIRES_X86_AVX512F; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P16C__AVX512F_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p16c__avx512f_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX512F; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P32C__AVX512F, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p32c__avx512f, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX512F; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P32C__AVX512F_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p32c__avx512f_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX512F; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P16C__AVX512F, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX512F; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P16C__AVX512F_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX512F; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P32C__AVX512F, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p32c__avx512f, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX512F; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P32C__AVX512F_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p32c__avx512f_acc2, - xnn_init_f32_minmax_scalar_params); - }, - []() { - TEST_REQUIRES_X86_AVX512F; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P4C__WASMSIMD_ARM, DWConvTest, - 
testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p4c__wasmsimd_arm, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P4C__WASMSIMD_ARM_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p4c__wasmsimd_arm_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P4C__WASMSIMD_X86, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p4c__wasmsimd_x86, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P4C__WASMSIMD_X86_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p4c__wasmsimd_x86_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - 
INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P8C__WASMSIMD_ARM, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p8c__wasmsimd_arm, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P8C__WASMSIMD_ARM_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p8c__wasmsimd_arm_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P8C__WASMSIMD_X86, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p8c__wasmsimd_x86, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P8C__WASMSIMD_X86_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p8c__wasmsimd_x86_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || 
XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P4C__WASMSIMD_ARM, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p4c__wasmsimd_arm, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P4C__WASMSIMD_ARM_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p4c__wasmsimd_arm_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P4C__WASMSIMD_X86, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p4c__wasmsimd_x86, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P4C__WASMSIMD_X86_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p4c__wasmsimd_x86_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - 
return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P8C__WASMSIMD_ARM, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p8c__wasmsimd_arm, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P8C__WASMSIMD_ARM_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p8c__wasmsimd_arm_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P8C__WASMSIMD_X86, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p8c__wasmsimd_x86, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P8C__WASMSIMD_X86_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p8c__wasmsimd_x86_acc2, - 
xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P4C__WASMSIMD_ARM, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_arm, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P4C__WASMSIMD_ARM_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_arm_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P4C__WASMSIMD_X86, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_x86, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P4C__WASMSIMD_X86_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_x86_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P8C__WASMSIMD_ARM, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_arm, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P8C__WASMSIMD_ARM_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_arm_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P8C__WASMSIMD_X86, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_x86, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P8C__WASMSIMD_X86_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, 
/*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_x86_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P4C__WASMSIMD_ARM, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_arm, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P4C__WASMSIMD_ARM_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_arm_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P4C__WASMSIMD_X86, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_x86, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - 
F32_DWCONV_MINMAX_25P4C__WASMSIMD_X86_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_x86_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P8C__WASMSIMD_ARM, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p8c__wasmsimd_arm, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P8C__WASMSIMD_ARM_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p8c__wasmsimd_arm_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P8C__WASMSIMD_X86, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p8c__wasmsimd_x86, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if 
XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P8C__WASMSIMD_X86_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p8c__wasmsimd_x86_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P4C__WASMRELAXEDSIMD, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p4c__wasmrelaxedsimd, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P4C__WASMRELAXEDSIMD_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p4c__wasmrelaxedsimd_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P4C__WASMRELAXEDSIMD_FMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p4c__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - 
INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P4C__WASMRELAXEDSIMD_FMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p4c__wasmrelaxedsimd_fma_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P8C__WASMRELAXEDSIMD, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p8c__wasmrelaxedsimd, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P8C__WASMRELAXEDSIMD_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p8c__wasmrelaxedsimd_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P8C__WASMRELAXEDSIMD_FMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p8c__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - 
F32_DWCONV_MINMAX_3P8C__WASMRELAXEDSIMD_FMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p8c__wasmrelaxedsimd_fma_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P4C__WASMRELAXEDSIMD, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p4c__wasmrelaxedsimd, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P4C__WASMRELAXEDSIMD_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p4c__wasmrelaxedsimd_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P4C__WASMRELAXEDSIMD_FMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p4c__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P4C__WASMRELAXEDSIMD_FMA_ACC2, DWConvTest, - 
testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p4c__wasmrelaxedsimd_fma_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P8C__WASMRELAXEDSIMD, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p8c__wasmrelaxedsimd, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P8C__WASMRELAXEDSIMD_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p8c__wasmrelaxedsimd_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P8C__WASMRELAXEDSIMD_FMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p8c__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P8C__WASMRELAXEDSIMD_FMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, 
/*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p8c__wasmrelaxedsimd_fma_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P4C__WASMRELAXEDSIMD, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p4c__wasmrelaxedsimd, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P4C__WASMRELAXEDSIMD_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p4c__wasmrelaxedsimd_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P4C__WASMRELAXEDSIMD_FMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p4c__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P4C__WASMRELAXEDSIMD_FMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - 
[](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p4c__wasmrelaxedsimd_fma_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P8C__WASMRELAXEDSIMD, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p8c__wasmrelaxedsimd, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P8C__WASMRELAXEDSIMD_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p8c__wasmrelaxedsimd_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P8C__WASMRELAXEDSIMD_FMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p8c__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P8C__WASMRELAXEDSIMD_FMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_f32_dwconv_minmax_ukernel_9p8c__wasmrelaxedsimd_fma_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P4C__WASMRELAXEDSIMD, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p4c__wasmrelaxedsimd, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P4C__WASMRELAXEDSIMD_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p4c__wasmrelaxedsimd_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P4C__WASMRELAXEDSIMD_FMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p4c__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P4C__WASMRELAXEDSIMD_FMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_f32_dwconv_minmax_ukernel_25p4c__wasmrelaxedsimd_fma_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P8C__WASMRELAXEDSIMD, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P8C__WASMRELAXEDSIMD_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P8C__WASMRELAXEDSIMD_FMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P8C__WASMRELAXEDSIMD_FMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd_fma_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P1C__WASM, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p1c__wasm, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P1C__WASM_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p1c__wasm_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P2C__WASM, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p2c__wasm, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P2C__WASM_ACC2, DWConvTest, - 
testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p2c__wasm_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P1C__WASM, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p1c__wasm, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P1C__WASM_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p1c__wasm_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P2C__WASM, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p2c__wasm, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if 
XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P2C__WASM_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p2c__wasm_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P1C__WASM, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p1c__wasm, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P1C__WASM_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p1c__wasm_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P2C__WASM, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p2c__wasm, - xnn_init_f32_minmax_scalar_params); - })), - [](const 
testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P2C__WASM_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p2c__wasm_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P1C__WASM, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p1c__wasm, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P1C__WASM_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p1c__wasm_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P2C__WASM, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/25, - 
[](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p2c__wasm, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P2C__WASM_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p2c__wasm_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P1C__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p1c__scalar, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P1C__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p1c__scalar_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P2C__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p2c__scalar, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return 
info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_3P2C__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_3p2c__scalar_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P1C__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p1c__scalar, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P1C__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p1c__scalar_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P2C__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p2c__scalar, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_4P2C__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_4p2c__scalar_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - 
-INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P1C__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p1c__scalar, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P1C__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p1c__scalar_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P2C__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p2c__scalar, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; +#define XNN_DWCONV_UNIPASS(arch_flags, ukernel, c_block, is_pipelined, cr, kr, datatype, weights_type, params_type, init_params)\ +INSTANTIATE_TEST_SUITE_P( \ + ukernel, DWConvTest, \ + testing::ValuesIn(CreateTests( \ + c_block, is_pipelined, cr, kr, \ + [](DWConvMicrokernelTester& tester) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + tester.Test(ukernel, init_params); \ + })), \ + [](const testing::TestParamInfo& info) { \ + return info.param.test_name; \ }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_9P2C__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_9p2c__scalar_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - 
return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P1C__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p1c__scalar, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P1C__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p1c__scalar_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P2C__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p2c__scalar, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_MINMAX_25P2C__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_minmax_ukernel_25p2c__scalar_acc2, - xnn_init_f32_minmax_scalar_params); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); \ No newline at end of file +#include "src/f32-dwconv/f32-dwconv-minmax-unipass.h" +#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-dwconv-minmax-unipass.yaml b/test/f32-dwconv-minmax-unipass.yaml deleted file mode 100644 index fdcbc0e7d79..00000000000 --- a/test/f32-dwconv-minmax-unipass.yaml +++ /dev/null @@ -1,436 +0,0 @@ -# Copyright 2019 Google LLC -# -# This source 
code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# AArch64 assembly -- name: xnn_f32_dwconv_minmax_ukernel_9p4c__asm_aarch64_neonfma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p4c__asm_aarch64_neonfma_cortex_a55 - init: xnn_init_f32_minmax_scalar_params - pipelined: true -# ARM NEON -- name: xnn_f32_dwconv_minmax_ukernel_3p4c__neon - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p4c__neon_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p4c__neonfma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p4c__neonfma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p8c__neon - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p8c__neon_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p8c__neonfma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p8c__neonfma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p16c__neon - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p16c__neon_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p16c__neonfma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p16c__neonfma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p4c__neon - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p4c__neon_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p4c__neonfma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p4c__neonfma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p8c__neon - init: 
xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p8c__neon_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p8c__neonfma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p8c__neonfma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p16c__neon - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p16c__neon_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p16c__neonfma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p16c__neonfma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p4c__neon - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p4c__neon_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p4c__neonfma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p4c__neonfma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p8c__neon - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p8c__neon_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p8c__neonfma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p8c__neonfma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p16c__neon - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p16c__neon_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p16c__neonfma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p16c__neonfma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p4c__neon - init: xnn_init_f32_minmax_scalar_params -- name: 
xnn_f32_dwconv_minmax_ukernel_25p4c__neon_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p4c__neonfma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p4c__neonfma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p8c__neon - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p8c__neon_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p8c__neonfma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p8c__neonfma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p16c__neon - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p16c__neon_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p16c__neonfma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p16c__neonfma_acc2 - init: xnn_init_f32_minmax_scalar_params -# x86 SSE -- name: xnn_f32_dwconv_minmax_ukernel_3p4c__sse - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p4c__sse_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p8c__sse - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p8c__sse_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p4c__sse - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p4c__sse_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p8c__sse - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p8c__sse_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p4c__sse - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p4c__sse_acc2 - init: 
xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p8c__sse - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p8c__sse_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p4c__sse - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p4c__sse_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p8c__sse - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p8c__sse_acc2 - init: xnn_init_f32_minmax_scalar_params -# x86 AVX -- name: xnn_f32_dwconv_minmax_ukernel_3p8c__avx - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p8c__avx_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p16c__avx - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p16c__avx_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p8c__avx - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p8c__avx_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p16c__avx - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p16c__avx_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p8c__avx - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p8c__avx_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p16c__avx - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p16c__avx_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p8c__avx - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p8c__avx_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p16c__avx - init: 
xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p16c__avx_acc2 - init: xnn_init_f32_minmax_scalar_params -# x86 FMA3 -- name: xnn_f32_dwconv_minmax_ukernel_3p8c__fma3 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p8c__fma3_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p16c__fma3 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p16c__fma3_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p8c__fma3 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p8c__fma3_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p16c__fma3 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p16c__fma3_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p8c__fma3 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p8c__fma3_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p16c__fma3 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p16c__fma3_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p8c__fma3 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p8c__fma3_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p16c__fma3 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p16c__fma3_acc2 - init: xnn_init_f32_minmax_scalar_params -# x86 AVX512 -- name: xnn_f32_dwconv_minmax_ukernel_3p16c__avx512f - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p16c__avx512f_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p32c__avx512f - init: xnn_init_f32_minmax_scalar_params -- name: 
xnn_f32_dwconv_minmax_ukernel_3p32c__avx512f_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p16c__avx512f - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p16c__avx512f_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p32c__avx512f - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p32c__avx512f_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p16c__avx512f - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p16c__avx512f_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p32c__avx512f - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p32c__avx512f_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p32c__avx512f - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p32c__avx512f_acc2 - init: xnn_init_f32_minmax_scalar_params -# WAsm SIMD -- name: xnn_f32_dwconv_minmax_ukernel_3p4c__wasmsimd_arm - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p4c__wasmsimd_arm_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p4c__wasmsimd_x86 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p4c__wasmsimd_x86_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p8c__wasmsimd_arm - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p8c__wasmsimd_arm_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p8c__wasmsimd_x86 - init: 
xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p8c__wasmsimd_x86_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p4c__wasmsimd_arm - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p4c__wasmsimd_arm_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p4c__wasmsimd_x86 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p4c__wasmsimd_x86_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p8c__wasmsimd_arm - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p8c__wasmsimd_arm_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p8c__wasmsimd_x86 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p8c__wasmsimd_x86_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_arm - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_arm_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_x86 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_x86_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_arm - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_arm_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_x86 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_x86_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_arm - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_arm_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: 
xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_x86 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_x86_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p8c__wasmsimd_arm - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p8c__wasmsimd_arm_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p8c__wasmsimd_x86 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p8c__wasmsimd_x86_acc2 - init: xnn_init_f32_minmax_scalar_params -# WAsm Relaxed SIMD -- name: xnn_f32_dwconv_minmax_ukernel_3p4c__wasmrelaxedsimd - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p4c__wasmrelaxedsimd_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p4c__wasmrelaxedsimd_fma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p4c__wasmrelaxedsimd_fma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p8c__wasmrelaxedsimd - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p8c__wasmrelaxedsimd_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p8c__wasmrelaxedsimd_fma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p8c__wasmrelaxedsimd_fma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p4c__wasmrelaxedsimd - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p4c__wasmrelaxedsimd_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p4c__wasmrelaxedsimd_fma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p4c__wasmrelaxedsimd_fma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p8c__wasmrelaxedsimd - init: 
xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p8c__wasmrelaxedsimd_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p8c__wasmrelaxedsimd_fma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p8c__wasmrelaxedsimd_fma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p4c__wasmrelaxedsimd - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p4c__wasmrelaxedsimd_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p4c__wasmrelaxedsimd_fma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p4c__wasmrelaxedsimd_fma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p8c__wasmrelaxedsimd - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p8c__wasmrelaxedsimd_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p8c__wasmrelaxedsimd_fma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p8c__wasmrelaxedsimd_fma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p4c__wasmrelaxedsimd - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p4c__wasmrelaxedsimd_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p4c__wasmrelaxedsimd_fma - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p4c__wasmrelaxedsimd_fma_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd_fma - init: xnn_init_f32_minmax_scalar_params -- name: 
xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd_fma_acc2 - init: xnn_init_f32_minmax_scalar_params -# WAsm -- name: xnn_f32_dwconv_minmax_ukernel_3p1c__wasm - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p1c__wasm_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p2c__wasm - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p2c__wasm_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p1c__wasm - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p1c__wasm_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p2c__wasm - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p2c__wasm_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p1c__wasm - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p1c__wasm_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p2c__wasm - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p2c__wasm_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p1c__wasm - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p1c__wasm_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p2c__wasm - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p2c__wasm_acc2 - init: xnn_init_f32_minmax_scalar_params -# Scalar -- name: xnn_f32_dwconv_minmax_ukernel_3p1c__scalar - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p1c__scalar_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p2c__scalar - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_3p2c__scalar_acc2 - init: 
xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p1c__scalar - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p1c__scalar_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p2c__scalar - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_4p2c__scalar_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p1c__scalar - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p1c__scalar_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p2c__scalar - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_9p2c__scalar_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p1c__scalar - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p1c__scalar_acc2 - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p2c__scalar - init: xnn_init_f32_minmax_scalar_params -- name: xnn_f32_dwconv_minmax_ukernel_25p2c__scalar_acc2 - init: xnn_init_f32_minmax_scalar_params diff --git a/test/f32-dwconv-multipass.cc b/test/f32-dwconv-multipass.cc index 4a02b284fe1..279dfc94b4c 100644 --- a/test/f32-dwconv-multipass.cc +++ b/test/f32-dwconv-multipass.cc @@ -4,7 +4,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! 
-// Specification: test/f32-dwconv-multipass.yaml +// Microkernel: f32-dwconv-multipass // Generator: tools/generate-dwconv-multipass-test.py @@ -25,14 +25,13 @@ namespace { -std::vector CreateTests1( - size_t c_block, size_t adj_c_block, size_t cr, size_t kr, +std::vector CreateTests( + size_t c_block, size_t cr, size_t kr, size_t first_pass_tile, size_t middle_pass_tile, size_t last_pass_tile, size_t channel_subtile, size_t channel_round, std::function test_func, std::function isa_check = nullptr) { const std::string cbs = std::to_string(c_block); - const std::string acbs = std::to_string(adj_c_block); std::vector tests; tests.reserve(17); @@ -90,7 +89,7 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + 1) , test_func, isa_check) - .loop_channels(adj_c_block = c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_first_pass_and_last_pass", @@ -103,7 +102,7 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + last_pass_tile) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_multipass", @@ -115,7 +114,7 @@ std::vector CreateTests1( .channel_subtile(channel_subtile) .channel_round(channel_round) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3) + .loop_channels(c_block * 2, cr * 16, cr * 3) .loop_kernel_size( first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -123,7 +122,7 @@ std::vector CreateTests1( } tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_first_pass_plus_one", + "c_gt_" + cbs + "_first_pass_plus_one", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -133,10 +132,10 @@ std::vector CreateTests1( .channel_round(channel_round) 
.kernel_size(first_pass_tile + 1) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block)); + .loop_channels(c_block + 1, c_block == 1 ? 10 : c_block * 2)); tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_first_pass_and_last_pass", + "c_gt_" + cbs + "_first_pass_and_last_pass", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -146,10 +145,10 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + last_pass_tile) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block)); + .loop_channels(c_block + 1, c_block == 1 ? 10 : c_block * 2)); tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_multipass", + "c_gt_" + cbs + "_multipass", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -158,7 +157,7 @@ std::vector CreateTests1( .channel_subtile(channel_subtile) .channel_round(channel_round) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block) + .loop_channels(c_block + 1, c_block == 1 ? 
10 : c_block * 2) .loop_kernel_size( first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -251,7 +250,7 @@ std::vector CreateTests1( .channel_round(channel_round) .input_offset(xnnpack::NextPrime(cr + 1) * 16) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3) + .loop_channels(c_block * 2, cr * 16, cr * 3) .loop_kernel_size(first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -260,196 +259,19 @@ std::vector CreateTests1( } // namespace - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_2F2M2L1C1S1R__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/2, - /*first_pass_tile=*/2, /*middle_pass_tile=*/2, /*last_pass_tile=*/2, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_2f2m2l1c1s1r__scalar); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_2F2M2L1C1S1R__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/2, - /*first_pass_tile=*/2, /*middle_pass_tile=*/2, /*last_pass_tile=*/2, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_2f2m2l1c1s1r__scalar_acc2); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_2F2M2L4C1S1R__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/2, - /*first_pass_tile=*/2, /*middle_pass_tile=*/2, /*last_pass_tile=*/2, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_2f2m2l4c1s1r__scalar); - })), - [](const testing::TestParamInfo& info) { - return 
info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_2F2M2L4C1S1R__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/2, - /*first_pass_tile=*/2, /*middle_pass_tile=*/2, /*last_pass_tile=*/2, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_2f2m2l4c1s1r__scalar_acc2); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_5F5M5L1C1S1R__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_5f5m5l1c1s1r__scalar); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_5F5M5L1C1S1R__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_5f5m5l1c1s1r__scalar_acc2); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_6F6M7L1C1S1R__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_6f6m7l1c1s1r__scalar); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; +#define XNN_DWCONV_MULTIPASS(arch_flags, ukernel, first_pass_tile, 
middle_pass_tile, last_pass_tile, channel_tile, channel_subtile, channel_round, datatype, weights_type, buffer_type, params_type, init_params)\ +INSTANTIATE_TEST_SUITE_P( \ + ukernel, DWConvTest, \ + testing::ValuesIn(CreateTests( \ + channel_tile, channel_tile, first_pass_tile, \ + first_pass_tile, middle_pass_tile, last_pass_tile, \ + channel_subtile, channel_round, \ + [](DWConvMicrokernelTester& tester) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + tester.Test(ukernel, init_params); \ + })), \ + [](const testing::TestParamInfo& info) { \ + return info.param.test_name; \ }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_6F6M7L1C1S1R__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_6f6m7l1c1s1r__scalar_acc2); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_8F8M9L1C1S1R__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_8f8m9l1c1s1r__scalar); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_8F8M9L1C1S1R__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_8f8m9l1c1s1r__scalar_acc2); - })), - [](const testing::TestParamInfo& info) { - return 
info.param.test_name; - }); - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_5F5M5L4C4S4R__WASMSIMD, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmsimd); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_5F5M5L4C4S4R__WASMSIMD_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmsimd_acc2); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_5F5M5L4C4S4R__WASMRELAXEDSIMD_FMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_5F5M5L4C4S4R__WASMRELAXEDSIMD_FMA_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - 
/*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma_acc2); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD +#include "src/f32-dwconv/f32-dwconv-multipass.h" +#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-dwconv-multipass.yaml b/test/f32-dwconv-multipass.yaml deleted file mode 100644 index 965296ddff8..00000000000 --- a/test/f32-dwconv-multipass.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# Scalar -- name: xnn_f32_dwconv_ukernel_2f2m2l1c1s1r__scalar -- name: xnn_f32_dwconv_ukernel_2f2m2l1c1s1r__scalar_acc2 -- name: xnn_f32_dwconv_ukernel_2f2m2l4c1s1r__scalar -- name: xnn_f32_dwconv_ukernel_2f2m2l4c1s1r__scalar_acc2 -- name: xnn_f32_dwconv_ukernel_5f5m5l1c1s1r__scalar -- name: xnn_f32_dwconv_ukernel_5f5m5l1c1s1r__scalar_acc2 -- name: xnn_f32_dwconv_ukernel_6f6m7l1c1s1r__scalar -- name: xnn_f32_dwconv_ukernel_6f6m7l1c1s1r__scalar_acc2 -- name: xnn_f32_dwconv_ukernel_8f8m9l1c1s1r__scalar -- name: xnn_f32_dwconv_ukernel_8f8m9l1c1s1r__scalar_acc2 - -# Wasm SIMD -- name: xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmsimd -- name: xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmsimd_acc2 - -# Wasm Relaxed SIMD -- name: xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma -- name: xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma_acc2 diff --git a/test/f32-dwconv-unipass.cc b/test/f32-dwconv-unipass.cc index 3d0ecb5b207..db24377ba3b 100644 --- a/test/f32-dwconv-unipass.cc +++ b/test/f32-dwconv-unipass.cc @@ -7,7 +7,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! 
-// Specification: test/f32-dwconv-unipass.yaml +// Microkernel: f32-dwconv-unipass // Generator: tools/generate-dwconv-unipass-test.py @@ -28,10 +28,10 @@ namespace { -std::vector CreateTests1( - size_t c_block, size_t adj_c_block, size_t cr, size_t kr, - std::function test_func, - std::function isa_check = nullptr) { +std::vector CreateTests( + size_t c_block, bool is_pipelined, size_t cr, size_t kr, + std::function test_func) { + size_t adj_c_block = is_pipelined ? c_block * 2 : c_block; const std::string cbs = std::to_string(c_block); const std::string acbs = std::to_string(adj_c_block); @@ -44,8 +44,17 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .channels(c_block) - , test_func, isa_check)); + , test_func)); + if (is_pipelined) { + tests.push_back(DWConvTestParams( + "c_eq_" + std::to_string(c_block * 2), + DWConvMicrokernelTester() + .channel_tile(cr) + .kernel_tile(kr) + .channels(c_block * 2) + , test_func)); + } if (c_block > 1) { tests.push_back(DWConvTestParams( @@ -53,7 +62,7 @@ std::vector CreateTests1( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); @@ -62,7 +71,7 @@ std::vector CreateTests1( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(1, adj_c_block - 1)); } @@ -71,7 +80,7 @@ std::vector CreateTests1( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 
10 : adj_c_block + c_block) - 1)); @@ -81,7 +90,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .width(3) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -90,7 +99,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .width(3) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1)) .loop_step(2, kr)); @@ -101,7 +110,7 @@ std::vector CreateTests1( .kernel_tile(kr) .width(5) .output_stride(xnnpack::NextPrime(cr * 5 + 1)) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); @@ -112,7 +121,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .input_offset(xnnpack::NextPrime(cr + 1) * 16) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -121,7 +130,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .input_offset(xnnpack::NextPrime(cr + 1) * 16) - , test_func, isa_check) + , test_func) .loop_zi(0, kr - 1) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); @@ -130,431 +139,17 @@ std::vector CreateTests1( } // namespace - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_3P4C__WASMSIMD, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_3p4c__wasmsimd); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_3P8C__WASMSIMD, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_f32_dwconv_ukernel_3p8c__wasmsimd); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_4P4C__WASMSIMD, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_4p4c__wasmsimd); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_4P8C__WASMSIMD, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_4p8c__wasmsimd); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_9P4C__WASMSIMD, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_9p4c__wasmsimd); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_9P4C__WASMSIMD_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_9p4c__wasmsimd_acc2); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || 
XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_9P8C__WASMSIMD, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_9p8c__wasmsimd); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_9P8C__WASMSIMD_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_9p8c__wasmsimd_acc2); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_25P4C__WASMSIMD, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_25p4c__wasmsimd); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_25P8C__WASMSIMD, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_25p8c__wasmsimd); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_3P4C__WASMRELAXEDSIMD_FMA, DWConvTest, - testing::ValuesIn(CreateTests1( - 
/*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_3p4c__wasmrelaxedsimd_fma); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_3P8C__WASMRELAXEDSIMD_FMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_3p8c__wasmrelaxedsimd_fma); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_4P4C__WASMRELAXEDSIMD_FMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_4p4c__wasmrelaxedsimd_fma); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_4P8C__WASMRELAXEDSIMD_FMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_4p8c__wasmrelaxedsimd_fma); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_9P4C__WASMRELAXEDSIMD_FMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_9p4c__wasmrelaxedsimd_fma); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // 
XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_9P8C__WASMRELAXEDSIMD_FMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_9p8c__wasmrelaxedsimd_fma); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_25P4C__WASMRELAXEDSIMD_FMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_25p4c__wasmrelaxedsimd_fma); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_25P8C__WASMRELAXEDSIMD_FMA, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_25p8c__wasmrelaxedsimd_fma); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_3P1C__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_3p1c__scalar); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_3P1C__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_3p1c__scalar_acc2); - })), - [](const testing::TestParamInfo& info) { - return 
info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_3P2C__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_3p2c__scalar); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_3P2C__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_3p2c__scalar_acc2); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_4P1C__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_4p1c__scalar); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_4P1C__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_4p1c__scalar_acc2); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_4P2C__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_4p2c__scalar); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; +#define XNN_DWCONV_UNIPASS(arch_flags, ukernel, c_block, is_pipelined, cr, kr, datatype, weights_type, params_type, init_params)\ +INSTANTIATE_TEST_SUITE_P( \ + ukernel, DWConvTest, \ + testing::ValuesIn(CreateTests( \ + c_block, is_pipelined, cr, kr, \ + 
[](DWConvMicrokernelTester& tester) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + tester.Test(ukernel, init_params); \ + })), \ + [](const testing::TestParamInfo& info) { \ + return info.param.test_name; \ }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_4P2C__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_4p2c__scalar_acc2); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_9P1C__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_9p1c__scalar); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_9P1C__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_9p1c__scalar_acc2); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_9P2C__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_9p2c__scalar); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_9P2C__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_9p2c__scalar_acc2); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_25P1C__SCALAR, DWConvTest, - 
testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_25p1c__scalar); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_25P1C__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_25p1c__scalar_acc2); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_25P2C__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_25p2c__scalar); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - F32_DWCONV_25P2C__SCALAR_ACC2, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_f32_dwconv_ukernel_25p2c__scalar_acc2); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); \ No newline at end of file +#include "src/f32-dwconv/f32-dwconv-unipass.h" +#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-dwconv-unipass.yaml b/test/f32-dwconv-unipass.yaml deleted file mode 100644 index ba29d7085d8..00000000000 --- a/test/f32-dwconv-unipass.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2020 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# WAsm SIMD -- name: xnn_f32_dwconv_ukernel_3p4c__wasmsimd -- name: xnn_f32_dwconv_ukernel_3p8c__wasmsimd -- name: xnn_f32_dwconv_ukernel_4p4c__wasmsimd -- name: xnn_f32_dwconv_ukernel_4p8c__wasmsimd -- name: xnn_f32_dwconv_ukernel_9p4c__wasmsimd -- name: xnn_f32_dwconv_ukernel_9p4c__wasmsimd_acc2 -- name: xnn_f32_dwconv_ukernel_9p8c__wasmsimd -- name: xnn_f32_dwconv_ukernel_9p8c__wasmsimd_acc2 -- name: xnn_f32_dwconv_ukernel_25p4c__wasmsimd -- name: xnn_f32_dwconv_ukernel_25p8c__wasmsimd -# WAsm Relaxed SIMD -- name: xnn_f32_dwconv_ukernel_3p4c__wasmrelaxedsimd_fma -- name: xnn_f32_dwconv_ukernel_3p8c__wasmrelaxedsimd_fma -- name: xnn_f32_dwconv_ukernel_4p4c__wasmrelaxedsimd_fma -- name: xnn_f32_dwconv_ukernel_4p8c__wasmrelaxedsimd_fma -- name: xnn_f32_dwconv_ukernel_9p4c__wasmrelaxedsimd_fma -- name: xnn_f32_dwconv_ukernel_9p8c__wasmrelaxedsimd_fma -- name: xnn_f32_dwconv_ukernel_25p4c__wasmrelaxedsimd_fma -- name: xnn_f32_dwconv_ukernel_25p8c__wasmrelaxedsimd_fma -# Scalar -- name: xnn_f32_dwconv_ukernel_3p1c__scalar -- name: xnn_f32_dwconv_ukernel_3p1c__scalar_acc2 -- name: xnn_f32_dwconv_ukernel_3p2c__scalar -- name: xnn_f32_dwconv_ukernel_3p2c__scalar_acc2 -- name: xnn_f32_dwconv_ukernel_4p1c__scalar -- name: xnn_f32_dwconv_ukernel_4p1c__scalar_acc2 -- name: xnn_f32_dwconv_ukernel_4p2c__scalar -- name: xnn_f32_dwconv_ukernel_4p2c__scalar_acc2 -- name: xnn_f32_dwconv_ukernel_9p1c__scalar -- name: xnn_f32_dwconv_ukernel_9p1c__scalar_acc2 -- name: xnn_f32_dwconv_ukernel_9p2c__scalar -- name: xnn_f32_dwconv_ukernel_9p2c__scalar_acc2 -- name: xnn_f32_dwconv_ukernel_25p1c__scalar -- name: xnn_f32_dwconv_ukernel_25p1c__scalar_acc2 -- name: xnn_f32_dwconv_ukernel_25p2c__scalar -- name: xnn_f32_dwconv_ukernel_25p2c__scalar_acc2 diff --git a/test/qs8-dwconv-minmax-multipass-fp32.cc b/test/qs8-dwconv-minmax-multipass-fp32.cc index e6508a6ffee..51e7ee9801a 100644 --- a/test/qs8-dwconv-minmax-multipass-fp32.cc +++ b/test/qs8-dwconv-minmax-multipass-fp32.cc @@ -4,7 +4,7 
@@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! -// Specification: test/qs8-dwconv-minmax-multipass-fp32.yaml +// Microkernel: qs8-dwconv-minmax-multipass-fp32 // Generator: tools/generate-dwconv-multipass-test.py @@ -25,14 +25,13 @@ namespace { -std::vector CreateTests1( - size_t c_block, size_t adj_c_block, size_t cr, size_t kr, +std::vector CreateTests( + size_t c_block, size_t cr, size_t kr, size_t first_pass_tile, size_t middle_pass_tile, size_t last_pass_tile, size_t channel_subtile, size_t channel_round, std::function test_func, std::function isa_check = nullptr) { const std::string cbs = std::to_string(c_block); - const std::string acbs = std::to_string(adj_c_block); std::vector tests; tests.reserve(17); @@ -90,7 +89,7 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + 1) , test_func, isa_check) - .loop_channels(adj_c_block = c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_first_pass_and_last_pass", @@ -103,7 +102,7 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + last_pass_tile) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_multipass", @@ -115,7 +114,7 @@ std::vector CreateTests1( .channel_subtile(channel_subtile) .channel_round(channel_round) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3) + .loop_channels(c_block * 2, cr * 16, cr * 3) .loop_kernel_size( first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -132,7 +131,7 @@ std::vector CreateTests1( .kernel_size(first_pass_tile + last_pass_tile) .qmin(128) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 
2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_with_qmax", @@ -146,11 +145,11 @@ std::vector CreateTests1( .kernel_size(first_pass_tile + last_pass_tile) .qmax(128) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); } tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_first_pass_plus_one", + "c_gt_" + cbs + "_first_pass_plus_one", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -160,10 +159,10 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + 1) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block)); + .loop_channels(c_block + 1, c_block == 1 ? 10 : c_block * 2)); tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_first_pass_and_last_pass", + "c_gt_" + cbs + "_first_pass_and_last_pass", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -173,10 +172,10 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + last_pass_tile) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block)); + .loop_channels(c_block + 1, c_block == 1 ? 10 : c_block * 2)); tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_multipass", + "c_gt_" + cbs + "_multipass", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -185,7 +184,7 @@ std::vector CreateTests1( .channel_subtile(channel_subtile) .channel_round(channel_round) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block) + .loop_channels(c_block + 1, c_block == 1 ? 
10 : c_block * 2) .loop_kernel_size( first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -278,7 +277,7 @@ std::vector CreateTests1( .channel_round(channel_round) .input_offset(xnnpack::NextPrime(cr + 1) * 16) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3) + .loop_channels(c_block * 2, cr * 16, cr * 3) .loop_kernel_size(first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -287,2613 +286,19 @@ std::vector CreateTests1( } // namespace - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neonv8_mul16, - xnn_init_qs8_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mul16, - xnn_init_qs8_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L32C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const 
testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L32C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neonv8_mul16, - xnn_init_qs8_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neonv8_mul16, - 
xnn_init_qs8_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mul16, - xnn_init_qs8_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L32C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - 
[](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L32C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neonv8_mul16, - xnn_init_qs8_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - 
/*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neonv8_mul16, - xnn_init_qs8_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mul16, - xnn_init_qs8_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - 
QS8_DWCONV_MINMAX_FP32_8F8M9L32C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L32C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neonv8_mul16, - xnn_init_qs8_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L8C4S4R__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__sse41_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); 
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__SSE2_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - 
TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__SSE41_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L16C4S4R__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__sse41_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__SSE2_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__SSE41_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, 
/*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L8C4S4R__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__sse41_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__SSE2_MUL16_ADD16, DWConvTest, 
- testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__SSE41_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if 
XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L16C4S4R__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__sse41_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__SSE2_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - 
[](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__SSE41_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L8C4S4R__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__sse41_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__SSE2_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, 
/*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__SSE41_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L16C4S4R__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__sse41_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__SSE2_MUL16, 
DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__SSE2_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || 
XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__SSE41_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L8C4S4R__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__avx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__avx2_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; 
- })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L16C4S4R__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__avx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__avx2_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L16C16S16R__AVX2_MUL16_ADD16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_add16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L16C16S16R__AVX2_MUL16_VPMOVSX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_vpmovsx, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L16C16S16R__AVX2_MUL16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L32C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, 
/*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__avx2_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L32C16S16R__AVX2_MUL16_ADD16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_add16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L32C16S16R__AVX2_MUL16_VPMOVSX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_vpmovsx, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || 
XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L32C16S16R__AVX2_MUL16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L8C4S4R__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__avx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__avx2_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const 
testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L16C4S4R__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__avx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__avx2_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L16C16S16R__AVX2_MUL16_ADD16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_add16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L16C16S16R__AVX2_MUL16_VPMOVSX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_vpmovsx, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L16C16S16R__AVX2_MUL16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L32C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, 
/*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__avx2_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L32C16S16R__AVX2_MUL16_ADD16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_add16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L32C16S16R__AVX2_MUL16_VPMOVSX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_vpmovsx, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || 
XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L32C16S16R__AVX2_MUL16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L8C4S4R__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__avx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__avx2_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const 
testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L16C4S4R__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__avx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__avx2_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L16C16S16R__AVX2_MUL16_ADD16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_add16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L16C16S16R__AVX2_MUL16_VPMOVSX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_vpmovsx, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L16C16S16R__AVX2_MUL16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L32C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, 
/*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__avx2_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L32C16S16R__AVX2_MUL16_ADD16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_add16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L32C16S16R__AVX2_MUL16_VPMOVSX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpmovsx, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || 
XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L32C16S16R__AVX2_MUL16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L16C16S1R__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s1r__avx512skx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L32C16S1R__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s1r__avx512skx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - 
TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L16C16S1R__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s1r__avx512skx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L32C16S1R__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s1r__avx512skx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L16C16S1R__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s1r__avx512skx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L32C16S1R__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s1r__avx512skx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - 
/*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - 
QS8_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || 
XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - 
})), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L1C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L2C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L4C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L1C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, 
/*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L2C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L4C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - 
INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L1C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L2C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L4C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return 
info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L1C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L1C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L1C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L2C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, 
/*adj_c_block=*/2, /*cr=*/2, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L2C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L2C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L4C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L4C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_5F5M5L4C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L1C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; +#define 
XNN_DWCONV_MULTIPASS(arch_flags, ukernel, first_pass_tile, middle_pass_tile, last_pass_tile, channel_tile, channel_subtile, channel_round, datatype, weights_type, buffer_type, params_type, init_params)\ +INSTANTIATE_TEST_SUITE_P( \ + ukernel, DWConvTest, \ + testing::ValuesIn(CreateTests( \ + channel_tile, channel_tile, first_pass_tile, \ + first_pass_tile, middle_pass_tile, last_pass_tile, \ + channel_subtile, channel_round, \ + [](DWConvMicrokernelTester& tester) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + tester.Test(ukernel, init_params, xnn_qs8_requantize_fp32); \ + })), \ + [](const testing::TestParamInfo& info) { \ + return info.param.test_name; \ }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L1C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L1C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L2C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, 
/*cr=*/2, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L2C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L2C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L4C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L4C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_6F6M7L4C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L1C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - 
-INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L1C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L1C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L2C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L2C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, 
- /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L2C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L4C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L4C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - 
[](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_8F8M9L4C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); \ No newline at end of file +#include "src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h" +#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-dwconv-minmax-multipass-fp32.yaml b/test/qs8-dwconv-minmax-multipass-fp32.yaml deleted file mode 100644 index 1b486432ec3..00000000000 --- a/test/qs8-dwconv-minmax-multipass-fp32.yaml +++ /dev/null @@ -1,288 +0,0 @@ -# Copyright 2023 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# ARM NEON -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neon_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neon_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neonv8_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neon_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neon_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neon_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neonv8_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neon_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neon_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neonv8_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neon_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neon_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neon_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neonv8_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neon_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neon_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neonv8_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neon_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neonv8_params -- name: 
xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neon_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neon_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neonv8_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neonv8_params - -# x86 SSE -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__sse41_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__sse41_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__sse41_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: 
xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__sse41_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__sse41_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__sse41_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params - -# x86 
AVX -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__avx_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__avx2_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__avx_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__avx2_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_add16_vpunpck - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_vpmovsx - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_vpunpck - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__avx2_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_add16_vpunpck - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_vpmovsx - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_vpunpck - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__avx_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__avx2_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__avx_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__avx2_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_add16_vpunpck - init: 
xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_vpmovsx - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_vpunpck - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__avx2_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_add16_vpunpck - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_vpmovsx - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_vpunpck - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__avx_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__avx2_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__avx_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__avx2_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_add16_vpunpck - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_vpmovsx - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_vpunpck - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__avx2_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_add16_vpunpck - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: 
xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpmovsx - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpunpck - init: xnn_init_qs8_conv_minmax_fp32_scalar_params - -# x86 AVX512 -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s1r__avx512skx_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s1r__avx512skx_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s1r__avx512skx_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s1r__avx512skx_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s1r__avx512skx_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s1r__avx512skx_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params - -# Wasm SIMD -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: 
xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params - -# Wasm -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__wasm_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__wasm_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__wasm_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__wasm_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__wasm_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__wasm_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__wasm_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__wasm_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__wasm_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params - -# Scalar -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_imagic - init: 
xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_lrintf - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_imagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_lrintf - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_imagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_lrintf - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_imagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_lrintf - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_imagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_lrintf - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_imagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_lrintf - init: 
xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_imagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_lrintf - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_imagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_lrintf - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_imagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_lrintf - init: xnn_init_qs8_conv_minmax_fp32_scalar_params diff --git a/test/qs8-dwconv-minmax-multipass-rndnu.cc b/test/qs8-dwconv-minmax-multipass-rndnu.cc index 690bfa673ce..08efff8519b 100644 --- a/test/qs8-dwconv-minmax-multipass-rndnu.cc +++ b/test/qs8-dwconv-minmax-multipass-rndnu.cc @@ -4,7 +4,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! 
-// Specification: test/qs8-dwconv-minmax-multipass-rndnu.yaml +// Microkernel: qs8-dwconv-minmax-multipass-rndnu // Generator: tools/generate-dwconv-multipass-test.py @@ -25,14 +25,13 @@ namespace { -std::vector CreateTests1( - size_t c_block, size_t adj_c_block, size_t cr, size_t kr, +std::vector CreateTests( + size_t c_block, size_t cr, size_t kr, size_t first_pass_tile, size_t middle_pass_tile, size_t last_pass_tile, size_t channel_subtile, size_t channel_round, std::function test_func, std::function isa_check = nullptr) { const std::string cbs = std::to_string(c_block); - const std::string acbs = std::to_string(adj_c_block); std::vector tests; tests.reserve(17); @@ -90,7 +89,7 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + 1) , test_func, isa_check) - .loop_channels(adj_c_block = c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_first_pass_and_last_pass", @@ -103,7 +102,7 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + last_pass_tile) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_multipass", @@ -115,7 +114,7 @@ std::vector CreateTests1( .channel_subtile(channel_subtile) .channel_round(channel_round) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3) + .loop_channels(c_block * 2, cr * 16, cr * 3) .loop_kernel_size( first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -132,7 +131,7 @@ std::vector CreateTests1( .kernel_size(first_pass_tile + last_pass_tile) .qmin(128) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_with_qmax", @@ -146,11 
+145,11 @@ std::vector CreateTests1( .kernel_size(first_pass_tile + last_pass_tile) .qmax(128) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); } tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_first_pass_plus_one", + "c_gt_" + cbs + "_first_pass_plus_one", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -160,10 +159,10 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + 1) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block)); + .loop_channels(c_block + 1, c_block == 1 ? 10 : c_block * 2)); tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_first_pass_and_last_pass", + "c_gt_" + cbs + "_first_pass_and_last_pass", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -173,10 +172,10 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + last_pass_tile) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block)); + .loop_channels(c_block + 1, c_block == 1 ? 10 : c_block * 2)); tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_multipass", + "c_gt_" + cbs + "_multipass", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -185,7 +184,7 @@ std::vector CreateTests1( .channel_subtile(channel_subtile) .channel_round(channel_round) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block) + .loop_channels(c_block + 1, c_block == 1 ? 
10 : c_block * 2) .loop_kernel_size( first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -278,7 +277,7 @@ std::vector CreateTests1( .channel_round(channel_round) .input_offset(xnnpack::NextPrime(cr + 1) * 16) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3) + .loop_channels(c_block * 2, cr * 16, cr * 3) .loop_kernel_size(first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -287,569 +286,19 @@ std::vector CreateTests1( } // namespace - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_5F5M5L8C8S8R__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mla8_ld64, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_5F5M5L8C8S8R__NEON_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mul8_ld64, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || 
XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_5F5M5L8C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_5F5M5L16C8S8R__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mla8_ld64, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_5F5M5L16C8S8R__NEON_MLA8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mla8_ld128, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - 
TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_5F5M5L16C8S8R__NEON_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul8_ld64, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_5F5M5L16C8S8R__NEON_MUL8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul8_ld128, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_5F5M5L16C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_5F5M5L32C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l32c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_6F6M7L8C8S8R__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mla8_ld64, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_6F6M7L8C8S8R__NEON_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, 
/*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mul8_ld64, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_6F6M7L8C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_6F6M7L16C8S8R__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mla8_ld64, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_6F6M7L16C8S8R__NEON_MLA8_LD128, 
DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mla8_ld128, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_6F6M7L16C8S8R__NEON_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul8_ld64, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_6F6M7L16C8S8R__NEON_MUL8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul8_ld128, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || 
XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_6F6M7L16C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_6F6M7L32C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l32c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_8F8M9L8C8S8R__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mla8_ld64, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - 
TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_8F8M9L8C8S8R__NEON_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mul8_ld64, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_8F8M9L8C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_8F8M9L16C8S8R__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mla8_ld64, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_8F8M9L16C8S8R__NEON_MLA8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mla8_ld128, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_8F8M9L16C8S8R__NEON_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul8_ld64, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_8F8M9L16C8S8R__NEON_MUL8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - 
/*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul8_ld128, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_8F8M9L16C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_8F8M9L32C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l32c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +#define XNN_DWCONV_MULTIPASS(arch_flags, ukernel, first_pass_tile, middle_pass_tile, last_pass_tile, 
channel_tile, channel_subtile, channel_round, datatype, weights_type, buffer_type, params_type, init_params)\ +INSTANTIATE_TEST_SUITE_P( \ + ukernel, DWConvTest, \ + testing::ValuesIn(CreateTests( \ + channel_tile, channel_tile, first_pass_tile, \ + first_pass_tile, middle_pass_tile, last_pass_tile, \ + channel_subtile, channel_round, \ + [](DWConvMicrokernelTester& tester) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + tester.Test(ukernel, init_params, xnn_qs8_requantize_rndnu); \ + })), \ + [](const testing::TestParamInfo& info) { \ + return info.param.test_name; \ + }); +#include "src/qs8-dwconv/qs8-dwconv-minmax-multipass-rndnu.h" +#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-dwconv-minmax-multipass-rndnu.yaml b/test/qs8-dwconv-minmax-multipass-rndnu.yaml deleted file mode 100644 index fbbe9ddfd16..00000000000 --- a/test/qs8-dwconv-minmax-multipass-rndnu.yaml +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright 2023 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# ARM NEON -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mla8_ld64 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mul8_ld64 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mul16 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mla8_ld64 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mla8_ld128 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul8_ld64 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul8_ld128 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul16 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l32c8s8r__neon_mul16 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mla8_ld64 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mul8_ld64 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mul16 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mla8_ld64 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mla8_ld128 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul8_ld64 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul8_ld128 - init: 
xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul16 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l32c8s8r__neon_mul16 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mla8_ld64 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mul8_ld64 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mul16 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mla8_ld64 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mla8_ld128 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul8_ld64 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul8_ld128 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul16 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l32c8s8r__neon_mul16 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params diff --git a/test/qs8-dwconv-minmax-unipass-fp32.cc b/test/qs8-dwconv-minmax-unipass-fp32.cc index cdb94d8a67c..4da93830b13 100644 --- a/test/qs8-dwconv-minmax-unipass-fp32.cc +++ b/test/qs8-dwconv-minmax-unipass-fp32.cc @@ -7,7 +7,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! 
-// Specification: test/qs8-dwconv-minmax-unipass-fp32.yaml +// Microkernel: qs8-dwconv-minmax-unipass-fp32 // Generator: tools/generate-dwconv-unipass-test.py @@ -28,10 +28,10 @@ namespace { -std::vector CreateTests1( - size_t c_block, size_t adj_c_block, size_t cr, size_t kr, - std::function test_func, - std::function isa_check = nullptr) { +std::vector CreateTests( + size_t c_block, bool is_pipelined, size_t cr, size_t kr, + std::function test_func) { + size_t adj_c_block = is_pipelined ? c_block * 2 : c_block; const std::string cbs = std::to_string(c_block); const std::string acbs = std::to_string(adj_c_block); @@ -44,8 +44,17 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .channels(c_block) - , test_func, isa_check)); + , test_func)); + if (is_pipelined) { + tests.push_back(DWConvTestParams( + "c_eq_" + std::to_string(c_block * 2), + DWConvMicrokernelTester() + .channel_tile(cr) + .kernel_tile(kr) + .channels(c_block * 2) + , test_func)); + } if (c_block > 1) { tests.push_back(DWConvTestParams( @@ -53,7 +62,7 @@ std::vector CreateTests1( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -62,7 +71,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -71,7 +80,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -79,7 +88,7 @@ std::vector CreateTests1( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(1, adj_c_block - 1)); } @@ -88,7 +97,7 @@ std::vector CreateTests1( DWConvMicrokernelTester() .channel_tile(cr) 
.kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( @@ -97,7 +106,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( @@ -106,7 +115,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( @@ -115,7 +124,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .width(3) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -124,7 +133,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .width(3) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1)) .loop_step(2, kr)); @@ -135,7 +144,7 @@ std::vector CreateTests1( .kernel_tile(kr) .width(5) .output_stride(xnnpack::NextPrime(cr * 5 + 1)) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -145,7 +154,7 @@ std::vector CreateTests1( .kernel_tile(kr) .width(3) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -155,7 +164,7 @@ std::vector CreateTests1( .kernel_tile(kr) .width(3) .qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); @@ -165,7 +174,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .input_offset(xnnpack::NextPrime(cr + 1) * 16) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block 
+ c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -174,7 +183,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .input_offset(xnnpack::NextPrime(cr + 1) * 16) - , test_func, isa_check) + , test_func) .loop_zi(0, kr - 1) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); @@ -183,1715 +192,17 @@ std::vector CreateTests1( } // namespace - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P8C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__neon_mul16, - xnn_init_qs8_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P8C__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__neonv8_mul16, - xnn_init_qs8_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P16C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__neon_mul16, - xnn_init_qs8_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) 
{ - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P16C__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mul16, - xnn_init_qs8_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P32C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__neon_mul16, - xnn_init_qs8_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P32C__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__neonv8_mul16, - xnn_init_qs8_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P8C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, 
/*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__neon_mul16, - xnn_init_qs8_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P8C__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mul16, - xnn_init_qs8_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P16C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__neon_mul16, - xnn_init_qs8_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P16C__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mul16, - xnn_init_qs8_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const 
testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P32C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__neon_mul16, - xnn_init_qs8_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P32C__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__neonv8_mul16, - xnn_init_qs8_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P8C__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P8C__SSE2_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - 
/*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P8C__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P8C__SSE41_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P8C__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - 
TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P16C__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P16C__SSE2_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P16C__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - 
QS8_DWCONV_MINMAX_FP32_9P16C__SSE41_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P16C__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P8C__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P8C__SSE2_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16_add16, - 
xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P8C__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P8C__SSE41_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P8C__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || 
XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P16C__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P16C__SSE2_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P16C__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P16C__SSE41_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P16C__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P8C__AVX_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P8C__AVX_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif 
// XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P8C__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P8C__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__avx2_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P16C__AVX_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P16C__AVX_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P16C__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P16C__AVX2_MUL16_ADD16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_add16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P16C__AVX2_MUL16_VPMOVSX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_vpmovsx, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return 
info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P16C__AVX2_MUL16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P16C__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P32C__AVX2_MUL16_ADD16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_add16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P32C__AVX2_MUL16_VPMOVSX, DWConvTest, - testing::ValuesIn(CreateTests1( - 
/*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_vpmovsx, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P32C__AVX2_MUL16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P32C__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P8C__AVX_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__avx_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - 
TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P8C__AVX_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__avx_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P8C__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__avx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P8C__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__avx2_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P16C__AVX_MUL16, DWConvTest, - 
testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P16C__AVX_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P16C__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P16C__AVX2_MUL16_ADD16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_add16_vpunpck, - 
xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P16C__AVX2_MUL16_VPMOVSX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_vpmovsx, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P16C__AVX2_MUL16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P16C__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - 
- -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P32C__AVX2_MUL16_ADD16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_add16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P32C__AVX2_MUL16_VPMOVSX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpmovsx, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P32C__AVX2_MUL16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P32C__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, 
- [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P16C__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P32C__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P16C__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx512skx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - 
[](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P32C__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P8C__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P8C__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P16C__WASMSIMD_MUL16, DWConvTest, - 
testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P16C__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P8C__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P8C__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); 
- })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P16C__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P16C__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P1C__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - 
QS8_DWCONV_MINMAX_FP32_9P2C__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P4C__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P1C__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P2C__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P4C__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P1C__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P1C__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P1C__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/9, - 
[](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P2C__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P2C__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P2C__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P4C__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const 
testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P4C__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_9P4C__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P1C__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P1C__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P1C__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - 
/*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P2C__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P2C__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P2C__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P4C__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_fmagic, - 
xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P4C__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_FP32_25P4C__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); \ No newline at end of file +#define XNN_DWCONV_UNIPASS(arch_flags, ukernel, c_block, is_pipelined, cr, kr, datatype, weights_type, params_type, init_params)\ +INSTANTIATE_TEST_SUITE_P( \ + ukernel, DWConvTest, \ + testing::ValuesIn(CreateTests( \ + c_block, is_pipelined, cr, kr, \ + [](DWConvMicrokernelTester& tester) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + tester.Test(ukernel, init_params, xnn_qs8_requantize_fp32); \ + })), \ + [](const testing::TestParamInfo& info) { \ + return info.param.test_name; \ + }); +#include "src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h" +#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-dwconv-minmax-unipass-fp32.yaml b/test/qs8-dwconv-minmax-unipass-fp32.yaml deleted file mode 100644 index d3a579016fa..00000000000 --- a/test/qs8-dwconv-minmax-unipass-fp32.yaml +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright 2021 Google LLC -# -# This source code is licensed under the BSD-style 
license found in the -# LICENSE file in the root directory of this source tree. - -# ARM NEON -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__neon_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neon_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__neonv8_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__neon_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neon_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__neon_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neon_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__neonv8_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__neon_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neon_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__neon_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neon_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__neon_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neon_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__neonv8_mul16 - init: xnn_init_qs8_conv_minmax_fp32_neonv8_params -# x86 SSE -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: 
xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -# x86 AVX -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: 
xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__avx2_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_add16_vpunpck - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_vpmovsx - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_vpunpck - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_add16_vpunpck - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_vpmovsx - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_vpunpck - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__avx_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__avx_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__avx_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- 
name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__avx2_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_add16_vpunpck - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_vpmovsx - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_vpunpck - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_add16_vpunpck - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpmovsx - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpunpck - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -# x86 AVX512 -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx512skx_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -# WAsm SIMD -- name: 
xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16_add16 - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -# WAsm -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__wasm_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__wasm_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__wasm_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__wasm_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -# Scalar -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_imagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_lrintf - init: 
xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_imagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_lrintf - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_lrintf - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_imagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_fmagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_imagic - init: xnn_init_qs8_conv_minmax_fp32_scalar_params -- name: xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_lrintf - init: xnn_init_qs8_conv_minmax_fp32_scalar_params diff --git a/test/qs8-dwconv-minmax-unipass-rndnu.cc b/test/qs8-dwconv-minmax-unipass-rndnu.cc index 52a5b2aab44..13ee096f8b5 100644 --- a/test/qs8-dwconv-minmax-unipass-rndnu.cc +++ 
b/test/qs8-dwconv-minmax-unipass-rndnu.cc @@ -7,7 +7,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! -// Specification: test/qs8-dwconv-minmax-unipass-rndnu.yaml +// Microkernel: qs8-dwconv-minmax-unipass-rndnu // Generator: tools/generate-dwconv-unipass-test.py @@ -28,10 +28,10 @@ namespace { -std::vector CreateTests1( - size_t c_block, size_t adj_c_block, size_t cr, size_t kr, - std::function test_func, - std::function isa_check = nullptr) { +std::vector CreateTests( + size_t c_block, bool is_pipelined, size_t cr, size_t kr, + std::function test_func) { + size_t adj_c_block = is_pipelined ? c_block * 2 : c_block; const std::string cbs = std::to_string(c_block); const std::string acbs = std::to_string(adj_c_block); @@ -44,8 +44,17 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .channels(c_block) - , test_func, isa_check)); + , test_func)); + if (is_pipelined) { + tests.push_back(DWConvTestParams( + "c_eq_" + std::to_string(c_block * 2), + DWConvMicrokernelTester() + .channel_tile(cr) + .kernel_tile(kr) + .channels(c_block * 2) + , test_func)); + } if (c_block > 1) { tests.push_back(DWConvTestParams( @@ -53,7 +62,7 @@ std::vector CreateTests1( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -62,7 +71,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -71,7 +80,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -79,7 +88,7 @@ std::vector CreateTests1( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , 
test_func, isa_check) + , test_func) .loop_channels(1, adj_c_block - 1)); } @@ -88,7 +97,7 @@ std::vector CreateTests1( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( @@ -97,7 +106,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( @@ -106,7 +115,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( @@ -115,7 +124,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .width(3) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -124,7 +133,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .width(3) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1)) .loop_step(2, kr)); @@ -135,7 +144,7 @@ std::vector CreateTests1( .kernel_tile(kr) .width(5) .output_stride(xnnpack::NextPrime(cr * 5 + 1)) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -145,7 +154,7 @@ std::vector CreateTests1( .kernel_tile(kr) .width(3) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -155,7 +164,7 @@ std::vector CreateTests1( .kernel_tile(kr) .width(3) .qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); @@ -165,7 +174,7 @@ std::vector 
CreateTests1( .channel_tile(cr) .kernel_tile(kr) .input_offset(xnnpack::NextPrime(cr + 1) * 16) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -174,7 +183,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .input_offset(xnnpack::NextPrime(cr + 1) * 16) - , test_func, isa_check) + , test_func) .loop_zi(0, kr - 1) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); @@ -183,384 +192,17 @@ std::vector CreateTests1( } // namespace - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_9P8C__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mla8_ld64, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_9P8C__NEON_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mul8_ld64, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_9P8C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_9P16C__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mla8_ld64, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_9P16C__NEON_MLA8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mla8_ld128, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_9P16C__NEON_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8_ld64, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return 
info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_9P16C__NEON_MUL8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8_ld128, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_9P16C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_9P32C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p32c__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_25P8C__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, 
/*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mla8_ld64, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_25P8C__NEON_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul8_ld64, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_25P8C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_25P16C__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mla8_ld64, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const 
testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_25P16C__NEON_MLA8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mla8_ld128, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_25P16C__NEON_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul8_ld64, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_25P16C__NEON_MUL8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul8_ld128, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_25P16C__NEON_MUL16, DWConvTest, - 
testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_25P32C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_25p32c__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_neon_params, - xnn_qs8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_9P1C__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p1c__scalar, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - xnn_qs8_requantize_rndnu); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; +#define XNN_DWCONV_UNIPASS(arch_flags, ukernel, c_block, is_pipelined, cr, kr, datatype, weights_type, params_type, init_params)\ +INSTANTIATE_TEST_SUITE_P( \ + ukernel, DWConvTest, \ + testing::ValuesIn(CreateTests( \ + c_block, is_pipelined, cr, kr, \ + [](DWConvMicrokernelTester& tester) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + tester.Test(ukernel, init_params, xnn_qs8_requantize_rndnu); \ + })), \ + [](const testing::TestParamInfo& info) { \ + return info.param.test_name; \ }); - -INSTANTIATE_TEST_SUITE_P( - 
QS8_DWCONV_MINMAX_RNDNU_9P2C__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p2c__scalar, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - xnn_qs8_requantize_rndnu); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_DWCONV_MINMAX_RNDNU_9P4C__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_dwconv_minmax_rndnu_ukernel_9p4c__scalar, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - xnn_qs8_requantize_rndnu); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); \ No newline at end of file +#include "src/qs8-dwconv/qs8-dwconv-minmax-unipass-rndnu.h" +#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-dwconv-minmax-unipass-rndnu.yaml b/test/qs8-dwconv-minmax-unipass-rndnu.yaml deleted file mode 100644 index 5e1d1bc5078..00000000000 --- a/test/qs8-dwconv-minmax-unipass-rndnu.yaml +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2021 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# ARM NEON -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mla8_ld64 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mul8_ld64 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mul16 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mla8_ld64 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mla8_ld128 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8_ld64 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8_ld128 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul16 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_9p32c__neon_mul16 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mla8_ld64 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul8_ld64 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul16 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mla8_ld64 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mla8_ld128 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul8_ld64 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul8_ld128 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul16 - init: 
xnn_init_qs8_conv_minmax_rndnu_neon_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_25p32c__neon_mul16 - init: xnn_init_qs8_conv_minmax_rndnu_neon_params -# Scalar -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_9p1c__scalar - init: xnn_init_qs8_conv_minmax_rndnu_scalar_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_9p2c__scalar - init: xnn_init_qs8_conv_minmax_rndnu_scalar_params -- name: xnn_qs8_dwconv_minmax_rndnu_ukernel_9p4c__scalar - init: xnn_init_qs8_conv_minmax_rndnu_scalar_params diff --git a/test/qs8-qc8w-dwconv-minmax-multipass-fp32.cc b/test/qs8-qc8w-dwconv-minmax-multipass-fp32.cc index c6ca56a4f03..5ae7ca8f286 100644 --- a/test/qs8-qc8w-dwconv-minmax-multipass-fp32.cc +++ b/test/qs8-qc8w-dwconv-minmax-multipass-fp32.cc @@ -4,7 +4,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! -// Specification: test/qs8-qc8w-dwconv-minmax-multipass-fp32.yaml +// Microkernel: qs8-qc8w-dwconv-minmax-multipass-fp32 // Generator: tools/generate-dwconv-multipass-test.py @@ -25,14 +25,13 @@ namespace { -std::vector CreateTests1( - size_t c_block, size_t adj_c_block, size_t cr, size_t kr, +std::vector CreateTests( + size_t c_block, size_t cr, size_t kr, size_t first_pass_tile, size_t middle_pass_tile, size_t last_pass_tile, size_t channel_subtile, size_t channel_round, std::function test_func, std::function isa_check = nullptr) { const std::string cbs = std::to_string(c_block); - const std::string acbs = std::to_string(adj_c_block); std::vector tests; tests.reserve(17); @@ -90,7 +89,7 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + 1) , test_func, isa_check) - .loop_channels(adj_c_block = c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_first_pass_and_last_pass", @@ -103,7 +102,7 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + last_pass_tile) , 
test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_multipass", @@ -115,15 +114,42 @@ std::vector CreateTests1( .channel_subtile(channel_subtile) .channel_round(channel_round) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3) + .loop_channels(c_block * 2, cr * 16, cr * 3) .loop_kernel_size( first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); + tests.push_back(DWConvTestParams( + "c_div_" + cbs + "_with_qmin", + DWConvMicrokernelTester() + .first_pass_tile(first_pass_tile) + .middle_pass_tile(middle_pass_tile) + .last_pass_tile(last_pass_tile) + .channel_tile(cr) + .channel_subtile(channel_subtile) + .channel_round(channel_round) + .kernel_size(first_pass_tile + last_pass_tile) + .qmin(128) + , test_func, isa_check) + .loop_channels(c_block * 2, cr * 16, cr * 3)); + + tests.push_back(DWConvTestParams( + "c_div_" + cbs + "_with_qmax", + DWConvMicrokernelTester() + .first_pass_tile(first_pass_tile) + .middle_pass_tile(middle_pass_tile) + .last_pass_tile(last_pass_tile) + .channel_tile(cr) + .channel_subtile(channel_subtile) + .channel_round(channel_round) + .kernel_size(first_pass_tile + last_pass_tile) + .qmax(128) + , test_func, isa_check) + .loop_channels(c_block * 2, cr * 16, cr * 3)); } tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_first_pass_plus_one", + "c_gt_" + cbs + "_first_pass_plus_one", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -133,10 +159,10 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + 1) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block)); + .loop_channels(c_block + 1, c_block == 1 ? 
10 : c_block * 2)); tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_first_pass_and_last_pass", + "c_gt_" + cbs + "_first_pass_and_last_pass", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -146,10 +172,10 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + last_pass_tile) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block)); + .loop_channels(c_block + 1, c_block == 1 ? 10 : c_block * 2)); tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_multipass", + "c_gt_" + cbs + "_multipass", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -158,7 +184,7 @@ std::vector CreateTests1( .channel_subtile(channel_subtile) .channel_round(channel_round) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block) + .loop_channels(c_block + 1, c_block == 1 ? 10 : c_block * 2) .loop_kernel_size( first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -251,7 +277,7 @@ std::vector CreateTests1( .channel_round(channel_round) .input_offset(xnnpack::NextPrime(cr + 1) * 16) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3) + .loop_channels(c_block * 2, cr * 16, cr * 3) .loop_kernel_size(first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -260,3369 +286,19 @@ std::vector CreateTests1( } // namespace - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neon_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__NEON_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neon_mul8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neon_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__NEONV8_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - 
/*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neonv8_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__NEONV8_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neonv8_mul8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neonv8_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - 
INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__NEON_MLA8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mla8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__NEON_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mul8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; 
- })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__NEON_MUL8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mul8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__NEONV8_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__NEONV8_MLA8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mla8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__NEONV8_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mul8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__NEONV8_MUL8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, 
/*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mul8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L32C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neon_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM 
|| XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L32C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neonv8_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neon_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__NEON_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neon_mul8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - 
TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neon_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__NEONV8_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neonv8_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__NEONV8_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neonv8_mul8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neonv8_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__NEON_MLA8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, 
/*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mla8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__NEON_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mul8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__NEON_MUL8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mul8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 
- INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__NEONV8_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__NEONV8_MLA8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mla8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - 
TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__NEONV8_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mul8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__NEONV8_MUL8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mul8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - 
[](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L32C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neon_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L32C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neonv8_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - 
/*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neon_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__NEON_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neon_mul8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neon_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || 
XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__NEONV8_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neonv8_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__NEONV8_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neonv8_mul8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neonv8_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - 
TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__NEON_MLA8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mla8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__NEON_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mul8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__NEON_MUL8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mul8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__NEONV8_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, 
/*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__NEONV8_MLA8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mla8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__NEONV8_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mul8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || 
XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__NEONV8_MUL8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mul8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L32C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neon_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - 
TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L32C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neonv8_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L8C4S4R__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__sse41_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__SSE2_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__SSE41_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, 
/*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C4S4R__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__sse41_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - 
INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__SSE2_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__SSE41_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - 
TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L8C4S4R__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__sse41_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__SSE2_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__SSE41_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C4S4R__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, 
/*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__sse41_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__SSE2_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || 
XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__SSE41_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L8C4S4R__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__sse41_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - 
TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__SSE2_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__SSE41_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C4S4R__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__sse41_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, 
/*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__SSE2_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || 
XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__SSE41_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L8C4S4R__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__avx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__avx2_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - 
TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C4S4R__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__avx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__avx2_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C16S16R__AVX2_MUL16_ADD16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_add16_vpunpck, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C16S16R__AVX2_MUL16_VPMOVSX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_vpmovsx, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C16S16R__AVX2_MUL16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_vpunpck, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L32C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( 
- /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__avx2_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L32C16S16R__AVX2_MUL16_ADD16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_add16_vpunpck, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L32C16S16R__AVX2_MUL16_VPMOVSX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_vpmovsx, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); 
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L32C16S16R__AVX2_MUL16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_vpunpck, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L8C4S4R__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__avx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__avx2_mul32, - 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C4S4R__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__avx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__avx2_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C16S16R__AVX2_MUL16_ADD16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, 
/*last_pass_tile=*/7, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_add16_vpunpck, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C16S16R__AVX2_MUL16_VPMOVSX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_vpmovsx, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C16S16R__AVX2_MUL16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_vpunpck, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - 
INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L32C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__avx2_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L32C16S16R__AVX2_MUL16_ADD16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_add16_vpunpck, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L32C16S16R__AVX2_MUL16_VPMOVSX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_vpmovsx, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); 
- }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L32C16S16R__AVX2_MUL16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_vpunpck, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L8C4S4R__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__avx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& 
tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__avx2_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C4S4R__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__avx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__avx2_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C16S16R__AVX2_MUL16_ADD16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, 
/*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_add16_vpunpck, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C16S16R__AVX2_MUL16_VPMOVSX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_vpmovsx, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C16S16R__AVX2_MUL16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_vpunpck, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // 
XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L32C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__avx2_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L32C16S16R__AVX2_MUL16_ADD16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_add16_vpunpck, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L32C16S16R__AVX2_MUL16_VPMOVSX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpmovsx, - 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L32C16S16R__AVX2_MUL16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/16, /*channel_round=*/16, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpunpck, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C16S1R__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c16s1r__avx512skx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L32C16S1R__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, 
/*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c16s1r__avx512skx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C16S1R__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c16s1r__avx512skx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L32C16S1R__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c16s1r__avx512skx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - 
INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C16S1R__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c16s1r__avx512skx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L32C16S1R__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s1r__avx512skx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - 
})), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - 
/*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - 
QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return 
info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L1C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L2C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L4C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L1C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L2C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L4C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( 
- /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L1C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L2C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || 
XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L4C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L1C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L1C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - 
QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L1C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L2C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; +#define XNN_DWCONV_MULTIPASS(arch_flags, ukernel, first_pass_tile, middle_pass_tile, last_pass_tile, channel_tile, channel_subtile, channel_round, datatype, weights_type, buffer_type, params_type, init_params)\ +INSTANTIATE_TEST_SUITE_P( \ + ukernel, DWConvTest, \ + testing::ValuesIn(CreateTests( \ + channel_tile, channel_tile, first_pass_tile, \ + first_pass_tile, middle_pass_tile, last_pass_tile, \ + channel_subtile, channel_round, \ + [](DWConvMicrokernelTester& tester) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + tester.Test(ukernel, init_params, xnn_qs8_requantize_fp32); \ + })), \ + [](const testing::TestParamInfo& info) { \ + return info.param.test_name; \ }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L2C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, 
/*adj_c_block=*/2, /*cr=*/2, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L2C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L4C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L4C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_5F5M5L4C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L1C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L1C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) 
{ - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L1C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L2C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L2C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L2C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, 
/*cr=*/2, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L4C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L4C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_6F6M7L4C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L1C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L1C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L1C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) 
{ - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L2C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L2C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L2C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L4C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, 
/*cr=*/4, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L4C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_8F8M9L4C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); \ No newline at end of file +#include "src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h" +#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-qc8w-dwconv-minmax-multipass-fp32.yaml b/test/qs8-qc8w-dwconv-minmax-multipass-fp32.yaml deleted file mode 100644 index 25e2df3b66d..00000000000 --- a/test/qs8-qc8w-dwconv-minmax-multipass-fp32.yaml +++ /dev/null @@ -1,360 
+0,0 @@ -# Copyright 2023 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# ARM NEON -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neon_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neon_mul8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neon_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neonv8_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neonv8_mul8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neonv8_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mla8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mul8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mul8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mla8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mul8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mul8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neon_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neonv8_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neon_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neon_mul8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neon_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neonv8_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neonv8_mul8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neonv8_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mla8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mul8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mul8_ld128 - init: 
xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mla8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mul8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mul8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neon_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neonv8_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neon_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neon_mul8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neon_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neonv8_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neonv8_mul8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neonv8_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mla8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mul8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mul8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mla8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mul8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mul8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neon_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neonv8_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params - -# x86 SSE -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__sse41_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16_add16 - init: 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__sse41_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__sse41_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__sse41_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__sse41_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__sse41_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params - -# x86 AVX -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__avx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__avx2_mul32 - init: 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__avx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__avx2_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_add16_vpunpck - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_vpmovsx - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_vpunpck - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__avx2_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_add16_vpunpck - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_vpmovsx - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_vpunpck - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__avx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__avx2_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__avx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__avx2_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_add16_vpunpck - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_vpmovsx - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_vpunpck - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__avx2_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_add16_vpunpck - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_vpmovsx - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_vpunpck - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__avx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__avx2_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__avx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__avx2_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_add16_vpunpck - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_vpmovsx - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_vpunpck - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__avx2_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_add16_vpunpck - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpmovsx - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpunpck - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params - -# x86 AVX512 -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c16s1r__avx512skx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c16s1r__avx512skx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c16s1r__avx512skx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c16s1r__avx512skx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c16s1r__avx512skx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s1r__avx512skx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params - -# Wasm SIMD -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params - -# Wasm -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__wasm_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__wasm_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__wasm_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__wasm_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__wasm_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__wasm_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__wasm_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__wasm_fmagic - init: 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__wasm_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params - -# Scalar -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_imagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_lrintf - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_imagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_lrintf - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_imagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_lrintf - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_imagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_lrintf - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_imagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_lrintf - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_imagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_lrintf - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_imagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_lrintf - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_imagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_lrintf - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_imagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_lrintf - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params diff --git a/test/qs8-qc8w-dwconv-minmax-unipass-fp32.cc b/test/qs8-qc8w-dwconv-minmax-unipass-fp32.cc index 
ec0c9d28581..95f334d0e22 100644 --- a/test/qs8-qc8w-dwconv-minmax-unipass-fp32.cc +++ b/test/qs8-qc8w-dwconv-minmax-unipass-fp32.cc @@ -7,7 +7,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! -// Specification: test/qs8-qc8w-dwconv-minmax-unipass-fp32.yaml +// Microkernel: qs8-qc8w-dwconv-minmax-unipass-fp32 // Generator: tools/generate-dwconv-unipass-test.py @@ -28,10 +28,10 @@ namespace { -std::vector CreateTests1( - size_t c_block, size_t adj_c_block, size_t cr, size_t kr, - std::function test_func, - std::function isa_check = nullptr) { +std::vector CreateTests( + size_t c_block, bool is_pipelined, size_t cr, size_t kr, + std::function test_func) { + size_t adj_c_block = is_pipelined ? c_block * 2 : c_block; const std::string cbs = std::to_string(c_block); const std::string acbs = std::to_string(adj_c_block); @@ -44,8 +44,17 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .channels(c_block) - , test_func, isa_check)); + , test_func)); + if (is_pipelined) { + tests.push_back(DWConvTestParams( + "c_eq_" + std::to_string(c_block * 2), + DWConvMicrokernelTester() + .channel_tile(cr) + .kernel_tile(kr) + .channels(c_block * 2) + , test_func)); + } if (c_block > 1) { tests.push_back(DWConvTestParams( @@ -53,16 +62,33 @@ std::vector CreateTests1( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); + tests.push_back(DWConvTestParams( + "c_div_" + cbs + "_with_qmin", + DWConvMicrokernelTester() + .channel_tile(cr) + .kernel_tile(kr) + .qmin(128) + , test_func) + .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); + + tests.push_back(DWConvTestParams( + "c_div_" + cbs + "_with_qmax", + DWConvMicrokernelTester() + .channel_tile(cr) + .kernel_tile(kr) + .qmax(128) + , test_func) + .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( "c_lt_" + 
acbs, DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(1, adj_c_block - 1)); } @@ -71,9 +97,26 @@ std::vector CreateTests1( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); + tests.push_back(DWConvTestParams( + "c_gt_" + acbs + "_with_qmin", + DWConvMicrokernelTester() + .channel_tile(cr) + .kernel_tile(kr) + .qmin(128) + , test_func) + .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); + + tests.push_back(DWConvTestParams( + "c_gt_" + acbs + "_with_qmax", + DWConvMicrokernelTester() + .channel_tile(cr) + .kernel_tile(kr) + .qmax(128) + , test_func) + .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( "multipixel", @@ -81,7 +124,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .width(3) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -90,7 +133,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .width(3) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1)) .loop_step(2, kr)); @@ -101,9 +144,28 @@ std::vector CreateTests1( .kernel_tile(kr) .width(5) .output_stride(xnnpack::NextPrime(cr * 5 + 1)) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); + tests.push_back(DWConvTestParams( + "multipixel_with_qmin", + DWConvMicrokernelTester() + .channel_tile(cr) + .kernel_tile(kr) + .width(3) + .qmin(128) + , test_func) + .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); + + tests.push_back(DWConvTestParams( + "multipixel_with_qmax", + DWConvMicrokernelTester() + .channel_tile(cr) + .kernel_tile(kr) + .width(3) + .qmax(128) + , 
test_func) + .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -112,7 +174,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .input_offset(xnnpack::NextPrime(cr + 1) * 16) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -121,7 +183,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .input_offset(xnnpack::NextPrime(cr + 1) * 16) - , test_func, isa_check) + , test_func) .loop_zi(0, kr - 1) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); @@ -130,2521 +192,17 @@ std::vector CreateTests1( } // namespace - -#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_3P8C__ASM_AARCH32_NEONV8_MLA8_CORTEX_A35, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__asm_aarch32_neonv8_mla8_cortex_a35, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_3P8C__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__neon_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - 
QS8_QC8W_DWCONV_MINMAX_FP32_3P8C__NEONV8_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__neonv8_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_3P16C__ASM_AARCH32_NEONV8_MLA8_CORTEX_A35, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__asm_aarch32_neonv8_mla8_cortex_a35, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_3P16C__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neon_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_3P16C__NEON_MLA8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - 
[](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neon_mla8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_3P16C__NEONV8_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neonv8_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_3P16C__NEONV8_MLA8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neonv8_mla8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_4P8C__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_4p8c__neon_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - 
TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P8C__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neon_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P8C__NEON_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neon_mul8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P8C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neon_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - 
QS8_QC8W_DWCONV_MINMAX_FP32_9P8C__NEONV8_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neonv8_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P8C__NEONV8_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neonv8_mul8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P8C__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neonv8_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__NEON_MLA8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mla8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__NEON_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mul8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__NEON_MUL8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mul8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const 
testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__NEONV8_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__NEONV8_MLA8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mla8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - 
QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__NEONV8_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mul8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__NEONV8_MUL8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mul8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P32C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__neon_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P32C__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__neonv8_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P8C__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neon_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P8C__NEON_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neon_mul8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& 
info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P8C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neon_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P8C__NEONV8_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P8C__NEONV8_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mul8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P8C__NEONV8_MUL16, DWConvTest, - 
testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__NEON_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__NEON_MLA8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mla8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__NEON_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mul8_ld64, - 
xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__NEON_MUL8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mul8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__NEONV8_MLA8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mla8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // 
XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__NEONV8_MLA8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mla8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__NEONV8_MUL8_LD64, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mul8_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__NEONV8_MUL8_LD128, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mul8_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__NEONV8_MUL16, DWConvTest, - 
testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P32C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__neon_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P32C__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__neonv8_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_3P8C__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__sse2_mul16, - 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_3P8C__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__sse41_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P8C__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P8C__SSE2_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || 
XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P8C__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P8C__SSE41_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P8C__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, 
/*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__SSE2_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__SSE41_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - 
[]() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P8C__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P8C__SSE2_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - 
INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P8C__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P8C__SSE41_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P8C__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__SSE2_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__SSE41_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - 
[](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_3P16C__AVX_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__avx_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_3P16C__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__avx2_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - 
QS8_QC8W_DWCONV_MINMAX_FP32_9P8C__AVX_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P8C__AVX_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P8C__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__avx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P8C__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__avx2_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__AVX_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__AVX_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) 
{ - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__AVX2_MUL16_ADD16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_add16_vpunpck, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__AVX2_MUL16_VPMOVSX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_vpmovsx, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__AVX2_MUL16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_vpunpck, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - 
QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P32C__AVX2_MUL16_ADD16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_add16_vpunpck, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P32C__AVX2_MUL16_VPMOVSX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_vpmovsx, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P32C__AVX2_MUL16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) 
{ - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_vpunpck, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P32C__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P8C__AVX_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__avx_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P8C__AVX_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__avx_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const 
testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P8C__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__avx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P8C__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__avx2_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__AVX_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__AVX_MUL16_ADD16, DWConvTest, 
- testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__AVX2_MUL16_ADD16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_add16_vpunpck, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__AVX2_MUL16_VPMOVSX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_vpmovsx, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__AVX2_MUL16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_vpunpck, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P32C__AVX2_MUL16_ADD16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_add16_vpunpck, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - 
TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P32C__AVX2_MUL16_VPMOVSX, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpmovsx, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P32C__AVX2_MUL16_VPUNPCK, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpunpck, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P32C__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - 
INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_3P32C__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p32c__avx512skx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P32C__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - 
[](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx512skx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P32C__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_3P16C__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__wasmsimd_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P8C__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - 
xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P8C__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P16C__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - 
INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P8C__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P8C__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P16C__WASMSIMD_MUL16_ADD16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16_add16, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_3P2C__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P1C__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P2C__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const 
testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P4C__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P1C__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P2C__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || 
XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P4C__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_3P1C__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p1c__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_3P2C__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_3P2C__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/3, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - 
-INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_4P2C__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_4p2c__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P1C__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P1C__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P1C__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P2C__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, 
/*adj_c_block=*/2, /*cr=*/2, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P2C__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P2C__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P4C__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P4C__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_9P4C__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P1C__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P1C__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P1C__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - 
xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P2C__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P2C__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P2C__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P4C__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - 
-INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P4C__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QS8_QC8W_DWCONV_MINMAX_FP32_25P4C__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - xnn_qs8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); \ No newline at end of file +#define XNN_DWCONV_UNIPASS(arch_flags, ukernel, c_block, is_pipelined, cr, kr, datatype, weights_type, params_type, init_params)\ +INSTANTIATE_TEST_SUITE_P( \ + ukernel, DWConvTest, \ + testing::ValuesIn(CreateTests( \ + c_block, is_pipelined, cr, kr, \ + [](DWConvMicrokernelTester& tester) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + tester.Test(ukernel, init_params, xnn_qs8_requantize_fp32); \ + })), \ + [](const testing::TestParamInfo& info) { \ + return info.param.test_name; \ + }); +#include "src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h" +#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-qc8w-dwconv-minmax-unipass-fp32.yaml b/test/qs8-qc8w-dwconv-minmax-unipass-fp32.yaml deleted file mode 100644 index a888f1ca210..00000000000 --- a/test/qs8-qc8w-dwconv-minmax-unipass-fp32.yaml +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright 2021 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# ARM NEON -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__asm_aarch32_neonv8_mla8_cortex_a35 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__neon_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__neonv8_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__asm_aarch32_neonv8_mla8_cortex_a35 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neon_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neon_mla8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neonv8_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neonv8_mla8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_4p8c__neon_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neon_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neon_mul8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neon_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neonv8_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neonv8_mul8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__neonv8_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mla8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mul8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mul8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mla8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mul8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mul8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__neon_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__neonv8_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neon_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neon_mul8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neon_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mla8_ld64 - init: 
xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mul8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mla8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mul8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mul8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mla8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mla8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mul8_ld64 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mul8_ld128 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__neon_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__neonv8_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params -# x86 SSE -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__sse2_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__sse41_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul32 - init: 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -# x86 AVX -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__avx_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__avx2_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__avx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__avx2_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_add16_vpunpck - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_vpmovsx - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_vpunpck - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_add16_vpunpck - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_vpmovsx - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_vpunpck - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__avx_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__avx_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__avx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__avx2_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_add16_vpunpck - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_vpmovsx - 
init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul16_vpunpck - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_add16_vpunpck - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpmovsx - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpunpck - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -# x86 AVX512 -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p32c__avx512skx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx512skx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -# WAsm SIMD -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__wasmsimd_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16 - init: 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16_add16 - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -# WAsm -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__wasm_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__wasm_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__wasm_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__wasm_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__wasm_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -# Scalar -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p1c__scalar_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__scalar_imagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__scalar_lrintf - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: 
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_4p2c__scalar_imagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__scalar_imagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__scalar_lrintf - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__scalar_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__scalar_imagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__scalar_lrintf - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_lrintf - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_imagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf - init: 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__scalar_fmagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__scalar_imagic - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params -- name: xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__scalar_lrintf - init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params diff --git a/test/qu8-dwconv-minmax-multipass-fp32.cc b/test/qu8-dwconv-minmax-multipass-fp32.cc index a77e4306efc..29dd4cdf532 100644 --- a/test/qu8-dwconv-minmax-multipass-fp32.cc +++ b/test/qu8-dwconv-minmax-multipass-fp32.cc @@ -4,7 +4,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! -// Specification: test/qu8-dwconv-minmax-multipass-fp32.yaml +// Microkernel: qu8-dwconv-minmax-multipass-fp32 // Generator: tools/generate-dwconv-multipass-test.py @@ -25,14 +25,13 @@ namespace { -std::vector CreateTests1( - size_t c_block, size_t adj_c_block, size_t cr, size_t kr, +std::vector CreateTests( + size_t c_block, size_t cr, size_t kr, size_t first_pass_tile, size_t middle_pass_tile, size_t last_pass_tile, size_t channel_subtile, size_t channel_round, std::function test_func, std::function isa_check = nullptr) { const std::string cbs = std::to_string(c_block); - const std::string acbs = std::to_string(adj_c_block); std::vector tests; tests.reserve(17); @@ -90,7 +89,7 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + 1) , test_func, isa_check) - .loop_channels(adj_c_block = c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_first_pass_and_last_pass", @@ -103,7 +102,7 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + last_pass_tile) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block 
* 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_multipass", @@ -115,7 +114,7 @@ std::vector CreateTests1( .channel_subtile(channel_subtile) .channel_round(channel_round) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3) + .loop_channels(c_block * 2, cr * 16, cr * 3) .loop_kernel_size( first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -132,7 +131,7 @@ std::vector CreateTests1( .kernel_size(first_pass_tile + last_pass_tile) .qmin(128) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_with_qmax", @@ -146,11 +145,11 @@ std::vector CreateTests1( .kernel_size(first_pass_tile + last_pass_tile) .qmax(128) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); } tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_first_pass_plus_one", + "c_gt_" + cbs + "_first_pass_plus_one", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -160,10 +159,10 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + 1) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block)); + .loop_channels(c_block + 1, c_block == 1 ? 10 : c_block * 2)); tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_first_pass_and_last_pass", + "c_gt_" + cbs + "_first_pass_and_last_pass", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -173,10 +172,10 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + last_pass_tile) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block)); + .loop_channels(c_block + 1, c_block == 1 ? 
10 : c_block * 2)); tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_multipass", + "c_gt_" + cbs + "_multipass", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -185,7 +184,7 @@ std::vector CreateTests1( .channel_subtile(channel_subtile) .channel_round(channel_round) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block) + .loop_channels(c_block + 1, c_block == 1 ? 10 : c_block * 2) .loop_kernel_size( first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -278,7 +277,7 @@ std::vector CreateTests1( .channel_round(channel_round) .input_offset(xnnpack::NextPrime(cr + 1) * 16) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3) + .loop_channels(c_block * 2, cr * 16, cr * 3) .loop_kernel_size(first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -287,1875 +286,19 @@ std::vector CreateTests1( } // namespace - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_fp32_neon_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - 
/*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neonv8_mul16, - xnn_init_qu8_conv_minmax_fp32_neonv8_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_fp32_neon_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mul16, - xnn_init_qu8_conv_minmax_fp32_neonv8_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - 
QU8_DWCONV_MINMAX_FP32_5F5M5L32C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_fp32_neon_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L32C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neonv8_mul16, - xnn_init_qu8_conv_minmax_fp32_neonv8_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_fp32_neon_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif 
// XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neonv8_mul16, - xnn_init_qu8_conv_minmax_fp32_neonv8_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_fp32_neon_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mul16, - xnn_init_qu8_conv_minmax_fp32_neonv8_params, - xnn_qu8_requantize_fp32); - }, - []() { - 
TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L32C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_fp32_neon_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L32C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neonv8_mul16, - xnn_init_qu8_conv_minmax_fp32_neonv8_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_fp32_neon_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neonv8_mul16, - xnn_init_qu8_conv_minmax_fp32_neonv8_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_fp32_neon_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, 
/*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mul16, - xnn_init_qu8_conv_minmax_fp32_neonv8_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L32C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_fp32_neon_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L32C8S8R__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neonv8_mul16, - xnn_init_qu8_conv_minmax_fp32_neonv8_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L8C4S4R__SSE41_MUL32, DWConvTest, - 
testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__sse41_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || 
XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L16C4S4R__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__sse41_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const 
testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L8C4S4R__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__sse41_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16, - 
xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L16C4S4R__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__sse41_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, 
- [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L8C4S4R__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__sse41_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - 
/*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L16C4S4R__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__sse41_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - 
QU8_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L8C4S4R__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__avx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__avx2_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // 
XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L16C4S4R__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__avx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__avx2_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L32C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__avx2_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - 
TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L8C4S4R__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__avx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__avx2_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L16C4S4R__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__avx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__avx2_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L32C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__avx2_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L8C4S4R__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, 
/*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__avx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__avx2_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L16C4S4R__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/4, /*channel_round=*/4, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__avx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - 
/*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__avx2_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L32C8S8R__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__avx2_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L16C16S1R__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s1r__avx512skx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || 
XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L32C16S1R__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s1r__avx512skx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L16C16S1R__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s1r__avx512skx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L32C16S1R__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s1r__avx512skx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - 
TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L16C16S1R__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s1r__avx512skx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L32C16S1R__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/16, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s1r__avx512skx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L8C8S8R__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { 
- tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L16C8S8R__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L8C8S8R__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L16C8S8R__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, 
/*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L8C8S8R__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L16C8S8R__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L1C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( 
- /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L2C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L4C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || 
XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L1C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L2C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L4C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const 
testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L1C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L2C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L4C1S1R__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { 
- tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L1C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L1C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; +#define XNN_DWCONV_MULTIPASS(arch_flags, ukernel, first_pass_tile, middle_pass_tile, last_pass_tile, channel_tile, channel_subtile, channel_round, datatype, weights_type, buffer_type, params_type, init_params)\ +INSTANTIATE_TEST_SUITE_P( \ + ukernel, DWConvTest, \ + testing::ValuesIn(CreateTests( \ + channel_tile, channel_tile, first_pass_tile, \ + first_pass_tile, middle_pass_tile, last_pass_tile, \ + channel_subtile, channel_round, \ + [](DWConvMicrokernelTester& tester) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + 
tester.Test(ukernel, init_params, xnn_qu8_requantize_fp32); \ + })), \ + [](const testing::TestParamInfo& info) { \ + return info.param.test_name; \ }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L1C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L2C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L2C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L2C1S1R__SCALAR_LRINTF, DWConvTest, - 
testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L4C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L4C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_5F5M5L4C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L1C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L1C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L1C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - 
-INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L2C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L2C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L2C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L4C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, 
- /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L4C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_6F6M7L4C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L1C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - 
[](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L1C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L1C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L2C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L2C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, 
/*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L2C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L4C1S1R__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L4C1S1R__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_imagic, - 
xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_8F8M9L4C1S1R__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/1, /*channel_round=*/1, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); \ No newline at end of file +#include "src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h" +#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-dwconv-minmax-multipass-fp32.yaml b/test/qu8-dwconv-minmax-multipass-fp32.yaml deleted file mode 100644 index 37d5d89c558..00000000000 --- a/test/qu8-dwconv-minmax-multipass-fp32.yaml +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright 2023 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# ARM NEON -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neon_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neon_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__neonv8_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neonv8_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neon_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neon_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__neonv8_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neonv8_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neon_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neon_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__neonv8_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neonv8_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neon_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neon_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__neonv8_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neonv8_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neon_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neon_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__neonv8_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neonv8_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neon_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neon_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__neonv8_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neonv8_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neon_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neon_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__neonv8_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neonv8_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neon_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neon_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__neonv8_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neonv8_params -- name: 
xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neon_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neon_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__neonv8_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neonv8_params - -# x86 SSE -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__sse41_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__sse41_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__sse41_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__sse41_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__sse41_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: 
xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__sse41_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params - -# x86 AVX -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__avx_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__avx2_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__avx_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__avx2_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l32c8s8r__avx2_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__avx_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__avx2_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__avx_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__avx2_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l32c8s8r__avx2_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__avx_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__avx2_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: 
xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__avx_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__avx2_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__avx2_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params - -# x86 AVX512 -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s1r__avx512skx_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s1r__avx512skx_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s1r__avx512skx_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s1r__avx512skx_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s1r__avx512skx_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s1r__avx512skx_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params - -# Wasm SIMD -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params - -# Wasm -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__wasm_fmagic - init: 
xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__wasm_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__wasm_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__wasm_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__wasm_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__wasm_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__wasm_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__wasm_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__wasm_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params - -# Scalar -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_imagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_lrintf - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_imagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_lrintf - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_imagic - init: 
xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_lrintf - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_imagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_lrintf - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_imagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_lrintf - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_imagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_lrintf - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_imagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_lrintf - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_imagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_lrintf - init: 
xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_imagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_lrintf - init: xnn_init_qu8_conv_minmax_fp32_scalar_params diff --git a/test/qu8-dwconv-minmax-multipass-rndnu.cc b/test/qu8-dwconv-minmax-multipass-rndnu.cc index 60dd1096159..b300ab8218a 100644 --- a/test/qu8-dwconv-minmax-multipass-rndnu.cc +++ b/test/qu8-dwconv-minmax-multipass-rndnu.cc @@ -4,7 +4,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! -// Specification: test/qu8-dwconv-minmax-multipass-rndnu.yaml +// Microkernel: qu8-dwconv-minmax-multipass-rndnu // Generator: tools/generate-dwconv-multipass-test.py @@ -25,14 +25,13 @@ namespace { -std::vector CreateTests1( - size_t c_block, size_t adj_c_block, size_t cr, size_t kr, +std::vector CreateTests( + size_t c_block, size_t cr, size_t kr, size_t first_pass_tile, size_t middle_pass_tile, size_t last_pass_tile, size_t channel_subtile, size_t channel_round, std::function test_func, std::function isa_check = nullptr) { const std::string cbs = std::to_string(c_block); - const std::string acbs = std::to_string(adj_c_block); std::vector tests; tests.reserve(17); @@ -90,7 +89,7 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + 1) , test_func, isa_check) - .loop_channels(adj_c_block = c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_first_pass_and_last_pass", @@ -103,7 +102,7 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + last_pass_tile) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr 
* 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_multipass", @@ -115,7 +114,7 @@ std::vector CreateTests1( .channel_subtile(channel_subtile) .channel_round(channel_round) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3) + .loop_channels(c_block * 2, cr * 16, cr * 3) .loop_kernel_size( first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -132,7 +131,7 @@ std::vector CreateTests1( .kernel_size(first_pass_tile + last_pass_tile) .qmin(128) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_with_qmax", @@ -146,11 +145,11 @@ std::vector CreateTests1( .kernel_size(first_pass_tile + last_pass_tile) .qmax(128) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); } tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_first_pass_plus_one", + "c_gt_" + cbs + "_first_pass_plus_one", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -160,10 +159,10 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + 1) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block)); + .loop_channels(c_block + 1, c_block == 1 ? 10 : c_block * 2)); tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_first_pass_and_last_pass", + "c_gt_" + cbs + "_first_pass_and_last_pass", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -173,10 +172,10 @@ std::vector CreateTests1( .channel_round(channel_round) .kernel_size(first_pass_tile + last_pass_tile) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block)); + .loop_channels(c_block + 1, c_block == 1 ? 
10 : c_block * 2)); tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_multipass", + "c_gt_" + cbs + "_multipass", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -185,7 +184,7 @@ std::vector CreateTests1( .channel_subtile(channel_subtile) .channel_round(channel_round) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block) + .loop_channels(c_block + 1, c_block == 1 ? 10 : c_block * 2) .loop_kernel_size( first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -278,7 +277,7 @@ std::vector CreateTests1( .channel_round(channel_round) .input_offset(xnnpack::NextPrime(cr + 1) * 16) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3) + .loop_channels(c_block * 2, cr * 16, cr * 3) .loop_kernel_size(first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -287,380 +286,19 @@ std::vector CreateTests1( } // namespace - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_5F5M5L8C8S8R__NEON_MUL8, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_5F5M5L8C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/5, - 
/*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_5F5M5L16C8S8R__NEON_MUL8, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_5F5M5L16C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - 
QU8_DWCONV_MINMAX_RNDNU_5F5M5L32C8S8R__NEON_MUL8, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l32c8s8r__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_5F5M5L32C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/5, - /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l32c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_6F6M7L8C8S8R__NEON_MUL8, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif 
// XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_6F6M7L8C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_6F6M7L16C8S8R__NEON_MUL8, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_6F6M7L16C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - 
TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_6F6M7L32C8S8R__NEON_MUL8, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l32c8s8r__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_6F6M7L32C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/6, - /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l32c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_8F8M9L8C8S8R__NEON_MUL8, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_8F8M9L8C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_8F8M9L16C8S8R__NEON_MUL8, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_8F8M9L16C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, 
/*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_8F8M9L32C8S8R__NEON_MUL8, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l32c8s8r__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_8F8M9L32C8S8R__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/8, - /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9, - /*channel_subtile=*/8, /*channel_round=*/8, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l32c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +#define XNN_DWCONV_MULTIPASS(arch_flags, ukernel, first_pass_tile, middle_pass_tile, last_pass_tile, channel_tile, channel_subtile, channel_round, datatype, 
weights_type, buffer_type, params_type, init_params)\ +INSTANTIATE_TEST_SUITE_P( \ + ukernel, DWConvTest, \ + testing::ValuesIn(CreateTests( \ + channel_tile, channel_tile, first_pass_tile, \ + first_pass_tile, middle_pass_tile, last_pass_tile, \ + channel_subtile, channel_round, \ + [](DWConvMicrokernelTester& tester) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + tester.Test(ukernel, init_params, xnn_qu8_requantize_rndnu); \ + })), \ + [](const testing::TestParamInfo& info) { \ + return info.param.test_name; \ + }); +#include "src/qu8-dwconv/qu8-dwconv-minmax-multipass-rndnu.h" +#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-dwconv-minmax-multipass-rndnu.yaml b/test/qu8-dwconv-minmax-multipass-rndnu.yaml deleted file mode 100644 index 18ef808a52b..00000000000 --- a/test/qu8-dwconv-minmax-multipass-rndnu.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2023 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# ARM NEON -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mul8 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mul16 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul8 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul16 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l32c8s8r__neon_mul8 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l32c8s8r__neon_mul16 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mul8 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mul16 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul8 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul16 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l32c8s8r__neon_mul8 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l32c8s8r__neon_mul16 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mul8 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mul16 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul8 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul16 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: 
xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l32c8s8r__neon_mul8 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l32c8s8r__neon_mul16 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params diff --git a/test/qu8-dwconv-minmax-unipass-fp32.cc b/test/qu8-dwconv-minmax-unipass-fp32.cc index 426177e2cda..586851cea87 100644 --- a/test/qu8-dwconv-minmax-unipass-fp32.cc +++ b/test/qu8-dwconv-minmax-unipass-fp32.cc @@ -7,7 +7,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! -// Specification: test/qu8-dwconv-minmax-unipass-fp32.yaml +// Microkernel: qu8-dwconv-minmax-unipass-fp32 // Generator: tools/generate-dwconv-unipass-test.py @@ -28,10 +28,10 @@ namespace { -std::vector CreateTests1( - size_t c_block, size_t adj_c_block, size_t cr, size_t kr, - std::function test_func, - std::function isa_check = nullptr) { +std::vector CreateTests( + size_t c_block, bool is_pipelined, size_t cr, size_t kr, + std::function test_func) { + size_t adj_c_block = is_pipelined ? 
c_block * 2 : c_block; const std::string cbs = std::to_string(c_block); const std::string acbs = std::to_string(adj_c_block); @@ -44,8 +44,17 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .channels(c_block) - , test_func, isa_check)); + , test_func)); + if (is_pipelined) { + tests.push_back(DWConvTestParams( + "c_eq_" + std::to_string(c_block * 2), + DWConvMicrokernelTester() + .channel_tile(cr) + .kernel_tile(kr) + .channels(c_block * 2) + , test_func)); + } if (c_block > 1) { tests.push_back(DWConvTestParams( @@ -53,7 +62,7 @@ std::vector CreateTests1( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -62,7 +71,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -71,7 +80,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -79,7 +88,7 @@ std::vector CreateTests1( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(1, adj_c_block - 1)); } @@ -88,7 +97,7 @@ std::vector CreateTests1( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( @@ -97,7 +106,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 
10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( @@ -106,7 +115,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( @@ -115,7 +124,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .width(3) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -124,7 +133,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .width(3) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1)) .loop_step(2, kr)); @@ -135,7 +144,7 @@ std::vector CreateTests1( .kernel_tile(kr) .width(5) .output_stride(xnnpack::NextPrime(cr * 5 + 1)) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -145,7 +154,7 @@ std::vector CreateTests1( .kernel_tile(kr) .width(3) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -155,7 +164,7 @@ std::vector CreateTests1( .kernel_tile(kr) .width(3) .qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -166,7 +175,7 @@ std::vector CreateTests1( .width(3) .input_zero_point(255) .kernel_zero_point(0) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -177,7 +186,7 @@ std::vector CreateTests1( .width(3) .input_zero_point(0) .kernel_zero_point(255) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -186,7 +195,7 @@ 
std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .input_offset(xnnpack::NextPrime(cr + 1) * 16) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -195,7 +204,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .input_offset(xnnpack::NextPrime(cr + 1) * 16) - , test_func, isa_check) + , test_func) .loop_zi(0, kr - 1) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); @@ -204,1195 +213,17 @@ std::vector CreateTests1( } // namespace - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P8C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__neon_mul16, - xnn_init_qu8_conv_minmax_fp32_neon_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P8C__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__neonv8_mul16, - xnn_init_qu8_conv_minmax_fp32_neonv8_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P16C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__neon_mul16, - xnn_init_qu8_conv_minmax_fp32_neon_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P16C__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mul16, - xnn_init_qu8_conv_minmax_fp32_neonv8_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P32C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__neon_mul16, - xnn_init_qu8_conv_minmax_fp32_neon_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P32C__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__neonv8_mul16, - xnn_init_qu8_conv_minmax_fp32_neonv8_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // 
XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P8C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__neon_mul16, - xnn_init_qu8_conv_minmax_fp32_neon_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P8C__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mul16, - xnn_init_qu8_conv_minmax_fp32_neonv8_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P16C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__neon_mul16, - xnn_init_qu8_conv_minmax_fp32_neon_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P16C__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) 
{ - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mul16, - xnn_init_qu8_conv_minmax_fp32_neonv8_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P32C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__neon_mul16, - xnn_init_qu8_conv_minmax_fp32_neon_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P32C__NEONV8_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__neonv8_mul16, - xnn_init_qu8_conv_minmax_fp32_neonv8_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_ARM_NEON_V8; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P8C__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif 
// XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P8C__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P8C__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P16C__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P16C__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& 
tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P16C__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P8C__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P8C__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); 
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P8C__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P16C__SSE2_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P16C__SSE41_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P16C__SSE41_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - 
[](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_SSE41; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P8C__AVX_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P8C__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P8C__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__avx2_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - 
}); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P16C__AVX_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P16C__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P16C__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P32C__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - 
[](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P8C__AVX_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__avx_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P8C__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__avx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P8C__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__avx2_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return 
info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P16C__AVX_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P16C__AVX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P16C__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P32C__AVX2_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, 
/*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX2; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P16C__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P32C__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P16C__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx512skx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - 
})), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P32C__AVX512SKX_MUL32, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - }, - []() { - TEST_REQUIRES_X86_AVX512SKX; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P8C__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P16C__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P8C__WASMSIMD_MUL16, DWConvTest, - 
testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P16C__WASMSIMD_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P1C__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P2C__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 
xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P4C__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P1C__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P2C__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || 
XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P4C__WASM_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P1C__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P1C__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P1C__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P2C__SCALAR_FMAGIC, 
DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P2C__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P2C__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P4C__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P4C__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_9P4C__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P1C__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P1C__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P1C__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - 
return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P2C__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P2C__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P2C__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P4C__SCALAR_FMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P4C__SCALAR_IMAGIC, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, 
/*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_FP32_25P4C__SCALAR_LRINTF, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - xnn_qu8_requantize_fp32); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); \ No newline at end of file +#define XNN_DWCONV_UNIPASS(arch_flags, ukernel, c_block, is_pipelined, cr, kr, datatype, weights_type, params_type, init_params)\ +INSTANTIATE_TEST_SUITE_P( \ + ukernel, DWConvTest, \ + testing::ValuesIn(CreateTests( \ + c_block, is_pipelined, cr, kr, \ + [](DWConvMicrokernelTester& tester) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + tester.Test(ukernel, init_params, xnn_qu8_requantize_fp32); \ + })), \ + [](const testing::TestParamInfo& info) { \ + return info.param.test_name; \ + }); +#include "src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h" +#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-dwconv-minmax-unipass-fp32.yaml b/test/qu8-dwconv-minmax-unipass-fp32.yaml deleted file mode 100644 index 5fe1df0a29e..00000000000 --- a/test/qu8-dwconv-minmax-unipass-fp32.yaml +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright 2021 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# ARM NEON -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__neon_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neon_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__neonv8_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neonv8_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__neon_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neon_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neonv8_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__neon_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neon_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__neonv8_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neonv8_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__neon_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neon_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neonv8_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__neon_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neon_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neonv8_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__neon_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neon_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__neonv8_mul16 - init: xnn_init_qu8_conv_minmax_fp32_neonv8_params -# x86 SSE -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: 
xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -# x86 AVX -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__avx2_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__avx_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__avx_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__avx2_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: 
xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -# x86 AVX512 -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx512skx_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -# WAsm SIMD -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16 - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -# WAsm -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__wasm_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__wasm_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__wasm_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic - init: 
xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__wasm_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -# Scalar -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_imagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_lrintf - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_imagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_lrintf - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_lrintf - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_fmagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_imagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_fmagic - init: 
xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_imagic - init: xnn_init_qu8_conv_minmax_fp32_scalar_params -- name: xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_lrintf - init: xnn_init_qu8_conv_minmax_fp32_scalar_params diff --git a/test/qu8-dwconv-minmax-unipass-rndnu.cc b/test/qu8-dwconv-minmax-unipass-rndnu.cc index 927b9a303d7..361838e6368 100644 --- a/test/qu8-dwconv-minmax-unipass-rndnu.cc +++ b/test/qu8-dwconv-minmax-unipass-rndnu.cc @@ -7,7 +7,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! -// Specification: test/qu8-dwconv-minmax-unipass-rndnu.yaml +// Microkernel: qu8-dwconv-minmax-unipass-rndnu // Generator: tools/generate-dwconv-unipass-test.py @@ -28,10 +28,10 @@ namespace { -std::vector CreateTests1( - size_t c_block, size_t adj_c_block, size_t cr, size_t kr, - std::function test_func, - std::function isa_check = nullptr) { +std::vector CreateTests( + size_t c_block, bool is_pipelined, size_t cr, size_t kr, + std::function test_func) { + size_t adj_c_block = is_pipelined ? 
c_block * 2 : c_block; const std::string cbs = std::to_string(c_block); const std::string acbs = std::to_string(adj_c_block); @@ -44,8 +44,17 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .channels(c_block) - , test_func, isa_check)); + , test_func)); + if (is_pipelined) { + tests.push_back(DWConvTestParams( + "c_eq_" + std::to_string(c_block * 2), + DWConvMicrokernelTester() + .channel_tile(cr) + .kernel_tile(kr) + .channels(c_block * 2) + , test_func)); + } if (c_block > 1) { tests.push_back(DWConvTestParams( @@ -53,7 +62,7 @@ std::vector CreateTests1( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -62,7 +71,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -71,7 +80,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -79,7 +88,7 @@ std::vector CreateTests1( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(1, adj_c_block - 1)); } @@ -88,7 +97,7 @@ std::vector CreateTests1( DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( @@ -97,7 +106,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 
10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( @@ -106,7 +115,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( @@ -115,7 +124,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .width(3) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -124,7 +133,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .width(3) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1)) .loop_step(2, kr)); @@ -135,7 +144,7 @@ std::vector CreateTests1( .kernel_tile(kr) .width(5) .output_stride(xnnpack::NextPrime(cr * 5 + 1)) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -145,7 +154,7 @@ std::vector CreateTests1( .kernel_tile(kr) .width(3) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -155,7 +164,7 @@ std::vector CreateTests1( .kernel_tile(kr) .width(3) .qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -166,7 +175,7 @@ std::vector CreateTests1( .width(3) .input_zero_point(255) .kernel_zero_point(0) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -177,7 +186,7 @@ std::vector CreateTests1( .width(3) .input_zero_point(0) .kernel_zero_point(255) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -186,7 +195,7 @@ 
std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .input_offset(xnnpack::NextPrime(cr + 1) * 16) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -195,7 +204,7 @@ std::vector CreateTests1( .channel_tile(cr) .kernel_tile(kr) .input_offset(xnnpack::NextPrime(cr + 1) * 16) - , test_func, isa_check) + , test_func) .loop_zi(0, kr - 1) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); @@ -204,270 +213,17 @@ std::vector CreateTests1( } // namespace - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_9P8C__NEON_MUL8, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_9P8C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_9P16C__NEON_MUL8, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - 
tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_9P16C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_9P32C__NEON_MUL8, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_9p32c__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_9P32C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_9p32c__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // 
XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_25P8C__NEON_MUL8, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_25P8C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/8, /*adj_c_block=*/8, /*cr=*/8, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_25P16C__NEON_MUL8, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_25P16C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/16, /*adj_c_block=*/16, /*cr=*/16, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { 
- tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_25P32C__NEON_MUL8, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_25p32c__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_25P32C__NEON_MUL16, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/32, /*adj_c_block=*/32, /*cr=*/32, /*kr=*/25, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_25p32c__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_neon_params, - xnn_qu8_requantize_rndnu); - }, - []() { - TEST_REQUIRES_ARM_NEON; - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_9P1C__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/1, /*adj_c_block=*/1, /*cr=*/1, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_9p1c__scalar, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - xnn_qu8_requantize_rndnu); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_9P2C__SCALAR, DWConvTest, - 
testing::ValuesIn(CreateTests1( - /*c_block=*/2, /*adj_c_block=*/2, /*cr=*/2, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_9p2c__scalar, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - xnn_qu8_requantize_rndnu); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; +#define XNN_DWCONV_UNIPASS(arch_flags, ukernel, c_block, is_pipelined, cr, kr, datatype, weights_type, params_type, init_params)\ +INSTANTIATE_TEST_SUITE_P( \ + ukernel, DWConvTest, \ + testing::ValuesIn(CreateTests( \ + c_block, is_pipelined, cr, kr, \ + [](DWConvMicrokernelTester& tester) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + tester.Test(ukernel, init_params, xnn_qu8_requantize_rndnu); \ + })), \ + [](const testing::TestParamInfo& info) { \ + return info.param.test_name; \ }); - -INSTANTIATE_TEST_SUITE_P( - QU8_DWCONV_MINMAX_RNDNU_9P4C__SCALAR, DWConvTest, - testing::ValuesIn(CreateTests1( - /*c_block=*/4, /*adj_c_block=*/4, /*cr=*/4, /*kr=*/9, - [](DWConvMicrokernelTester& tester) { - tester.Test(xnn_qu8_dwconv_minmax_rndnu_ukernel_9p4c__scalar, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - xnn_qu8_requantize_rndnu); - })), - [](const testing::TestParamInfo& info) { - return info.param.test_name; - }); \ No newline at end of file +#include "src/qu8-dwconv/qu8-dwconv-minmax-unipass-rndnu.h" +#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-dwconv-minmax-unipass-rndnu.yaml b/test/qu8-dwconv-minmax-unipass-rndnu.yaml deleted file mode 100644 index b866fe11168..00000000000 --- a/test/qu8-dwconv-minmax-unipass-rndnu.yaml +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2021 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# ARM NEON -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mul8 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mul16 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul16 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_9p32c__neon_mul8 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_9p32c__neon_mul16 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul8 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul16 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul8 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul16 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_25p32c__neon_mul8 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_25p32c__neon_mul16 - init: xnn_init_qu8_conv_minmax_rndnu_neon_params -# Scalar -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_9p1c__scalar - init: xnn_init_qu8_conv_minmax_rndnu_scalar_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_9p2c__scalar - init: xnn_init_qu8_conv_minmax_rndnu_scalar_params -- name: xnn_qu8_dwconv_minmax_rndnu_ukernel_9p4c__scalar - init: xnn_init_qu8_conv_minmax_rndnu_scalar_params diff --git a/tools/generate-dwconv-multipass-test.py b/tools/generate-dwconv-multipass-test.py index 8b8268ea5d7..13ff15dd20d 100755 --- a/tools/generate-dwconv-multipass-test.py +++ b/tools/generate-dwconv-multipass-test.py @@ -20,8 +20,8 @@ parser = 
argparse.ArgumentParser(description='XNNPACK generator') -parser.add_argument("-s", "--spec", metavar="FILE", required=True, - help="Spec (YAML) file") +parser.add_argument("-k", "--ukernel", required=True, + help="microkernel") parser.add_argument("-o", "--output", metavar="FILE", required=True, help='Output (C++ source) file') parser.set_defaults(defines=list()) @@ -50,13 +50,12 @@ def split_ukernel_name(name): DWCONV_CREATE_TESTS_CODE = """\ std::vector CreateTests( - size_t c_block, size_t adj_c_block, size_t cr, size_t kr, + size_t c_block, size_t cr, size_t kr, size_t first_pass_tile, size_t middle_pass_tile, size_t last_pass_tile, size_t channel_subtile, size_t channel_round, std::function test_func, std::function isa_check = nullptr) { const std::string cbs = std::to_string(c_block); - const std::string acbs = std::to_string(adj_c_block); std::vector tests; tests.reserve(17); @@ -114,7 +113,7 @@ def split_ukernel_name(name): .channel_round(channel_round) .kernel_size(first_pass_tile + 1) , test_func, isa_check) - .loop_channels(adj_c_block = c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_first_pass_and_last_pass", @@ -127,7 +126,7 @@ def split_ukernel_name(name): .channel_round(channel_round) .kernel_size(first_pass_tile + last_pass_tile) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_multipass", @@ -139,7 +138,7 @@ def split_ukernel_name(name): .channel_subtile(channel_subtile) .channel_round(channel_round) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3) + .loop_channels(c_block * 2, cr * 16, cr * 3) .loop_kernel_size( first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -157,7 +156,7 @@ def split_ukernel_name(name): .kernel_size(first_pass_tile 
+ last_pass_tile) .qmin(128) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); tests.push_back(DWConvTestParams( "c_div_" + cbs + "_with_qmax", @@ -171,11 +170,11 @@ def split_ukernel_name(name): .kernel_size(first_pass_tile + last_pass_tile) .qmax(128) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3)); + .loop_channels(c_block * 2, cr * 16, cr * 3)); } tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_first_pass_plus_one", + "c_gt_" + cbs + "_first_pass_plus_one", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -185,10 +184,10 @@ def split_ukernel_name(name): .channel_round(channel_round) .kernel_size(first_pass_tile + 1) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block)); + .loop_channels(c_block + 1, c_block == 1 ? 10 : c_block * 2)); tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_first_pass_and_last_pass", + "c_gt_" + cbs + "_first_pass_and_last_pass", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -198,10 +197,10 @@ def split_ukernel_name(name): .channel_round(channel_round) .kernel_size(first_pass_tile + last_pass_tile) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block)); + .loop_channels(c_block + 1, c_block == 1 ? 10 : c_block * 2)); tests.push_back(DWConvTestParams( - "c_gt_" + acbs + "_multipass", + "c_gt_" + cbs + "_multipass", DWConvMicrokernelTester() .first_pass_tile(first_pass_tile) .middle_pass_tile(middle_pass_tile) @@ -210,7 +209,7 @@ def split_ukernel_name(name): .channel_subtile(channel_subtile) .channel_round(channel_round) , test_func, isa_check) - .loop_channels(adj_c_block + 1, c_block == 1 ? 10 : adj_c_block + c_block) + .loop_channels(c_block + 1, c_block == 1 ? 
10 : c_block * 2) .loop_kernel_size( first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -303,7 +302,7 @@ def split_ukernel_name(name): .channel_round(channel_round) .input_offset(xnnpack::NextPrime(cr + 1) * 16) , test_func, isa_check) - .loop_channels(adj_c_block + c_block, cr * 16, cr * 3) + .loop_channels(c_block * 2, cr * 16, cr * 3) .loop_kernel_size(first_pass_tile + middle_pass_tile + last_pass_tile, first_pass_tile + middle_pass_tile * 2 + last_pass_tile)); @@ -311,109 +310,36 @@ def split_ukernel_name(name): } """ -DWCONV_TEST_CODE = """\ +TEST_TEMPLATE = """\ +#define XNN_DWCONV_MULTIPASS(arch_flags, ukernel, first_pass_tile, middle_pass_tile, last_pass_tile, channel_tile, channel_subtile, channel_round, datatype, weights_type, buffer_type, params_type, init_params) INSTANTIATE_TEST_SUITE_P( - ${TEST_NAME}, DWConvTest, + ukernel, DWConvTest, testing::ValuesIn(CreateTests( - /*c_block=*/${CBLOCK}, /*adj_c_block=*/${ADJCBLOCK}, /*cr=*/${CR}, /*kr=*/${KR}, - /*first_pass_tile=*/${FIRST_PASS_TILE}, /*middle_pass_tile=*/${MIDDLE_PASS_TILE}, /*last_pass_tile=*/${LAST_PASS_TILE}, - /*channel_subtile=*/${CHANNEL_SUBTILE}, /*channel_round=*/${CHANNEL_ROUND}, + channel_tile, channel_tile, first_pass_tile, + first_pass_tile, middle_pass_tile, last_pass_tile, + channel_subtile, channel_round, [](DWConvMicrokernelTester& tester) { - tester.Test(${",\\n ".join(TEST_ARGS)}); - $if ISA_CHECK: - }, - []() { - ${ISA_CHECK}; - })), - $else: - })), + TEST_REQUIRES_ARCH_FLAGS(arch_flags); + tester.Test(${", ".join(TEST_ARGS)}); + })), [](const testing::TestParamInfo& info) { return info.param.test_name; }); """ - -def generate_test_cases(ukernel, first_pass_tile, middle_pass_tile, last_pass_tile, cr, c_block, - channel_subtile, channel_round, init_fn, requantization, is_pipelined, isa): - """Generates all tests cases for a DWCONV micro-kernel. - - Args: - ukernel: C name of the micro-kernel function. 
- cr: CR parameter of the DWCONV micro-kernel. - channel_subtile: channel_subtile parameter of the DWCONV micro-kernel. - channel_round: channel_round parameter of the DWCONV micro-kernel. - kr: KR parameter of the DWCONV micro-kernel. - k_block: Number of C values processed per one iteration of the main loop of - the micro-kernel. - init_fn: C name of the function to initialize microkernel parameters. - requantization: name of the requantization scheme used by the microkernel. - is_pipelined: Indicates if the micro-kernel is implemented with software - pipelining. Additional test cases are generated for software - pipelined micro-kernels to separately test prologue + epiloque - of the pipelined loop and iteration of the pipelined loop. - isa: instruction set required to run the micro-kernel. Generated unit test - will skip execution if the host processor doesn't support this ISA. - - Returns: - Code for the test case. - """ - kr = first_pass_tile - _, test_name = ukernel.split("_", 1) - _, datatype, ukernel_type, activation, _ = ukernel.split("_", 4) - if activation == "ukernel": - activation = "linear" - test_args = [ukernel] - if init_fn: - test_args.append(init_fn) - if requantization: - requantization_datatype = {"qc8": "qs8"}.get(datatype, datatype) - test_args.append( - "xnn_%s_requantize_%s" % (requantization_datatype, requantization) - ) - - args = { - "TEST_NAME": test_name.upper().replace("UKERNEL_", ""), - "TEST_ARGS": test_args, - "UKERNEL_TYPE": ukernel_type.upper(), - "DATATYPE": datatype, - "ACTIVATION": activation.upper(), - "FIRST_PASS_TILE": first_pass_tile, - "MIDDLE_PASS_TILE": middle_pass_tile, - "LAST_PASS_TILE": last_pass_tile, - "CR": cr, - "CHANNEL_SUBTILE": channel_subtile, - "CHANNEL_ROUND": channel_round, - "KR": kr, - "CBLOCK": c_block, - "ADJCBLOCK": 2 * c_block if is_pipelined else c_block, - "IS_PIPELINED": is_pipelined, - "ISA_CHECK": xnncommon.generate_isa_check_macro(isa), - "next_prime": next_prime, - "sqrt": math.sqrt, - } - - 
return ( - xngen.preprocess(DWCONV_CREATE_TESTS_CODE, args), - xngen.preprocess(DWCONV_TEST_CODE, args), - ) - - def main(args): options = parser.parse_args(args) - with codecs.open(options.spec, "r", encoding="utf-8") as spec_file: - spec_yaml = yaml.safe_load(spec_file) - if not isinstance(spec_yaml, list): - raise ValueError("expected a list of micro-kernels in the spec") + ukernel = options.ukernel - test_header = """\ + test_header = """\ // Copyright 2022 Google LLC // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! -// Specification: {specification} +// Microkernel: {ukernel} // Generator: {generator} @@ -431,59 +357,52 @@ def main(args): #include "xnnpack/requantization.h" #include "dwconv-microkernel-tester.h" #include "next_prime.h" -""".format(specification=options.spec, generator=sys.argv[0]) +""".format(ukernel=ukernel, generator=sys.argv[0]) - # Cached `CreateTests` functions. - idx_from_create_tests_hash = collections.defaultdict( - lambda: len(idx_from_create_tests_hash) + 1 - ) - create_tests_from_idx = {} - - test_cases = "" - - for ukernel_spec in spec_yaml: - name = ukernel_spec["name"] - init_fn = ukernel_spec.get("init") - pipelined = bool(ukernel_spec.get("pipelined", False)) - first_pass_tile, middle_pass_tile, last_pass_tile, cr, channel_subtile, channel_round, requantization, arch, isa, assembly = split_ukernel_name(name) - - create_tests, test_case = generate_test_cases( - name, - first_pass_tile, - middle_pass_tile, - last_pass_tile, - cr, - cr, - channel_subtile, - channel_round, - init_fn, - requantization, - pipelined, - isa, - ) - - # Store or reuse the `CreateTests` function? 
- create_tests_hash = hash(create_tests) - create_tests_idx = idx_from_create_tests_hash[create_tests_hash] - if create_tests_idx not in create_tests_from_idx: - create_tests_from_idx[create_tests_idx] = create_tests.replace( - "CreateTests(", f"CreateTests{create_tests_idx}(" - ) - test_case = test_case.replace( - "CreateTests(", f"CreateTests{create_tests_idx}(" - ) - - test_cases += "\n\n" + xnncommon.postprocess_test_case( - test_case, arch, isa, assembly - ) - - create_tests = ( - "namespace {\n\n" - + "\n".join(create_tests_from_idx.values()) - + "\n} // namespace\n" + test_cases = "" + + parts = ukernel.split("-") + datatype = parts[0] + folder = datatype + "-dwconv" + if parts[1] == "qc8w": + folder = datatype + "-qc8w-dwconv" + parts.pop(1) + activation = "minmax" if "minmax" in parts else "linear" + ukernel_type = "unipass" if "unipass" in parts else "multipass" + requantization = "fp32" if "fp32" in parts else "rndnu" if "rndnu" in parts else None + + create_tests_args = { + "UKERNEL_TYPE": ukernel_type.upper(), + "DATATYPE": datatype, + "ACTIVATION": activation.upper(), + } + create_tests = xngen.preprocess(DWCONV_CREATE_TESTS_CODE, create_tests_args) + + create_tests = ( + "namespace {\n\n" + + "\n".join([create_tests]) + + "\n} // namespace\n" + ) + tests = test_header + "\n" + create_tests + "\n" + test_cases + + test_args = ["ukernel", "init_params"] + if requantization: + requantization_datatype = {"qc8": "qs8"}.get(datatype, datatype) + test_args.append( + "xnn_%s_requantize_%s" % (requantization_datatype, requantization) ) - tests = test_header + "\n" + create_tests + test_cases - xnncommon.overwrite_if_changed(options.output, tests) + + tests += xnncommon.make_multiline_macro(xngen.preprocess( + TEST_TEMPLATE, + { + "TEST_ARGS": test_args, + }, + )) + + tests += f'#include "{xnncommon.xnnpack_src()}/{folder}/{options.ukernel}.h"\n' + tests += "#undef XNN_UKERNEL_WITH_PARAMS\n" + + xnncommon.overwrite_if_changed(options.output, tests) if __name__ 
== "__main__": diff --git a/tools/generate-dwconv-unipass-test.py b/tools/generate-dwconv-unipass-test.py index 784a75044dc..fb0b1d8c2e8 100755 --- a/tools/generate-dwconv-unipass-test.py +++ b/tools/generate-dwconv-unipass-test.py @@ -20,8 +20,8 @@ parser = argparse.ArgumentParser(description='XNNPACK generator') -parser.add_argument("-s", "--spec", metavar="FILE", required=True, - help="Spec (YAML) file") +parser.add_argument("-k", "--ukernel", required=True, + help="ukernel name") parser.add_argument("-o", "--output", metavar="FILE", required=True, help='Output (C++ source) file') parser.set_defaults(defines=list()) @@ -48,9 +48,9 @@ def split_ukernel_name(name): DWCONV_CREATE_TESTS_CODE = """\ std::vector CreateTests( - size_t c_block, size_t adj_c_block, size_t cr, size_t kr, - std::function test_func, - std::function isa_check = nullptr) { + size_t c_block, bool is_pipelined, size_t cr, size_t kr, + std::function test_func) { + size_t adj_c_block = is_pipelined ? c_block * 2 : c_block; const std::string cbs = std::to_string(c_block); const std::string acbs = std::to_string(adj_c_block); @@ -63,16 +63,17 @@ def split_ukernel_name(name): .channel_tile(cr) .kernel_tile(kr) .channels(c_block) - , test_func, isa_check)); + , test_func)); - $if IS_PIPELINED: + if (is_pipelined) { tests.push_back(DWConvTestParams( "c_eq_" + std::to_string(c_block * 2), DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) .channels(c_block * 2) - , test_func, isa_check)); + , test_func)); + } if (c_block > 1) { tests.push_back(DWConvTestParams( @@ -80,7 +81,7 @@ def split_ukernel_name(name): DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); $if ACTIVATION == "MINMAX": @@ -90,7 +91,7 @@ def split_ukernel_name(name): .channel_tile(cr) .kernel_tile(kr) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); 
tests.push_back(DWConvTestParams( @@ -99,7 +100,7 @@ def split_ukernel_name(name): .channel_tile(cr) .kernel_tile(kr) .qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( @@ -107,7 +108,7 @@ def split_ukernel_name(name): DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(1, adj_c_block - 1)); } @@ -116,7 +117,7 @@ def split_ukernel_name(name): DWConvMicrokernelTester() .channel_tile(cr) .kernel_tile(kr) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); $if ACTIVATION == "MINMAX": @@ -126,7 +127,7 @@ def split_ukernel_name(name): .channel_tile(cr) .kernel_tile(kr) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( @@ -135,7 +136,7 @@ def split_ukernel_name(name): .channel_tile(cr) .kernel_tile(kr) .qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + 1, (c_block == 1 ? 
10 : adj_c_block + c_block) - 1)); tests.push_back(DWConvTestParams( @@ -144,7 +145,7 @@ def split_ukernel_name(name): .channel_tile(cr) .kernel_tile(kr) .width(3) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -153,7 +154,7 @@ def split_ukernel_name(name): .channel_tile(cr) .kernel_tile(kr) .width(3) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1)) .loop_step(2, kr)); @@ -164,7 +165,7 @@ def split_ukernel_name(name): .kernel_tile(kr) .width(5) .output_stride(xnnpack::NextPrime(cr * 5 + 1)) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); $if ACTIVATION == "MINMAX": @@ -175,7 +176,7 @@ def split_ukernel_name(name): .kernel_tile(kr) .width(3) .qmin(128) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -185,7 +186,7 @@ def split_ukernel_name(name): .kernel_tile(kr) .width(3) .qmax(128) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); $if DATATYPE == "qu8": @@ -197,7 +198,7 @@ def split_ukernel_name(name): .width(3) .input_zero_point(255) .kernel_zero_point(0) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -208,7 +209,7 @@ def split_ukernel_name(name): .width(3) .input_zero_point(0) .kernel_zero_point(255) - , test_func, isa_check) + , test_func) .loop_channels(1, c_block * 5, std::max(size_t(1), c_block - 1))); tests.push_back(DWConvTestParams( @@ -217,7 +218,7 @@ def split_ukernel_name(name): .channel_tile(cr) .kernel_tile(kr) .input_offset(xnnpack::NextPrime(cr + 1) * 16) - , test_func, isa_check) + , test_func) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); tests.push_back(DWConvTestParams( 
@@ -226,7 +227,7 @@ def split_ukernel_name(name): .channel_tile(cr) .kernel_tile(kr) .input_offset(xnnpack::NextPrime(cr + 1) * 16) - , test_func, isa_check) + , test_func) .loop_zi(0, kr - 1) .loop_channels(adj_c_block + c_block, cr * 16 - 1, cr * 3)); @@ -234,91 +235,27 @@ def split_ukernel_name(name): } """ -DWCONV_TEST_CODE = """\ +TEST_TEMPLATE = """\ +#define XNN_DWCONV_UNIPASS(arch_flags, ukernel, c_block, is_pipelined, cr, kr, datatype, weights_type, params_type, init_params) INSTANTIATE_TEST_SUITE_P( - ${TEST_NAME}, DWConvTest, + ukernel, DWConvTest, testing::ValuesIn(CreateTests( - /*c_block=*/${CBLOCK}, /*adj_c_block=*/${ADJCBLOCK}, /*cr=*/${CR}, /*kr=*/${KR}, + c_block, is_pipelined, cr, kr, [](DWConvMicrokernelTester& tester) { - tester.Test(${",\\n ".join(TEST_ARGS)}); - $if ISA_CHECK: - }, - []() { - ${ISA_CHECK}; - })), - $else: - })), + TEST_REQUIRES_ARCH_FLAGS(arch_flags); + tester.Test(${", ".join(TEST_ARGS)}); + })), [](const testing::TestParamInfo& info) { return info.param.test_name; }); """ - -def generate_test_cases(ukernel, primary_tile, cr, kr, c_block, - init_fn, requantization, is_pipelined, isa): - """Generates all tests cases for a DWCONV micro-kernel. - - Args: - ukernel: C name of the micro-kernel function. - cr: CR parameter of the DWCONV micro-kernel. - kr: KR parameter of the DWCONV micro-kernel. - k_block: Number of C values processed per one iteration of the main loop of - the micro-kernel. - init_fn: C name of the function to initialize microkernel parameters. - requantization: name of the requantization scheme used by the microkernel. - is_pipelined: Indicates if the micro-kernel is implemented with software - pipelining. Additional test cases are generated for software - pipelined micro-kernels to separately test prologue + epiloque - of the pipelined loop and iteration of the pipelined loop. - isa: instruction set required to run the micro-kernel. 
Generated unit test - will skip execution if the host processor doesn't support this ISA. - - Returns: - Code for the test case. - """ - _, test_name = ukernel.split("_", 1) - _, datatype, ukernel_type, activation, _ = ukernel.split("_", 4) - if activation == "ukernel": - activation = "linear" - test_args = [ukernel] - if init_fn: - test_args.append(init_fn) - if requantization: - requantization_datatype = {"qc8": "qs8"}.get(datatype, datatype) - test_args.append( - "xnn_%s_requantize_%s" % (requantization_datatype, requantization) - ) - - args = { - "TEST_NAME": test_name.upper().replace("UKERNEL_", ""), - "TEST_ARGS": test_args, - "UKERNEL_TYPE": ukernel_type.upper(), - "DATATYPE": datatype, - "ACTIVATION": activation.upper(), - "PRIMARY_TILE": primary_tile, - "CR": cr, - "KR": kr, - "CBLOCK": c_block, - "ADJCBLOCK": 2 * c_block if is_pipelined else c_block, - "IS_PIPELINED": is_pipelined, - "ISA_CHECK": xnncommon.generate_isa_check_macro(isa), - "next_prime": next_prime, - "sqrt": math.sqrt, - } - return xngen.preprocess(DWCONV_CREATE_TESTS_CODE, args), xngen.preprocess( - DWCONV_TEST_CODE, args - ) - - def main(args): options = parser.parse_args(args) - with codecs.open(options.spec, "r", encoding="utf-8") as spec_file: - spec_yaml = yaml.safe_load(spec_file) - if not isinstance(spec_yaml, list): - raise ValueError("expected a list of micro-kernels in the spec") + ukernel = options.ukernel - test_header = """\ + test_header = """\ // Copyright (c) Facebook, Inc. and its affiliates. // All rights reserved. // @@ -328,7 +265,7 @@ def main(args): // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! 
-// Specification: {specification} +// Microkernel: {ukernel} // Generator: {generator} @@ -346,57 +283,52 @@ def main(args): #include "xnnpack/requantization.h" #include "dwconv-microkernel-tester.h" #include "next_prime.h" -""".format(specification=options.spec, generator=sys.argv[0]) +""".format(ukernel=ukernel, generator=sys.argv[0]) - # Cached `CreateTests` functions. - idx_from_create_tests_hash = collections.defaultdict( - lambda: len(idx_from_create_tests_hash) + 1 - ) - create_tests_from_idx = {} - - test_cases = "" - - for ukernel_spec in spec_yaml: - name = ukernel_spec["name"] - init_fn = ukernel_spec.get("init") - pipelined = bool(ukernel_spec.get("pipelined", False)) - primary_tile, cr, kr, requantization, arch, isa, assembly = \ - split_ukernel_name(name) - - create_tests, test_case = generate_test_cases( - name, - primary_tile, - cr, - kr, - cr, - init_fn, - requantization, - pipelined, - isa, - ) - - # Store or reuse the `CreateTests` function? - create_tests_hash = hash(create_tests) - create_tests_idx = idx_from_create_tests_hash[create_tests_hash] - if create_tests_idx not in create_tests_from_idx: - create_tests_from_idx[create_tests_idx] = create_tests.replace( - "CreateTests(", f"CreateTests{create_tests_idx}(" - ) - test_case = test_case.replace( - "CreateTests(", f"CreateTests{create_tests_idx}(" - ) - - test_cases += "\n\n" + xnncommon.postprocess_test_case( - test_case, arch, isa, assembly - ) - - create_tests = ( - "namespace {\n\n" - + "\n".join(create_tests_from_idx.values()) - + "\n} // namespace\n" + test_cases = "" + + parts = ukernel.split("-") + datatype = parts[0] + folder = datatype + "-dwconv" + if parts[1] == "qc8w": + folder = datatype + "-qc8w-dwconv" + parts.pop(1) + activation = "minmax" if "minmax" in parts else "linear" + ukernel_type = "unipass" if "unipass" in parts else "multipass" + requantization = "fp32" if "fp32" in parts else "rndnu" if "rndnu" in parts else None + + create_tests_args = { + "UKERNEL_TYPE": 
ukernel_type.upper(), + "DATATYPE": datatype, + "ACTIVATION": activation.upper(), + } + create_tests = xngen.preprocess(DWCONV_CREATE_TESTS_CODE, create_tests_args) + + create_tests = ( + "namespace {\n\n" + + "\n".join([create_tests]) + + "\n} // namespace\n" + ) + tests = test_header + "\n" + create_tests + "\n" + test_cases + + test_args = ["ukernel", "init_params"] + if requantization: + requantization_datatype = {"qc8": "qs8"}.get(datatype, datatype) + test_args.append( + "xnn_%s_requantize_%s" % (requantization_datatype, requantization) ) - tests = test_header + "\n" + create_tests + test_cases - xnncommon.overwrite_if_changed(options.output, tests) + + tests += xnncommon.make_multiline_macro(xngen.preprocess( + TEST_TEMPLATE, + { + "TEST_ARGS": test_args, + }, + )) + + tests += f'#include "{xnncommon.xnnpack_src()}/{folder}/{options.ukernel}.h"\n' + tests += "#undef XNN_UKERNEL_WITH_PARAMS\n" + + xnncommon.overwrite_if_changed(options.output, tests) if __name__ == "__main__": From a72a8b102fb521578cc4b5ffae9ef6a02ad7dc0d Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Thu, 19 Sep 2024 16:44:42 -0700 Subject: [PATCH 11/50] Softmax benchmark add variation that uses sse and avx - in production, sse is used for softmax, while avx is used for everything else - this benchmark is to measure the impact of sse-avx-assists PiperOrigin-RevId: 676612035 --- bench/f32-raddstoreexpminusmax.cc | 72 +++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/bench/f32-raddstoreexpminusmax.cc b/bench/f32-raddstoreexpminusmax.cc index 11469e0461b..df11e04896a 100644 --- a/bench/f32-raddstoreexpminusmax.cc +++ b/bench/f32-raddstoreexpminusmax.cc @@ -662,6 +662,78 @@ static void f32_raddstoreexpminusmax( ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u4, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u4, + nullptr) + 
->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u8, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u8_acc2, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8_acc2, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u12, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u12_acc2, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc2, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u12_acc3, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc3, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u16, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u16_acc2, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc2, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u16_acc4, + xnn_f32_rmax_ukernel__avx_u32_acc4, + 
xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc4, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u20, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u20_acc2, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc2, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u20_acc5, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc5, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_u4, xnn_f32_rmax_ukernel__sse_u16_acc4, xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u4, From de123c1734a2d0fb889e8b80b5ae146e78fd41a4 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Thu, 19 Sep 2024 18:00:14 -0700 Subject: [PATCH 12/50] Use header tables to generate binary microkernel benchmarks This change replaces the existing quantized binary operator benchmarks. We also now finally have enough automation in place to add binary operator benchmarks easily. Fixes #6396 Also add vunary benchmark to CMake build, and fix some issues found on some targets. 
PiperOrigin-RevId: 676633750 --- CMakeLists.txt | 10 +- bench/BUILD.bazel | 17 ++-- bench/qs8-vadd.cc | 81 ---------------- bench/qs8-vaddc.cc | 80 ---------------- bench/qs8-vmul.cc | 82 ----------------- bench/qs8-vmulc.cc | 81 ---------------- bench/qu8-vadd.cc | 81 ---------------- bench/qu8-vaddc.cc | 80 ---------------- bench/qu8-vmul.cc | 82 ----------------- bench/qu8-vmulc.cc | 81 ---------------- bench/vbinary.cc | 224 +++++++++++++++++++++++++++++++++++++++++++++ bench/vunary.cc | 76 +++++++++------ 12 files changed, 281 insertions(+), 694 deletions(-) delete mode 100644 bench/qs8-vadd.cc delete mode 100644 bench/qs8-vaddc.cc delete mode 100644 bench/qs8-vmul.cc delete mode 100644 bench/qs8-vmulc.cc delete mode 100644 bench/qu8-vadd.cc delete mode 100644 bench/qu8-vaddc.cc delete mode 100644 bench/qu8-vmul.cc delete mode 100644 bench/qu8-vmulc.cc create mode 100644 bench/vbinary.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b0e25c7b27..b2b413f1749 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2083,26 +2083,20 @@ IF(XNNPACK_BUILD_BENCHMARKS) qu8-rdsum qs8-rsum qu8-rsum - qs8-vadd - qs8-vaddc qs8-vcvt - qs8-vmul - qs8-vmulc qu8-f32-vcvt qu8-gemm qu8-gemm-fp32 qu8-gemm-rndnu qu8-requantization - qu8-vadd - qu8-vaddc qu8-vcvt - qu8-vmul - qu8-vmulc x16-packw x32-packw x8-lut x8-packq x8-packw + vunary + vbinary xN-transposec xx-transposev) FOREACH(BENCH ${MICROKERNEL_BENCHMARKS}) diff --git a/bench/BUILD.bazel b/bench/BUILD.bazel index a46f19f2103..309737e442f 100644 --- a/bench/BUILD.bazel +++ b/bench/BUILD.bazel @@ -241,14 +241,6 @@ xnnpack_benchmark( ], deps = MICROKERNEL_BENCHMARK_DEPS, ) for kernel in [ - "qs8_vadd", - "qs8_vaddc", - "qs8_vmul", - "qs8_vmulc", - "qu8_vadd", - "qu8_vaddc", - "qu8_vmul", - "qu8_vmulc", "f16_gavgpool_cw", "f16_raddstoreexpminusmax", "f16_rmax", @@ -343,6 +335,15 @@ xnnpack_benchmark( deps = MICROKERNEL_BENCHMARK_DEPS, ) +xnnpack_benchmark( + name = "vbinary_bench", + srcs = ["vbinary.cc"], + # 
TODO(b/367939259): This is not really that slow, but --config=ios_x86_64 fails to pass + # --benchmark_min_time=1x to the benchmark. + tags = xnnpack_slow_benchmark_tags(), + deps = MICROKERNEL_BENCHMARK_DEPS, +) + xnnpack_benchmark( name = "f32_igemm_bench", srcs = [ diff --git a/bench/qs8-vadd.cc b/bench/qs8-vadd.cc deleted file mode 100644 index 2447dc0694d..00000000000 --- a/bench/qs8-vadd.cc +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include - -#include -#include "bench/utils.h" - -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/common.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vbinary.h" - - -static void qs8_vadd( - benchmark::State& state, - uint64_t arch_flags, - xnn_qs8_vadd_minmax_ukernel_fn vadd, - xnn_init_qs8_add_minmax_params_fn init_params) -{ - if (!benchmark::utils::CheckArchFlags(state, arch_flags)) { - return; - } - - const size_t num_elements = state.range(0); - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto i8rng = std::bind( - std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()), - std::ref(rng)); - - std::vector> a(num_elements); - std::vector> b(num_elements); - std::vector> sum(num_elements); - std::generate(a.begin(), a.end(), std::ref(i8rng)); - std::generate(b.begin(), b.end(), std::ref(i8rng)); - - struct xnn_qs8_add_minmax_params params; - init_params(¶ms, - 1 /* a zero point */, 1 /* b zero point */, 1 /* output zero point */, - 0.5f /* a-output scale */, 0.75f /* b-output scale */, - std::numeric_limits::min() + 1, std::numeric_limits::max() - 1); - for (auto _ : state) { - vadd(num_elements * sizeof(int8_t), a.data(), b.data(), 
sum.data(), ¶ms); - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - const size_t num_elements_per_iteration = num_elements; - state.counters["num_elements"] = - benchmark::Counter(uint64_t(state.iterations()) * num_elements_per_iteration, benchmark::Counter::kIsRate); - - const size_t bytes_per_iteration = 3 * num_elements * sizeof(int8_t); - state.counters["bytes"] = - benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); -} - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - BENCHMARK_CAPTURE(qs8_vadd, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::BinaryElementwiseParameters) \ - ->UseRealTime(); -#include "src/qs8-vadd/qs8-vadd-minmax.h" -#undef XNN_UKERNEL_WITH_PARAMS - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/qs8-vaddc.cc b/bench/qs8-vaddc.cc deleted file mode 100644 index 0c39dd600de..00000000000 --- a/bench/qs8-vaddc.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include -#include - -#include "bench/utils.h" -#include - -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/common.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vbinary.h" - - -static void qs8_vaddc( - benchmark::State& state, - uint64_t arch_flags, - xnn_qs8_vadd_minmax_ukernel_fn vaddc, - xnn_init_qs8_add_minmax_params_fn init_params) -{ - if (!benchmark::utils::CheckArchFlags(state, arch_flags)) { - return; - } - - const size_t num_elements = state.range(0); - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto i8rng = std::bind( - std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()), - std::ref(rng)); - - std::vector> a(num_elements); - std::vector> sum(num_elements); - std::generate(a.begin(), a.end(), std::ref(i8rng)); - const int8_t b = i8rng(); - - struct xnn_qs8_add_minmax_params params; - init_params(¶ms, - 1 /* a zero point */, 1 /* b zero point */, 1 /* output zero point */, - 0.5f /* a-output scale */, 0.75f /* b-output scale */, - std::numeric_limits::min() + 1, std::numeric_limits::max() - 1); - for (auto _ : state) { - vaddc(num_elements * sizeof(int8_t), a.data(), &b, sum.data(), ¶ms); - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - const size_t num_elements_per_iteration = num_elements; - state.counters["num_elements"] = - benchmark::Counter(uint64_t(state.iterations()) * num_elements_per_iteration, benchmark::Counter::kIsRate); - - const size_t bytes_per_iteration = 2 * num_elements * sizeof(int8_t); - state.counters["bytes"] = - benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); -} - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, 
vector_tile, datatype, params_type, init_params) \ - BENCHMARK_CAPTURE(qs8_vaddc, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::BinaryElementwiseParameters) \ - ->UseRealTime(); -#include "src/qs8-vaddc/qs8-vaddc-minmax.h" -#undef XNN_UKERNEL_WITH_PARAMS - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/qs8-vmul.cc b/bench/qs8-vmul.cc deleted file mode 100644 index 06f1e81b9cd..00000000000 --- a/bench/qs8-vmul.cc +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include - -#include -#include "bench/utils.h" - -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/common.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vbinary.h" - - -static void qs8_vmul( - benchmark::State& state, - uint64_t arch_flags, - xnn_qs8_vmul_minmax_ukernel_fn vmul, - xnn_init_qs8_mul_minmax_params_fn init_params) -{ - if (!benchmark::utils::CheckArchFlags(state, arch_flags)) { - return; - } - - const size_t num_elements = state.range(0); - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto i8rng = std::bind( - std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()), - std::ref(rng)); - - std::vector> a(num_elements); - std::vector> b(num_elements); - std::vector> product(num_elements); - std::generate(a.begin(), a.end(), std::ref(i8rng)); - std::generate(b.begin(), b.end(), std::ref(i8rng)); - - union xnn_qs8_mul_minmax_params params; - init_params(¶ms, - 1 /* a zero point */, 1 /* b zero point */, 1 /* output zero point */, - 0.75f /* product-output scale */, - std::numeric_limits::min() + 1, std::numeric_limits::max() - 1); - for (auto _ : state) { - 
vmul(num_elements * sizeof(int8_t), a.data(), b.data(), product.data(), ¶ms); - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - const size_t num_elements_per_iteration = num_elements; - state.counters["num_elements"] = - benchmark::Counter(uint64_t(state.iterations()) * num_elements_per_iteration, benchmark::Counter::kIsRate); - - const size_t bytes_per_iteration = 3 * num_elements * sizeof(int8_t); - state.counters["bytes"] = - benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); -} - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - BENCHMARK_CAPTURE(qs8_vmul, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::BinaryElementwiseParameters) \ - ->UseRealTime(); -#include "src/qs8-vmul/qs8-vmul-minmax-fp32.h" -#include "src/qs8-vmul/qs8-vmul-minmax-rndnu.h" -#undef XNN_UKERNEL_WITH_PARAMS - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/qs8-vmulc.cc b/bench/qs8-vmulc.cc deleted file mode 100644 index 43b767ff928..00000000000 --- a/bench/qs8-vmulc.cc +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include -#include - -#include -#include "bench/utils.h" - -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/common.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vbinary.h" - - -static void qs8_vmulc( - benchmark::State& state, - uint64_t arch_flags, - xnn_qs8_vmul_minmax_ukernel_fn vmulc, - xnn_init_qs8_mul_minmax_params_fn init_params) -{ - if (!benchmark::utils::CheckArchFlags(state, arch_flags)) { - return; - } - - const size_t num_elements = state.range(0); - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto i8rng = std::bind( - std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()), - std::ref(rng)); - - std::vector> a(num_elements); - std::vector> product(num_elements); - std::generate(a.begin(), a.end(), std::ref(i8rng)); - const int8_t b = i8rng(); - - union xnn_qs8_mul_minmax_params params; - init_params(¶ms, - 1 /* a zero point */, 1 /* b zero point */, 1 /* output zero point */, - 0.75f /* product-output scale */, - std::numeric_limits::min() + 1, std::numeric_limits::max() - 1); - for (auto _ : state) { - vmulc(num_elements * sizeof(int8_t), a.data(), &b, product.data(), ¶ms); - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - const size_t num_elements_per_iteration = num_elements; - state.counters["num_elements"] = - benchmark::Counter(uint64_t(state.iterations()) * num_elements_per_iteration, benchmark::Counter::kIsRate); - - const size_t bytes_per_iteration = 2 * num_elements * sizeof(int8_t); - state.counters["bytes"] = - benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); -} - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, 
params_type, init_params) \ - BENCHMARK_CAPTURE(qs8_vmulc, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::BinaryElementwiseParameters) \ - ->UseRealTime(); -#include "src/qs8-vmulc/qs8-vmulc-minmax-fp32.h" -#include "src/qs8-vmulc/qs8-vmulc-minmax-rndnu.h" -#undef XNN_UKERNEL_WITH_PARAMS - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/qu8-vadd.cc b/bench/qu8-vadd.cc deleted file mode 100644 index 7171877b435..00000000000 --- a/bench/qu8-vadd.cc +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include - -#include -#include "bench/utils.h" - -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/common.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vbinary.h" - - -static void qu8_vadd( - benchmark::State& state, - uint64_t arch_flags, - xnn_qu8_vadd_minmax_ukernel_fn vadd, - xnn_init_qu8_add_minmax_params_fn init_params) -{ - if (!benchmark::utils::CheckArchFlags(state, arch_flags)) { - return; - } - - const size_t num_elements = state.range(0); - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto u8rng = std::bind( - std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()), - std::ref(rng)); - - std::vector> a(num_elements); - std::vector> b(num_elements); - std::vector> sum(num_elements); - std::generate(a.begin(), a.end(), std::ref(u8rng)); - std::generate(b.begin(), b.end(), std::ref(u8rng)); - - struct xnn_qu8_add_minmax_params params; - init_params(¶ms, - 127 /* a zero point */, 127 /* b zero point */, 127 /* output zero point */, - 0.5f /* a-output scale */, 0.75f /* b-output scale */, - std::numeric_limits::min() + 1, 
std::numeric_limits::max() - 1); - for (auto _ : state) { - vadd(num_elements * sizeof(uint8_t), a.data(), b.data(), sum.data(), ¶ms); - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - const size_t num_elements_per_iteration = num_elements; - state.counters["num_elements"] = - benchmark::Counter(uint64_t(state.iterations()) * num_elements_per_iteration, benchmark::Counter::kIsRate); - - const size_t bytes_per_iteration = 3 * num_elements * sizeof(int8_t); - state.counters["bytes"] = - benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); -} - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - BENCHMARK_CAPTURE(qu8_vadd, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::BinaryElementwiseParameters) \ - ->UseRealTime(); -#include "src/qu8-vadd/qu8-vadd-minmax.h" -#undef XNN_UKERNEL_WITH_PARAMS - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/qu8-vaddc.cc b/bench/qu8-vaddc.cc deleted file mode 100644 index 0b7f84421a4..00000000000 --- a/bench/qu8-vaddc.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include -#include - -#include -#include "bench/utils.h" - -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/common.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vbinary.h" - - -static void qu8_vaddc( - benchmark::State& state, - uint64_t arch_flags, - xnn_qu8_vadd_minmax_ukernel_fn vaddc, - xnn_init_qu8_add_minmax_params_fn init_params) -{ - if (!benchmark::utils::CheckArchFlags(state, arch_flags)) { - return; - } - - const size_t num_elements = state.range(0); - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto u8rng = std::bind( - std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()), - std::ref(rng)); - - std::vector> a(num_elements); - std::vector> sum(num_elements); - std::generate(a.begin(), a.end(), std::ref(u8rng)); - const uint8_t b = u8rng(); - - struct xnn_qu8_add_minmax_params params; - init_params(¶ms, - 127 /* a zero point */, 127 /* b zero point */, 127 /* output zero point */, - 0.5f /* a-output scale */, 0.75f /* b-output scale */, - std::numeric_limits::min() + 1, std::numeric_limits::max() - 1); - for (auto _ : state) { - vaddc(num_elements * sizeof(uint8_t), a.data(), &b, sum.data(), ¶ms); - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - const size_t num_elements_per_iteration = num_elements; - state.counters["num_elements"] = - benchmark::Counter(uint64_t(state.iterations()) * num_elements_per_iteration, benchmark::Counter::kIsRate); - - const size_t bytes_per_iteration = 2 * num_elements * sizeof(int8_t); - state.counters["bytes"] = - benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); -} - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, 
vector_tile, datatype, params_type, init_params) \ - BENCHMARK_CAPTURE(qu8_vaddc, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::BinaryElementwiseParameters) \ - ->UseRealTime(); -#include "src/qu8-vaddc/qu8-vaddc-minmax.h" -#undef XNN_UKERNEL_WITH_PARAMS - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/qu8-vmul.cc b/bench/qu8-vmul.cc deleted file mode 100644 index 0d948bfc072..00000000000 --- a/bench/qu8-vmul.cc +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include - -#include -#include "bench/utils.h" - -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/common.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vbinary.h" - - -static void qu8_vmul( - benchmark::State& state, - uint64_t arch_flags, - xnn_qu8_vmul_minmax_ukernel_fn vmul, - xnn_init_qu8_mul_minmax_params_fn init_params) -{ - if (!benchmark::utils::CheckArchFlags(state, arch_flags)) { - return; - } - - const size_t num_elements = state.range(0); - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto u8rng = std::bind( - std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()), - std::ref(rng)); - - std::vector> a(num_elements); - std::vector> b(num_elements); - std::vector> product(num_elements); - std::generate(a.begin(), a.end(), std::ref(u8rng)); - std::generate(b.begin(), b.end(), std::ref(u8rng)); - - union xnn_qu8_mul_minmax_params params; - init_params(¶ms, - 127 /* a zero point */, 127 /* b zero point */, 127 /* output zero point */, - 0.75f /* product-output scale */, - std::numeric_limits::min() + 1, std::numeric_limits::max() - 1); - for (auto _ : state) { - 
vmul(num_elements * sizeof(uint8_t), a.data(), b.data(), product.data(), ¶ms); - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - const size_t num_elements_per_iteration = num_elements; - state.counters["num_elements"] = - benchmark::Counter(uint64_t(state.iterations()) * num_elements_per_iteration, benchmark::Counter::kIsRate); - - const size_t bytes_per_iteration = 3 * num_elements * sizeof(int8_t); - state.counters["bytes"] = - benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); -} - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - BENCHMARK_CAPTURE(qu8_vmul, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::BinaryElementwiseParameters) \ - ->UseRealTime(); -#include "src/qu8-vmul/qu8-vmul-minmax-fp32.h" -#include "src/qu8-vmul/qu8-vmul-minmax-rndnu.h" -#undef XNN_UKERNEL_WITH_PARAMS - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/qu8-vmulc.cc b/bench/qu8-vmulc.cc deleted file mode 100644 index ff0c6ad98fb..00000000000 --- a/bench/qu8-vmulc.cc +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include -#include - -#include "bench/utils.h" -#include - -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/common.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vbinary.h" - - -static void qu8_vmulc( - benchmark::State& state, - uint64_t arch_flags, - xnn_qu8_vmul_minmax_ukernel_fn vmulc, - xnn_init_qu8_mul_minmax_params_fn init_params) -{ - if (!benchmark::utils::CheckArchFlags(state, arch_flags)) { - return; - } - - const size_t num_elements = state.range(0); - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto u8rng = std::bind( - std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()), - std::ref(rng)); - - std::vector> a(num_elements); - std::vector> product(num_elements); - std::generate(a.begin(), a.end(), std::ref(u8rng)); - const uint8_t b = u8rng(); - - union xnn_qu8_mul_minmax_params params; - init_params(¶ms, - 127 /* a zero point */, 127 /* b zero point */, 127 /* output zero point */, - 0.75f /* product-output scale */, - std::numeric_limits::min() + 1, std::numeric_limits::max() - 1); - for (auto _ : state) { - vmulc(num_elements * sizeof(uint8_t), a.data(), &b, product.data(), ¶ms); - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - const size_t num_elements_per_iteration = num_elements; - state.counters["num_elements"] = - benchmark::Counter(uint64_t(state.iterations()) * num_elements_per_iteration, benchmark::Counter::kIsRate); - - const size_t bytes_per_iteration = 2 * num_elements * sizeof(int8_t); - state.counters["bytes"] = - benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); -} - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, 
datatype, params_type, init_params) \ - BENCHMARK_CAPTURE(qu8_vmulc, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::BinaryElementwiseParameters) \ - ->UseRealTime(); -#include "src/qu8-vmulc/qu8-vmulc-minmax-fp32.h" -#include "src/qu8-vmulc/qu8-vmulc-minmax-rndnu.h" -#undef XNN_UKERNEL_WITH_PARAMS - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/vbinary.cc b/bench/vbinary.cc new file mode 100644 index 00000000000..9e259777398 --- /dev/null +++ b/bench/vbinary.cc @@ -0,0 +1,224 @@ +// Copyright 2021 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include "xnnpack/vbinary.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "bench/utils.h" +#include "xnnpack.h" +#include "xnnpack/aligned-allocator.h" +#include "xnnpack/common.h" +#include "xnnpack/hardware-config.h" +#include "xnnpack/math.h" +#include "xnnpack/microfnptr.h" +#include "xnnpack/microparams-init.h" +#include "xnnpack/microparams.h" +#include + +template +struct UniformDistribution { + std::uniform_real_distribution dist{-10.0f, 10.0f}; + + template + T operator()(Generator& g) { + return dist(g); + } +}; + +template <> +struct UniformDistribution { + std::uniform_real_distribution dist{-10.0f, 10.0f}; + + template + xnn_float16 operator()(Generator& g) { + return dist(g); + } +}; + +template <> +struct UniformDistribution { + std::uniform_int_distribution dist{ + std::numeric_limits::lowest(), + std::numeric_limits::max()}; + + template + int8_t operator()(Generator& g) { + return dist(g); + } +}; + +template <> +struct UniformDistribution { + std::uniform_int_distribution dist{ + std::numeric_limits::lowest(), + std::numeric_limits::max()}; + + template + uint8_t operator()(Generator& g) { + return dist(g); + } +}; + +template +T make_params(InitFn init_fn, Args... 
args) { + T result; + init_fn(&result, args...); + return result; +} + +template +struct ParamsWrapper { + Params params; +}; + +template <> +struct ParamsWrapper { + xnn_qs8_add_minmax_params params = make_params( + xnn_init_qs8_add_minmax_scalar_params, 0, 0, 0, 1.0f, 1.0f, -128, 127); +}; + +template <> +struct ParamsWrapper { + xnn_qu8_add_minmax_params params = make_params( + xnn_init_qu8_add_minmax_scalar_params, 0, 0, 0, 1.0f, 1.0f, 0, 255); +}; + +template <> +struct ParamsWrapper { + xnn_qs8_mul_minmax_params params = make_params( + xnn_init_qs8_mul_minmax_scalar_params, 0, 0, 0, 1.0f, -128, 127); +}; + +template <> +struct ParamsWrapper { + xnn_qu8_mul_minmax_params params = make_params( + xnn_init_qu8_mul_minmax_scalar_params, 0, 0, 0, 1.0f, 0, 255); +}; + +// Microkernel function, templated on the `params` type. +template +using UKernelFn = void (*)(size_t, const T*, const T*, T*, + const UKernelParams* params); + +template +static void vbinary(benchmark::State& state, uint64_t arch_flags, + UKernelFn ukernel) { + if (!benchmark::utils::CheckArchFlags(state, arch_flags)) { + return; + } + + const size_t num_elements = state.range(0); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + UniformDistribution dist; + + std::vector> a(num_elements); + std::vector> b(num_elements); + std::vector> output(num_elements); + std::generate(a.begin(), a.end(), [&]() { return dist(rng); }); + std::generate(b.begin(), b.end(), [&]() { return dist(rng); }); + + ParamsWrapper params; + for (auto _ : state) { + ukernel(num_elements * sizeof(T), a.data(), b.data(), output.data(), + ¶ms.params); + } + + const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); + if (cpu_frequency != 0) { + state.counters["cpufreq"] = cpu_frequency; + } + + const size_t num_elements_per_iteration = num_elements; + state.counters["num_elements"] = + benchmark::Counter(uint64_t(state.iterations()) * num_elements_per_iteration, 
benchmark::Counter::kIsRate); + + const size_t bytes_per_iteration = 3 * num_elements * sizeof(T); + state.counters["bytes"] = + benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); +} + +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ + datatype, params_type, init_params) \ + BENCHMARK_CAPTURE(vbinary, ukernel, arch_flags, ukernel) \ + ->Apply( \ + benchmark::utils::BinaryElementwiseParameters) \ + ->UseRealTime(); +#include "src/f16-vbinary/f16-vadd-minmax.h" +#include "src/f16-vbinary/f16-vaddc-minmax.h" +#include "src/f16-vbinary/f16-vdiv-minmax.h" +#include "src/f16-vbinary/f16-vdivc-minmax.h" +#include "src/f16-vbinary/f16-vmax.h" +#include "src/f16-vbinary/f16-vmaxc.h" +#include "src/f16-vbinary/f16-vmin.h" +#include "src/f16-vbinary/f16-vminc.h" +#include "src/f16-vbinary/f16-vmul-minmax.h" +#include "src/f16-vbinary/f16-vmulc-minmax.h" +#include "src/f16-vbinary/f16-vprelu.h" +#include "src/f16-vbinary/f16-vpreluc.h" +#include "src/f16-vbinary/f16-vrdivc-minmax.h" +#include "src/f16-vbinary/f16-vrpreluc.h" +#include "src/f16-vbinary/f16-vrsubc-minmax.h" +#include "src/f16-vbinary/f16-vsqrdiff.h" +#include "src/f16-vbinary/f16-vsqrdiffc.h" +#include "src/f16-vbinary/f16-vsub-minmax.h" +#include "src/f16-vbinary/f16-vsubc-minmax.h" +#include "src/f32-vbinary/f32-vadd-minmax.h" +#include "src/f32-vbinary/f32-vadd.h" +#include "src/f32-vbinary/f32-vaddc-minmax.h" +#include "src/f32-vbinary/f32-vaddc.h" +#include "src/f32-vbinary/f32-vcopysign.h" +#include "src/f32-vbinary/f32-vcopysignc.h" +#include "src/f32-vbinary/f32-vdiv-minmax.h" +#include "src/f32-vbinary/f32-vdiv.h" +#include "src/f32-vbinary/f32-vdivc-minmax.h" +#include "src/f32-vbinary/f32-vdivc.h" +#include "src/f32-vbinary/f32-vmax.h" +#include "src/f32-vbinary/f32-vmaxc.h" +#include "src/f32-vbinary/f32-vmin.h" +#include "src/f32-vbinary/f32-vminc.h" +#include "src/f32-vbinary/f32-vmul-minmax.h" +#include 
"src/f32-vbinary/f32-vmul.h" +#include "src/f32-vbinary/f32-vmulc-minmax.h" +#include "src/f32-vbinary/f32-vmulc.h" +#include "src/f32-vbinary/f32-vprelu.h" +#include "src/f32-vbinary/f32-vpreluc.h" +#include "src/f32-vbinary/f32-vrcopysignc.h" +#include "src/f32-vbinary/f32-vrdivc-minmax.h" +#include "src/f32-vbinary/f32-vrdivc.h" +#include "src/f32-vbinary/f32-vrpreluc.h" +#include "src/f32-vbinary/f32-vrsubc-minmax.h" +#include "src/f32-vbinary/f32-vrsubc.h" +#include "src/f32-vbinary/f32-vsqrdiff.h" +#include "src/f32-vbinary/f32-vsqrdiffc.h" +#include "src/f32-vbinary/f32-vsub-minmax.h" +#include "src/f32-vbinary/f32-vsub.h" +#include "src/f32-vbinary/f32-vsubc-minmax.h" +#include "src/f32-vbinary/f32-vsubc.h" +#include "src/qs8-vadd/qs8-vadd-minmax.h" +#include "src/qs8-vaddc/qs8-vaddc-minmax.h" +#include "src/qs8-vmul/qs8-vmul-minmax-fp32.h" +#include "src/qs8-vmul/qs8-vmul-minmax-rndnu.h" +#include "src/qs8-vmulc/qs8-vmulc-minmax-fp32.h" +#include "src/qs8-vmulc/qs8-vmulc-minmax-rndnu.h" +#include "src/qu8-vadd/qu8-vadd-minmax.h" +#include "src/qu8-vaddc/qu8-vaddc-minmax.h" +#include "src/qu8-vmul/qu8-vmul-minmax-fp32.h" +#include "src/qu8-vmul/qu8-vmul-minmax-rndnu.h" +#include "src/qu8-vmulc/qu8-vmulc-minmax-fp32.h" +#include "src/qu8-vmulc/qu8-vmulc-minmax-rndnu.h" +#undef XNN_UKERNEL_WITH_PARAMS + +#ifndef XNNPACK_BENCHMARK_NO_MAIN +BENCHMARK_MAIN(); +#endif diff --git a/bench/vunary.cc b/bench/vunary.cc index f8bf850701e..05badb05fae 100644 --- a/bench/vunary.cc +++ b/bench/vunary.cc @@ -26,6 +26,50 @@ #include "xnnpack/vlrelu.h" #include +template +struct UniformDistribution { + std::uniform_real_distribution dist{-10.0f, 10.0f}; + + template + T operator()(Generator& g) { + return dist(g); + } +}; + +template <> +struct UniformDistribution { + std::uniform_real_distribution dist{-10.0f, 10.0f}; + + template + xnn_float16 operator()(Generator& g) { + return dist(g); + } +}; + +template <> +struct UniformDistribution { + std::uniform_int_distribution 
dist{ + std::numeric_limits::lowest(), + std::numeric_limits::max()}; + + template + int8_t operator()(Generator& g) { + return dist(g); + } +}; + +template <> +struct UniformDistribution { + std::uniform_int_distribution dist{ + std::numeric_limits::lowest(), + std::numeric_limits::max()}; + + template + uint8_t operator()(Generator& g) { + return dist(g); + } +}; + template T make_params(InitFn init_fn, Args... args) { T result; @@ -36,96 +80,70 @@ T make_params(InitFn init_fn, Args... args) { template struct Config { Params params; - static constexpr TIn min = std::numeric_limits::lowest(); - static constexpr TIn max = std::numeric_limits::max(); }; template <> struct Config { xnn_f16_minmax_params params = {{-1.0f, 1.0f}}; - static constexpr float min = -std::numeric_limits::infinity(); - static constexpr float max = +std::numeric_limits::infinity(); }; template <> struct Config { xnn_f32_minmax_params params = {{-1.0f, 1.0f}}; - static constexpr float min = -std::numeric_limits::infinity(); - static constexpr float max = +std::numeric_limits::infinity(); }; template <> struct Config { xnn_f16_elu_params params = {{1.0f, 1.0f, 1.0f}}; - static constexpr float min = -std::numeric_limits::infinity(); - static constexpr float max = +std::numeric_limits::infinity(); }; template <> struct Config { xnn_f32_elu_params params = {{1.0f, 1.0f, 1.0f}}; - static constexpr float min = -std::numeric_limits::infinity(); - static constexpr float max = +std::numeric_limits::infinity(); }; template <> struct Config { xnn_f16_lrelu_params params = {{0.01f}}; - static constexpr float min = -std::numeric_limits::infinity(); - static constexpr float max = +std::numeric_limits::infinity(); }; template <> struct Config { xnn_f32_lrelu_params params = {{0.01f}}; - static constexpr float min = -std::numeric_limits::infinity(); - static constexpr float max = +std::numeric_limits::infinity(); }; template <> struct Config { xnn_s8_minmax_params params = {{-100, 100}}; - static constexpr 
int8_t min = std::numeric_limits::lowest(); - static constexpr int8_t max = std::numeric_limits::max(); }; template <> struct Config { xnn_u8_minmax_params params = {{0, 200}}; - static constexpr uint8_t min = std::numeric_limits::lowest(); - static constexpr uint8_t max = std::numeric_limits::max(); }; template <> struct Config { xnn_qs8_lrelu_params params = make_params( xnn_init_qs8_lrelu_scalar_params, 0.1f, 1.0f, 1, 1); - static constexpr int8_t min = std::numeric_limits::lowest(); - static constexpr int8_t max = std::numeric_limits::max(); }; template <> struct Config { xnn_qu8_lrelu_params params = make_params( xnn_init_qu8_lrelu_scalar_params, 0.1f, 1.0f, 1, 1); - static constexpr int8_t min = std::numeric_limits::lowest(); - static constexpr int8_t max = std::numeric_limits::max(); }; template <> struct Config { xnn_qs8_hswish_params params = make_params( xnn_init_qs8_hswish_scalar_params, 0, 0, 1.0f, 1.0f); - static constexpr int8_t min = std::numeric_limits::lowest(); - static constexpr int8_t max = std::numeric_limits::max(); }; template <> struct Config { xnn_qu8_hswish_params params = make_params( xnn_init_qu8_hswish_scalar_params, 0, 0, 1.0f, 1.0f); - static constexpr int8_t min = std::numeric_limits::lowest(); - static constexpr int8_t max = std::numeric_limits::max(); }; // Microkernel function, templated on the `params` type. 
@@ -146,13 +164,11 @@ void vunary(benchmark::State& state, uint64_t arch_flags, std::random_device random_device; auto rng = std::mt19937(random_device()); - auto f32rng = - std::bind(std::uniform_real_distribution(config.min, config.max), - std::ref(rng)); + UniformDistribution dist; std::vector> x(num_elements); std::vector> y(num_elements); - std::generate(x.begin(), x.end(), f32rng); + std::generate(x.begin(), x.end(), [&]() { return dist(rng); }); std::fill(y.begin(), y.end(), 0); for (auto _ : state) { From 21012fbc700517fba9afce7bccd261f0c8f8a29b Mon Sep 17 00:00:00 2001 From: Alan Kelly Date: Thu, 19 Sep 2024 22:41:27 -0700 Subject: [PATCH 13/50] xnn_create_runtime_v4 allocates a non-shared workspace if none is provided PiperOrigin-RevId: 676701267 --- src/runtime.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/runtime.c b/src/runtime.c index 695b6cbc41b..b5350824e4c 100644 --- a/src/runtime.c +++ b/src/runtime.c @@ -464,9 +464,8 @@ enum xnn_status xnn_create_runtime_v4( } if (workspace == NULL) { - xnn_log_error("failed to create runtime: workspace is NULL"); - status = xnn_status_invalid_parameter; - goto error; + xnn_log_debug("Allocating non-shared workspace"); + workspace = xnn_allocate_zero_simd_memory(sizeof(struct xnn_workspace)); } const uint32_t optimization_flags = XNN_FLAG_HINT_SPARSE_INFERENCE | XNN_FLAG_HINT_FP16_INFERENCE | From bddd270e22bb0bbb72a8ee842c5bbf05994228c8 Mon Sep 17 00:00:00 2001 From: Alan Kelly Date: Thu, 19 Sep 2024 23:14:32 -0700 Subject: [PATCH 14/50] Log a clear error message when a hanging node has been found instead of crashing. 
PiperOrigin-RevId: 676709328 --- src/subgraph.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/subgraph.c b/src/subgraph.c index b012fbfc77d..625c0de6e8d 100644 --- a/src/subgraph.c +++ b/src/subgraph.c @@ -1145,7 +1145,10 @@ enum xnn_status xnn_subgraph_fusion( struct xnn_node* producer = &subgraph->nodes[producer_id]; assert(producer->type != xnn_node_type_invalid); struct xnn_node* consumer = &subgraph->nodes[consumer_id]; - assert(consumer->type != xnn_node_type_invalid); + if (consumer->type == xnn_node_type_invalid) { + xnn_log_fatal("Node %u has no consumers. Should an external output have been set?", consumer_id); + return xnn_status_invalid_state; + } // Try to fuse Clamp Node upstream into producer Node if (consumer->type == xnn_node_type_clamp) { From 447aff76b3bdadf073ac1008ca1857879bb2e014 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Fri, 20 Sep 2024 01:44:01 -0700 Subject: [PATCH 15/50] Allow 1x1 pooling This is maybe not something we expect to see in optimized code, but having them fail is annoying when trying to test user pipelines that we don't control, and it might actually be useful to do some strided copies. 
PiperOrigin-RevId: 676748461 --- src/operators/average-pooling-nhwc.c | 21 --------------------- src/subgraph/average-pooling-2d.c | 21 --------------------- test/average-pooling-2d.cc | 2 +- 3 files changed, 1 insertion(+), 43 deletions(-) diff --git a/src/operators/average-pooling-nhwc.c b/src/operators/average-pooling-nhwc.c index f1219ac1ad1..754da020202 100644 --- a/src/operators/average-pooling-nhwc.c +++ b/src/operators/average-pooling-nhwc.c @@ -68,13 +68,6 @@ enum xnn_status create_average_pooling2d_nhwc( return xnn_status_invalid_parameter; } - if (pooling_size == 1) { - xnn_log_error( - "failed to create %s operator with 1 pooling element: 1x1 pooling is meaningless", - xnn_operator_type_to_string(operator_type)); - return xnn_status_invalid_parameter; - } - if (stride_height == 0 || stride_width == 0) { xnn_log_error( "failed to create %s operator with %" PRIu32 "x%" PRIu32 " stride: stride dimensions must be non-zero", @@ -82,20 +75,6 @@ enum xnn_status create_average_pooling2d_nhwc( return xnn_status_invalid_parameter; } - if (stride_height > pooling_height) { - xnn_log_error( - "failed to create %s operator with %" PRIu32 " stride height: must be less than pooling height %" PRIu32, - xnn_operator_type_to_string(operator_type), stride_height, pooling_height); - return xnn_status_invalid_parameter; - } - - if (stride_width > pooling_width) { - xnn_log_error( - "failed to create %s operator with %" PRIu32 " stride width: must be less than pooling width %" PRIu32, - xnn_operator_type_to_string(operator_type), stride_width, pooling_width); - return xnn_status_invalid_parameter; - } - if (isnan(output_min)) { xnn_log_error( "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN", diff --git a/src/subgraph/average-pooling-2d.c b/src/subgraph/average-pooling-2d.c index 1e7edf879c8..b0d4b9a04dc 100644 --- a/src/subgraph/average-pooling-2d.c +++ b/src/subgraph/average-pooling-2d.c @@ -212,13 +212,6 @@ enum xnn_status 
xnn_define_average_pooling_2d( return xnn_status_invalid_parameter; } - if (pooling_size == 1) { - xnn_log_error( - "failed to define %s operator with 1 pooling element: 1x1 pooling is meaningless", - xnn_node_type_to_string(xnn_node_type_average_pooling_2d)); - return xnn_status_invalid_parameter; - } - if (stride_height == 0 || stride_width == 0) { xnn_log_error( "failed to define %s operator with %" PRIu32 "x%" PRIu32 " stride: " @@ -227,20 +220,6 @@ enum xnn_status xnn_define_average_pooling_2d( return xnn_status_invalid_parameter; } - if (stride_height > pooling_height) { - xnn_log_error( - "failed to define %s operator with %" PRIu32 " stride height: must be less than pooling height %" PRIu32, - xnn_node_type_to_string(xnn_node_type_max_pooling_2d), stride_height, pooling_height); - return xnn_status_invalid_parameter; - } - - if (stride_width > pooling_width) { - xnn_log_error( - "failed to define %s operator with %" PRIu32 " stride width: must be less than pooling width %" PRIu32, - xnn_node_type_to_string(xnn_node_type_max_pooling_2d), stride_width, pooling_width); - return xnn_status_invalid_parameter; - } - status = xnn_subgraph_check_output_min_max(xnn_node_type_average_pooling_2d, output_min, output_max); if (status != xnn_status_success) { return status; diff --git a/test/average-pooling-2d.cc b/test/average-pooling-2d.cc index d71cf0343aa..4602f2aded5 100644 --- a/test/average-pooling-2d.cc +++ b/test/average-pooling-2d.cc @@ -30,7 +30,7 @@ class AveragePoolingTest : public ::testing::Test { protected: AveragePoolingTest() { input_size_dist = std::uniform_int_distribution(10, 15); - pooling_size_dist = std::uniform_int_distribution(2, 5); + pooling_size_dist = std::uniform_int_distribution(1, 5); stride_dist = std::uniform_int_distribution(1, 2); batch_size = input_size_dist(rng); input_height = input_size_dist(rng); From 7a7c7e1f9e89e036caf607fa6c3493edc53eb99f Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Fri, 20 Sep 2024 06:52:41 -0700 
Subject: [PATCH 16/50] Pre-pack constant left-hand operands for `batch_matrix_multiply_nc_f32`, similarly to what is done for `qd8_f32_qcx8w`. PiperOrigin-RevId: 676828689 --- include/xnnpack.h | 7 +- src/operators/batch-matrix-multiply-nc.c | 166 ++++++++++++++++--- src/subgraph/batch-matrix-multiply.c | 38 ++++- src/xnnpack/operator.h | 6 +- test/BUILD.bazel | 1 + test/batch-matrix-multiply-operator-tester.h | 108 ++++++------ test/multiply2.cc | 10 +- 7 files changed, 253 insertions(+), 83 deletions(-) diff --git a/include/xnnpack.h b/include/xnnpack.h index 6e584f206cf..242dd477dd5 100644 --- a/include/xnnpack.h +++ b/include/xnnpack.h @@ -2614,8 +2614,11 @@ enum xnn_status xnn_setup_batch_matrix_multiply_nc_f16( const void* input_a, const void* input_b, void* output); enum xnn_status xnn_create_batch_matrix_multiply_nc_f32( - uint32_t flags, - xnn_operator_t* batch_matrix_multiply_op); + uint32_t flags, xnn_operator_t* batch_matrix_multiply_op); + +enum xnn_status xnn_create_batch_matrix_multiply_nc_f32_const_weights( + size_t batch_size_b, size_t k, size_t n, const float* data_b, + uint32_t flags, xnn_operator_t* batch_matrix_multiply_op); enum xnn_status xnn_reshape_batch_matrix_multiply_nc_f32( xnn_operator_t batch_matrix_multiply_op, size_t num_batch_dims, diff --git a/src/operators/batch-matrix-multiply-nc.c b/src/operators/batch-matrix-multiply-nc.c index ff260d62076..9e102ab34a8 100644 --- a/src/operators/batch-matrix-multiply-nc.c +++ b/src/operators/batch-matrix-multiply-nc.c @@ -90,9 +90,7 @@ enum xnn_status create_batch_matrix_multiply_nc( } enum xnn_status xnn_create_batch_matrix_multiply_nc_f32( - uint32_t flags, - xnn_operator_t* batch_matrix_multiply_op_out) -{ + uint32_t flags, xnn_operator_t* batch_matrix_multiply_op_out) { const struct xnn_gemm_config* gemm_config = xnn_init_f32_gemm_config(); if (gemm_config == NULL) { xnn_log_error("failed to create %s operator: unsupported hardware configuration", @@ -111,12 +109,117 @@ enum xnn_status 
xnn_create_batch_matrix_multiply_nc_f32( } return create_batch_matrix_multiply_nc( - flags, - ¶ms, sizeof(params), - gemm_config, gemm_ukernels, - (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w, - xnn_operator_type_batch_matrix_multiply_nc_f32, - batch_matrix_multiply_op_out); + flags, ¶ms, sizeof(params), gemm_config, gemm_ukernels, + (xnn_packw_gemm_gio_ukernel_fn)xnn_pack_f32_gemm_gio_w, + xnn_operator_type_batch_matrix_multiply_nc_f32, + batch_matrix_multiply_op_out); +} + +enum xnn_status xnn_create_batch_matrix_multiply_nc_f32_const_weights( + size_t batch_size_b, size_t k, size_t n, const float* data_b, + uint32_t flags, xnn_operator_t* batch_matrix_multiply_op_out) { + const enum xnn_status status = xnn_create_batch_matrix_multiply_nc_f32( + flags, batch_matrix_multiply_op_out); + if (status != xnn_status_success) { + return status; + } + + const struct xnn_gemm_config* gemm_config = xnn_init_f32_gemm_config(); + xnn_operator_t batch_matrix_multiply_op = *batch_matrix_multiply_op_out; + batch_matrix_multiply_op->context.gemm.const_weights = true; + + // Check if we've already cached the packed data for `B`. + uint32_t cache_seed = murmur_hash3( + &batch_matrix_multiply_op->context.gemm.gemm.gemm, + sizeof(batch_matrix_multiply_op->context.gemm.gemm.gemm), k * n); + if (batch_matrix_multiply_op->flags & XNN_FLAG_TRANSPOSE_WEIGHTS) { + cache_seed = ~cache_seed; + } + size_t cache_offset = XNN_CACHE_NOT_FOUND; + struct xnn_weights_cache_look_up_key cache_key; + cache_key.seed = cache_seed; + cache_key.kernel = data_b; + cache_key.bias = NULL; + if (use_weights_cache(batch_matrix_multiply_op)) { + cache_offset = xnn_weights_cache_look_up( + batch_matrix_multiply_op->weights_cache, &cache_key); + } + + // If the packed data has not been cached, pack and cache it. + if (cache_offset == XNN_CACHE_NOT_FOUND) { + // Compute the shape and size of the packed data. 
+ const uint32_t nr = batch_matrix_multiply_op->ukernel.gemm.nr; + const uint32_t kr = batch_matrix_multiply_op->ukernel.gemm.kr; + const uint32_t sr = batch_matrix_multiply_op->ukernel.gemm.sr; + const size_t bias_element_size = sizeof(float); + const size_t n_stride = round_up(n, nr); + const size_t k_stride = round_up_po2(k, kr * sr); + const size_t input_b_batch_stride = + (n_stride * bias_element_size + + ((n_stride * k_stride) << XNN_LOG2_SIZEOF_FLOAT)); + const size_t packed_size = batch_size_b * input_b_batch_stride; + const size_t aligned_size = + round_up_po2(packed_size, XNN_ALLOCATION_ALIGNMENT); + + // Allocate the packed weights. + void* packed_data = xnn_get_pointer_to_write_weights( + batch_matrix_multiply_op, aligned_size, /*padding_byte=*/0); + if (packed_data == NULL) { + xnn_log_error( + "failed to allocate %zu bytes for %s operator packed weights", + packed_size, + xnn_operator_type_to_string(batch_matrix_multiply_op->type)); + return xnn_status_out_of_memory; + } + xnn_log_debug( + "allocated %zu bytes for packed weights in %s operator (ptr=%p)", + aligned_size, + xnn_operator_type_to_string(batch_matrix_multiply_op->type), + packed_data); + + // Pack the weights. 
+ if (gemm_config->pack_weights_and_biases) { + gemm_config->pack_weights_and_biases(flags, gemm_config, k, n, + /*groups=*/batch_size_b, k_stride, + /*accumulator_init=*/NULL, + /*weights=*/data_b, + /*int_extra_data0_fn=*/NULL, + /*extra_data0=*/NULL, + /*extra_data0_size=*/0, + /*init_extra_data1_fn=*/ + NULL, + /*extra_data1=*/NULL, + /*extra_data1_size=*/0, + /*packed_weights_ptr=*/packed_data, + /*packing_params=*/NULL); + } else { + if (flags & XNN_FLAG_TRANSPOSE_WEIGHTS) { + batch_matrix_multiply_op->ukernel.gemm.packw_gemm_goi( + /*groups=*/batch_size_b, n, k, nr, kr, sr, data_b, + /*bias=*/NULL, /*scale=*/NULL, packed_data, + /*extra_bytes=*/0, /*packing_params=*/NULL); + } else { + batch_matrix_multiply_op->ukernel.gemm.packw_gemm_gio( + /*groups=*/batch_size_b, n, k, nr, kr, sr, n, data_b, /*bias=*/NULL, + /*scale=*/NULL, packed_data, + /*extra_bytes=*/0, /*packing_params=*/NULL); + } + } + + // Cache the weights. + if (use_weights_cache(batch_matrix_multiply_op)) { + batch_matrix_multiply_op->packed_weights.offset = + xnn_look_up_or_insert_weights_cache( + batch_matrix_multiply_op->weights_cache, &cache_key, packed_data, + aligned_size); + } + + } else { + // Retrieve the packed weights from the cache entry. 
+ batch_matrix_multiply_op->packed_weights.offset = cache_offset; + } + + return xnn_status_success; } enum xnn_status xnn_create_batch_matrix_multiply_nc_f16( @@ -387,19 +490,6 @@ static enum xnn_status reshape_batch_matrix_multiply_nc( const uint32_t kr = batch_matrix_multiply_op->ukernel.gemm.kr; const uint32_t sr = batch_matrix_multiply_op->ukernel.gemm.sr; - const size_t n_stride = round_up(n, nr); - const size_t k_stride = round_up_po2(k, kr * sr); - const size_t input_b_batch_stride = - (n_stride * bias_element_size + - ((n_stride * k_stride) << log2_input_b_element_size)); - - if (workspace_size != NULL) { - *workspace_size = batch_size_b * input_b_batch_stride; - } - if (workspace_alignment != NULL) { - *workspace_alignment = XNN_ALLOCATION_ALIGNMENT; - } - uint32_t mr = batch_matrix_multiply_op->ukernel.gemm.mr; struct xnn_hmp_gemm_ukernel *gemm_cases = batch_matrix_multiply_op->ukernel.gemm.gemm_cases; @@ -410,18 +500,37 @@ static enum xnn_status reshape_batch_matrix_multiply_nc( assert(mr != 0 && mr <= XNN_MAX_MR); struct xnn_hmp_gemm_ukernel gemm_ukernel = gemm_cases[mr-1]; - struct compute_parameters* gemm_compute = NULL; + struct compute_parameters* gemm_compute = + &batch_matrix_multiply_op->compute[0]; switch (batch_matrix_multiply_op->type) { case xnn_operator_type_batch_matrix_multiply_nc_qd8_f32_qc8w: // Nothing to do here, the `B` matrix has already been packed. - gemm_compute = &batch_matrix_multiply_op->compute[0]; break; case xnn_operator_type_batch_matrix_multiply_nc_f16: - case xnn_operator_type_batch_matrix_multiply_nc_f32: + case xnn_operator_type_batch_matrix_multiply_nc_f32: { + // Do nothing if the weights don't need to be packed. 
+ if (batch_matrix_multiply_op->context.gemm.const_weights) { + break; + } + gemm_compute = &batch_matrix_multiply_op->compute[1]; + const size_t n_stride = round_up(n, nr); + const size_t k_stride = round_up_po2(k, kr * sr); + const size_t input_b_batch_stride = + (n_stride * bias_element_size + + ((n_stride * k_stride) << log2_input_b_element_size)); + + // Compute the required workspace size. + if (workspace_size != NULL) { + *workspace_size = batch_size_b * input_b_batch_stride; + } + if (workspace_alignment != NULL) { + *workspace_alignment = XNN_ALLOCATION_ALIGNMENT; + } + if (batch_matrix_multiply_op->flags & XNN_FLAG_TRANSPOSE_B) { assert(batch_matrix_multiply_op->ukernel.gemm.packw_gemm_goi != NULL); batch_matrix_multiply_op->context.gemm.packw_gemm_goi = @@ -484,6 +593,7 @@ static enum xnn_status reshape_batch_matrix_multiply_nc( batch_matrix_multiply_op->compute[0].tile[0] = nr; } break; + } default: XNN_UNREACHABLE; } @@ -677,7 +787,11 @@ enum xnn_status xnn_setup_batch_matrix_multiply_nc_f32( return setup_batch_matrix_multiply_nc( batch_matrix_multiply_op, xnn_operator_type_batch_matrix_multiply_nc_f32, input_a, /*quantization_params=*/NULL, input_b, - /*packed_weights=*/workspace, output); + /*packed_weights=*/ + batch_matrix_multiply_op->context.gemm.const_weights + ? packed_weights(batch_matrix_multiply_op) + : workspace, + output); } enum xnn_status xnn_setup_batch_matrix_multiply_nc_qd8_f32_qc8w( diff --git a/src/subgraph/batch-matrix-multiply.c b/src/subgraph/batch-matrix-multiply.c index 1073be545b2..e2268202da2 100644 --- a/src/subgraph/batch-matrix-multiply.c +++ b/src/subgraph/batch-matrix-multiply.c @@ -55,9 +55,43 @@ static enum xnn_status create_batch_matrix_multiply_operator( break; case xnn_datatype_fp32: switch (inputb_datatype) { - case xnn_datatype_fp32: - status = xnn_create_batch_matrix_multiply_nc_f32(node->flags, &opdata->operator_objects[0]); + case xnn_datatype_fp32: { + // Get the shape and size of the second input. 
+ const uint32_t input_b_id = opdata->inputs[1]; + assert(input_b_id != XNN_INVALID_VALUE_ID); + assert(input_b_id < num_values); + const struct xnn_value* input_b = values + input_b_id; + if (xnn_value_is_static(input_b)) { + if (input_b->shape.num_dims < 2) { + xnn_log_error( + "failed to create %s operator with input_b ID #%" PRIu32 + ": unsupported number of dimension %zu, must be at least 2", + xnn_node_type_to_string(xnn_node_type_batch_matrix_multiply), + input_b_id, input_b->shape.num_dims); + return xnn_status_invalid_parameter; + } + size_t batch_size_b = 1; + for (size_t i = 0; i < input_b->shape.num_dims - 2; i++) { + batch_size_b *= input_b->shape.dim[i]; + } + const size_t k = + node->flags & XNN_FLAG_TRANSPOSE_B + ? input_b->shape.dim[input_b->shape.num_dims - 1] + : input_b->shape.dim[input_b->shape.num_dims - 2]; + const size_t n = + node->flags & XNN_FLAG_TRANSPOSE_B + ? input_b->shape.dim[input_b->shape.num_dims - 2] + : input_b->shape.dim[input_b->shape.num_dims - 1]; + + status = xnn_create_batch_matrix_multiply_nc_f32_const_weights( + batch_size_b, k, n, input_b->data, node->flags, + &opdata->operator_objects[0]); + } else { + status = xnn_create_batch_matrix_multiply_nc_f32( + node->flags, &opdata->operator_objects[0]); + } break; + } default: XNN_UNREACHABLE; } diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h index d1c543086c4..7a53ae4d79a 100644 --- a/src/xnnpack/operator.h +++ b/src/xnnpack/operator.h @@ -11,9 +11,10 @@ #include #include -#include "xnnpack/allocator.h" -#include "xnnpack/cache.h" +#include "xnnpack.h" +#include "xnnpack/common.h" #include "xnnpack/compute.h" +#include "xnnpack/microfnptr.h" #include "xnnpack/microkernel-type.h" #include "xnnpack/microparams.h" #include "xnnpack/operator-type.h" @@ -380,6 +381,7 @@ struct xnn_operator { } gemm; struct packw_gemm_goi_context packw_gemm_goi; struct packw_gemm_gio_context packw_gemm_gio; + bool const_weights; } gemm; struct global_average_pooling_nwc_context 
global_average_pooling_nwc; struct global_average_pooling_ncw_context global_average_pooling_ncw; diff --git a/test/BUILD.bazel b/test/BUILD.bazel index f10f2257fc2..83b716c5ca5 100644 --- a/test/BUILD.bazel +++ b/test/BUILD.bazel @@ -1877,6 +1877,7 @@ xnnpack_unit_test( ":subgraph_binary_tester", "@FP16", "//:XNNPACK", + "//:math", "//:node_type", "//:operators", "//:requantization", diff --git a/test/batch-matrix-multiply-operator-tester.h b/test/batch-matrix-multiply-operator-tester.h index b50cc5715c3..bdffb620eb6 100644 --- a/test/batch-matrix-multiply-operator-tester.h +++ b/test/batch-matrix-multiply-operator-tester.h @@ -358,57 +358,71 @@ class BatchMatMulOperatorTester { std::vector output(batch_size_output * m() * n()); std::vector output_ref(batch_size_output * m() * n()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input_a.begin(), input_a.end(), - [&]() { return f32dist(rng); }); - std::generate(input_b.begin(), input_b.end(), - [&]() { return f32dist(rng); }); - std::fill(output.begin(), output.end(), nanf("")); - std::fill(output_ref.begin(), output_ref.end(), 0.0f); + for (bool const_weights : {true, false}) { + for (size_t iteration = 0; iteration < iterations(); iteration++) { + std::generate(input_a.begin(), input_a.end(), + [&]() { return f32dist(rng); }); + std::generate(input_b.begin(), input_b.end(), + [&]() { return f32dist(rng); }); + std::fill(output.begin(), output.end(), nanf("")); + std::fill(output_ref.begin(), output_ref.end(), 0.0f); + + // Compute reference results. + ComputeReference(batch_dims_output, input_a.data(), input_b.data(), + output_ref.data(), ComputeRefF32); + + // Create, setup, run, and destroy Fully Connected operator. 
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + xnn_operator_t batch_matrix_multiply_op = nullptr; + + xnn_status status; + if (const_weights) { + status = xnn_create_batch_matrix_multiply_nc_f32_const_weights( + batch_size_b, k(), n(), input_b.data(), flags(), + &batch_matrix_multiply_op); + } else { + status = xnn_create_batch_matrix_multiply_nc_f32( + flags(), &batch_matrix_multiply_op); + } + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, batch_matrix_multiply_op); + + // Smart pointer to automatically delete batch_matrix_multiply_op. + std::unique_ptr + auto_batch_matrix_multiply_op(batch_matrix_multiply_op, + xnn_delete_operator); + + size_t workspace_size = 0; + size_t workspace_alignment = 0; + ASSERT_EQ(expected_status_reshape(), + xnn_reshape_batch_matrix_multiply_nc_f32( + batch_matrix_multiply_op, num_batch_dims, + batch_dims_a().data(), batch_dims_b().data(), m(), k(), + n(), &workspace_size, &workspace_alignment, + /*threadpool=*/nullptr)); + if (expected_status_reshape() != xnn_status_success) { + return; + } + if (!const_weights) { + ASSERT_NE(workspace_size, 0); + ASSERT_LE(workspace_alignment, XNN_ALLOCATION_ALIGNMENT); + } + std::vector> + workspace(workspace_size); - // Compute reference results. - ComputeReference(batch_dims_output, input_a.data(), input_b.data(), - output_ref.data(), ComputeRefF32); + ASSERT_EQ(xnn_status_success, + xnn_setup_batch_matrix_multiply_nc_f32( + batch_matrix_multiply_op, workspace.data(), + input_a.data(), input_b.data(), output.data())); - // Create, setup, run, and destroy Fully Connected operator. 
- ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - xnn_operator_t batch_matrix_multiply_op = nullptr; + ASSERT_EQ(xnn_status_success, xnn_run_operator(batch_matrix_multiply_op, + /*threadpool=*/nullptr)); - const xnn_status status = xnn_create_batch_matrix_multiply_nc_f32(flags(), &batch_matrix_multiply_op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); + VerifyF32(output, output_ref); } - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, batch_matrix_multiply_op); - - // Smart pointer to automatically delete batch_matrix_multiply_op. - std::unique_ptr auto_batch_matrix_multiply_op( - batch_matrix_multiply_op, xnn_delete_operator); - - size_t workspace_size = 0; - size_t workspace_alignment = 0; - ASSERT_EQ(expected_status_reshape(), - xnn_reshape_batch_matrix_multiply_nc_f32( - batch_matrix_multiply_op, num_batch_dims, - batch_dims_a().data(), batch_dims_b().data(), m(), k(), n(), - &workspace_size, &workspace_alignment, - /*threadpool=*/nullptr)); - if (expected_status_reshape() != xnn_status_success) { - return; - } - ASSERT_NE(workspace_size, 0); - ASSERT_LE(workspace_alignment, XNN_ALLOCATION_ALIGNMENT); - std::vector> workspace(workspace_size); - - ASSERT_EQ(xnn_status_success, - xnn_setup_batch_matrix_multiply_nc_f32( - batch_matrix_multiply_op, workspace.data(), input_a.data(), - input_b.data(), output.data())); - - ASSERT_EQ(xnn_status_success, - xnn_run_operator(batch_matrix_multiply_op, /*threadpool=*/nullptr)); - - VerifyF32(output, output_ref); } } diff --git a/test/multiply2.cc b/test/multiply2.cc index 54ca951a387..4ed45f5189c 100644 --- a/test/multiply2.cc +++ b/test/multiply2.cc @@ -7,13 +7,13 @@ #include #include #include +#include #include #include #include -#include #include "xnnpack.h" -#include "xnnpack/node-type.h" +#include "xnnpack/math.h" #include "xnnpack/operator.h" #include "xnnpack/requantization.h" #include "xnnpack/subgraph.h" @@ -347,8 +347,10 @@ TEST_F(MultiplyTestS32, 
matches_operator_api) { std::generate(input1.begin(), input1.end(), [&]() { return s32dist(rng); }); std::generate(input2.begin(), input2.end(), [&]() { return s32dist(rng); }); - std::fill(operator_output.begin(), operator_output.end(), INT_MAX); - std::fill(subgraph_output.begin(), subgraph_output.end(), INT_MAX); + std::fill(operator_output.begin(), operator_output.end(), + std::numeric_limits::max()); + std::fill(subgraph_output.begin(), subgraph_output.end(), + std::numeric_limits::max()); ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); From 1b449b43fb198d86767fb329cde06d59720bf5b7 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Fri, 20 Sep 2024 12:14:38 -0700 Subject: [PATCH 17/50] Don't mark unary and binary benchmark tests as slow. PiperOrigin-RevId: 676928346 --- bench/BUILD.bazel | 6 ------ 1 file changed, 6 deletions(-) diff --git a/bench/BUILD.bazel b/bench/BUILD.bazel index 309737e442f..5e9006dd815 100644 --- a/bench/BUILD.bazel +++ b/bench/BUILD.bazel @@ -329,18 +329,12 @@ xnnpack_benchmark( xnnpack_benchmark( name = "vunary_bench", srcs = ["vunary.cc"], - # TODO(b/367939259): This is not really that slow, but --config=ios_x86_64 fails to pass - # --benchmark_min_time=1x to the benchmark. - tags = xnnpack_slow_benchmark_tags(), deps = MICROKERNEL_BENCHMARK_DEPS, ) xnnpack_benchmark( name = "vbinary_bench", srcs = ["vbinary.cc"], - # TODO(b/367939259): This is not really that slow, but --config=ios_x86_64 fails to pass - # --benchmark_min_time=1x to the benchmark. 
- tags = xnnpack_slow_benchmark_tags(), deps = MICROKERNEL_BENCHMARK_DEPS, ) From e169381f19f2c80ca5770748eb56e27d3ddf4982 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Fri, 20 Sep 2024 15:06:41 -0700 Subject: [PATCH 18/50] generator script compare all outputs against input PiperOrigin-RevId: 676985747 --- scripts/check_files_changed.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/check_files_changed.py b/scripts/check_files_changed.py index 569386fbc03..9f51ab9e156 100755 --- a/scripts/check_files_changed.py +++ b/scripts/check_files_changed.py @@ -13,15 +13,16 @@ def main(): file_paths_contain = args.files_contain file_edit_dates = {} - inputs_outputs = {} + outputs = {} files_exist = {} with open(script_to_check, 'r') as f: for line in f.readlines(): if line.lstrip().startswith('#'):continue - file_names = line.split(' ') + file_names = [x.strip() for x in line.split(' ')] # Filter file names based on presence of 'files_contain' file_names = [x for x in file_names if file_paths_contain in x] if len(file_names) >= 3: + # outputs[file_names[1]] = file_names[0] # i==0 is script. i==1 is input file. 
i>=1 is for output files for i, file_name in enumerate(file_names): if file_name not in files_exist: @@ -29,8 +30,7 @@ def main(): # if file doesn't exist if not files_exist[file_name]: - # if output file does nott exist - print(file_name) + # if output file does not exist if i > 1: file_edit_dates[file_name] = 0 else: @@ -38,9 +38,9 @@ def main(): if file_name not in file_edit_dates: file_edit_dates[file_name] = os.path.getmtime(file_name) if i > 1: - inputs_outputs[file_names[1]] = file_name + outputs[file_name] = file_names[1] - for in_file, out_file in inputs_outputs.items(): + for out_file, in_file in outputs.items(): if file_edit_dates[in_file] > file_edit_dates[out_file]: print(out_file) From 37c81d8fb8e95fe205c32fd7de603d475e458fcd Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Fri, 20 Sep 2024 18:23:47 -0700 Subject: [PATCH 19/50] Remove concatenation of microkernel srcs for non-prod microkernel libraries PiperOrigin-RevId: 677039145 --- BUILD.bazel | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/BUILD.bazel b/BUILD.bazel index 2d41fc143cb..2da97e46eff 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -798,18 +798,6 @@ filegroup( "fi", compatible_with = [], ), - genrule( - name = arch + "_non_prod_microkernel_srcs", - srcs = non_prod_c_srcs_for_arch(arch), - outs = [arch + "_non_prod_microkernels.c"], - cmd = "if [ -z \"$(SRCS)\" ]; then " + - " echo \"\" > $@; " + - "else " + - " cat $(SRCS) | grep -E '^#include ' | sort -u > $@; " + - " cat $(SRCS) | grep -v -E '^#include ' >> $@; " + - "fi", - compatible_with = [], - ), xnnpack_cc_library_for_arch( name = arch + "_prod_microkernels", srcs = prod_asm_srcs_for_arch(arch) + [":" + arch + "_prod_microkernel_srcs"], @@ -821,7 +809,7 @@ filegroup( ), xnnpack_cc_library_for_arch( name = arch + "_all_microkernels", - srcs = non_prod_asm_srcs_for_arch(arch) + [":" + arch + "_non_prod_microkernel_srcs"], + srcs = non_prod_asm_srcs_for_arch(arch) + non_prod_c_srcs_for_arch(arch), 
arch = arch, compatible_with = [], defines = xnnpack_configurable_defines(), From 94f67ae51d6f1222fea50325cbe8d8a29ade1668 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Fri, 20 Sep 2024 18:52:25 -0700 Subject: [PATCH 20/50] Mark scripts as text and executable PiperOrigin-RevId: 677045271 --- scripts/generate-build-identifier.py | 3 +++ scripts/generate-f32-vunary.sh | 8 ++++---- scripts/generate-s32-f32-vcvt.sh | 4 ++-- scripts/generate-u32-f32-vcvt.sh | 4 ++-- 4 files changed, 11 insertions(+), 8 deletions(-) mode change 100644 => 100755 scripts/generate-build-identifier.py diff --git a/scripts/generate-build-identifier.py b/scripts/generate-build-identifier.py old mode 100644 new mode 100755 index 0ea57ef4123..f4ae4dfeada --- a/scripts/generate-build-identifier.py +++ b/scripts/generate-build-identifier.py @@ -22,15 +22,18 @@ " output." ), ) + parser.add_argument( "--output", required=True, action="store", help="Set the output" ) + parser.add_argument( "--input_file_list", required=False, action="store", help="Set an input file list to use instead of the arguments.", ) + parser.add_argument( "inputs", nargs="*", diff --git a/scripts/generate-f32-vunary.sh b/scripts/generate-f32-vunary.sh index 851747a2d38..8eaf7aa42bb 100755 --- a/scripts/generate-f32-vunary.sh +++ b/scripts/generate-f32-vunary.sh @@ -1,5 +1,5 @@ #!/bin/sh -# Copyright 2020 Google LLC +# Copyright 2024 Google LLC # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -25,17 +25,17 @@ tools/xngen src/f32-vunary/simd.c.in -D OP=ABS -D ARCH=scalar -D BATCH_TILES="1, tools/xngen src/f32-vunary/simd.c.in -D OP=NEG -D ARCH=scalar -D BATCH_TILES="1,2,4" -o src/f32-vunary/gen/f32-vneg-scalar.c tools/xngen src/f32-vunary/simd.c.in -D OP=SQR -D ARCH=scalar -D BATCH_TILES="1,2,4" -o src/f32-vunary/gen/f32-vsqr-scalar.c -################################# x86 128-bit ################################# +################################# x86 SSE2 ################################# tools/xngen src/f32-vunary/simd.c.in -D OP=ABS -D ARCH=sse2 -D BATCH_TILES="4,8,12" -o src/f32-vunary/gen/f32-vabs-sse2.c tools/xngen src/f32-vunary/simd.c.in -D OP=NEG -D ARCH=sse2 -D BATCH_TILES="4,8,12" -o src/f32-vunary/gen/f32-vneg-sse2.c tools/xngen src/f32-vunary/simd.c.in -D OP=SQR -D ARCH=sse2 -D BATCH_TILES="4,8,12" -o src/f32-vunary/gen/f32-vsqr-sse2.c -################################# x86 256-bit ################################# +################################# x86 AVX ################################# tools/xngen src/f32-vunary/simd.c.in -D OP=ABS -D ARCH=avx -D BATCH_TILES="8,16,24" -o src/f32-vunary/gen/f32-vabs-avx.c tools/xngen src/f32-vunary/simd.c.in -D OP=NEG -D ARCH=avx -D BATCH_TILES="8,16,24" -o src/f32-vunary/gen/f32-vneg-avx.c tools/xngen src/f32-vunary/simd.c.in -D OP=SQR -D ARCH=avx -D BATCH_TILES="8,16,24" -o src/f32-vunary/gen/f32-vsqr-avx.c -################################# x86 512-bit ################################# +################################# x86 AVX512 ################################# tools/xngen src/f32-vunary/simd.c.in -D OP=ABS -D ARCH=avx512f -D BATCH_TILES="16,32,48" -o src/f32-vunary/gen/f32-vabs-avx512f.c tools/xngen src/f32-vunary/simd.c.in -D OP=NEG -D ARCH=avx512f -D BATCH_TILES="16,32,48" -o src/f32-vunary/gen/f32-vneg-avx512f.c tools/xngen src/f32-vunary/simd.c.in -D OP=SQR -D ARCH=avx512f -D BATCH_TILES="16,32,48" -o src/f32-vunary/gen/f32-vsqr-avx512f.c diff --git a/scripts/generate-s32-f32-vcvt.sh 
b/scripts/generate-s32-f32-vcvt.sh index f30b2d82216..d2f5a7fa6c0 100755 --- a/scripts/generate-s32-f32-vcvt.sh +++ b/scripts/generate-s32-f32-vcvt.sh @@ -7,10 +7,10 @@ ################################## ARM NEON ################################### tools/xngen src/s32-f32-vcvt/simd.c.in -D BATCH_TILES=4,8,12,16 -D ARCH=neon -o src/s32-f32-vcvt/gen/s32-f32-vcvt-neon.c -################################# x86 256-bit ################################# +################################# x86 AVX2 ################################# tools/xngen src/s32-f32-vcvt/simd.c.in -D BATCH_TILES=8,16,24,32 -D ARCH=avx2 -o src/s32-f32-vcvt/gen/s32-f32-vcvt-avx2.c -################################# x86 512-bit ################################# +################################# x86 AVX512 ################################# tools/xngen src/s32-f32-vcvt/simd.c.in -D BATCH_TILES=16,32,48,64 -D ARCH=avx512f -o src/s32-f32-vcvt/gen/s32-f32-vcvt-avx512f.c ################################## WAsm SIMD ################################## diff --git a/scripts/generate-u32-f32-vcvt.sh b/scripts/generate-u32-f32-vcvt.sh index 1bfb8961fd1..b76c3d39dfe 100755 --- a/scripts/generate-u32-f32-vcvt.sh +++ b/scripts/generate-u32-f32-vcvt.sh @@ -7,10 +7,10 @@ ################################## ARM NEON ################################### tools/xngen src/u32-f32-vcvt/simd.c.in -D BATCH_TILES=4,8,12,16 -D ARCH=neon -o src/u32-f32-vcvt/gen/u32-f32-vcvt-neon.c -################################# x86 256-bit ################################# +################################# x86 AVX2 ################################# tools/xngen src/u32-f32-vcvt/simd.c.in -D BATCH_TILES=8,16,24,32 -D ARCH=avx2 -o src/u32-f32-vcvt/gen/u32-f32-vcvt-avx2.c -################################# x86 512-bit ################################# +################################# x86 AVX512 ################################# tools/xngen src/u32-f32-vcvt/simd.c.in -D BATCH_TILES=16,32,48,64 -D ARCH=avx512f -o 
src/u32-f32-vcvt/gen/u32-f32-vcvt-avx512f.c ################################## WAsm SIMD ################################## From fd55c5b781ccc75bc421079add2effe48c5c621f Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Fri, 20 Sep 2024 19:06:57 -0700 Subject: [PATCH 21/50] Fix missing generator change to remove int from names of x8-packw functions PiperOrigin-RevId: 677048425 --- src/x8-packw/scalar.c.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/x8-packw/scalar.c.in b/src/x8-packw/scalar.c.in index 43995c13e9b..30d5e1ed687 100644 --- a/src/x8-packw/scalar.c.in +++ b/src/x8-packw/scalar.c.in @@ -18,7 +18,7 @@ $assert TYPE in ["int8_t"] $BITS = {"int8_t": 8}[TYPE] $BTYPE = {"int8_t": "uint32_t"}[TYPE] $WTYPE = {"int8_t": "int8_t"}[TYPE] -void xnn_x${BITS}_packw_gemm_goi_ukernel_x${NR}__scalar_int_u${KBLOCK}( +void xnn_x${BITS}_packw_gemm_goi_ukernel_x${NR}__scalar_u${KBLOCK}( size_t g, size_t nc, size_t kc, From 34b2e491aea125144ac1f9ca12d547b2cc10a53f Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Fri, 20 Sep 2024 19:11:01 -0700 Subject: [PATCH 22/50] f32-raddstoreexpminusmax-avx-rr2 VEX128 version of SSE2 microkernel - Remove ABC and use SIMD_TILE PiperOrigin-RevId: 677049137 --- bench/f32-raddstoreexpminusmax.cc | 74 +++ cmake/gen/avx_microkernels.cmake | 12 + gen/avx_microkernels.bzl | 12 + scripts/generate-f32-raddstoreexpminusmax.sh | 38 +- ...raddstoreexpminusmax-avx-rr2-p5-u12-acc2.c | 257 ++++++++++ ...raddstoreexpminusmax-avx-rr2-p5-u12-acc3.c | 259 ++++++++++ .../f32-raddstoreexpminusmax-avx-rr2-p5-u12.c | 254 ++++++++++ ...raddstoreexpminusmax-avx-rr2-p5-u16-acc2.c | 273 +++++++++++ ...raddstoreexpminusmax-avx-rr2-p5-u16-acc4.c | 277 +++++++++++ .../f32-raddstoreexpminusmax-avx-rr2-p5-u16.c | 270 +++++++++++ ...raddstoreexpminusmax-avx-rr2-p5-u20-acc2.c | 289 ++++++++++++ ...raddstoreexpminusmax-avx-rr2-p5-u20-acc5.c | 295 ++++++++++++ .../f32-raddstoreexpminusmax-avx-rr2-p5-u20.c | 286 +++++++++++ 
.../f32-raddstoreexpminusmax-avx-rr2-p5-u4.c | 222 +++++++++ ...-raddstoreexpminusmax-avx-rr2-p5-u8-acc2.c | 241 ++++++++++ .../f32-raddstoreexpminusmax-avx-rr2-p5-u8.c | 238 ++++++++++ ...addstoreexpminusmax-sse2-rr2-p5-u12-acc2.c | 96 ++-- ...addstoreexpminusmax-sse2-rr2-p5-u12-acc3.c | 96 ++-- ...f32-raddstoreexpminusmax-sse2-rr2-p5-u12.c | 96 ++-- ...addstoreexpminusmax-sse2-rr2-p5-u16-acc2.c | 134 +++--- ...addstoreexpminusmax-sse2-rr2-p5-u16-acc4.c | 134 +++--- ...f32-raddstoreexpminusmax-sse2-rr2-p5-u16.c | 134 +++--- ...addstoreexpminusmax-sse2-rr2-p5-u20-acc2.c | 170 +++---- ...addstoreexpminusmax-sse2-rr2-p5-u20-acc5.c | 170 +++---- ...f32-raddstoreexpminusmax-sse2-rr2-p5-u20.c | 170 +++---- .../f32-raddstoreexpminusmax-sse2-rr2-p5-u4.c | 32 +- ...raddstoreexpminusmax-sse2-rr2-p5-u8-acc2.c | 64 +-- .../f32-raddstoreexpminusmax-sse2-rr2-p5-u8.c | 64 +-- src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in | 72 +-- src/xnnpack/raddstoreexpminusmax.h | 13 + test/f32-raddstoreexpminusmax.cc | 444 ++++++++++++++++++ test/f32-raddstoreexpminusmax.yaml | 16 +- 32 files changed, 4473 insertions(+), 729 deletions(-) create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc2.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc3.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc2.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc4.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc2.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc5.c create mode 100644 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u4.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8-acc2.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8.c diff --git a/bench/f32-raddstoreexpminusmax.cc b/bench/f32-raddstoreexpminusmax.cc index df11e04896a..4a3cae36122 100644 --- a/bench/f32-raddstoreexpminusmax.cc +++ b/bench/f32-raddstoreexpminusmax.cc @@ -734,6 +734,80 @@ static void f32_raddstoreexpminusmax( nullptr) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); + + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u4, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u4, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u8, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u8_acc2, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8_acc2, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u12, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u12_acc2, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc2, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u12_acc3, + 
xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc3, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u16, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u16_acc2, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc2, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u16_acc4, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc4, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u20, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u20_acc2, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc2, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u20_acc5, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc5, + nullptr) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_u4, xnn_f32_rmax_ukernel__sse_u16_acc4, xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u4, diff --git a/cmake/gen/avx_microkernels.cmake b/cmake/gen/avx_microkernels.cmake index 1ec0d1b32d1..4b54801c46f 100644 --- a/cmake/gen/avx_microkernels.cmake +++ 
b/cmake/gen/avx_microkernels.cmake @@ -184,6 +184,18 @@ SET(NON_PROD_AVX_MICROKERNEL_SRCS src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u8.c src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u16.c src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u24.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u4.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8-acc2.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc2.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc3.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc2.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc4.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc2.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc5.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c16.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c64.c src/f32-rminmax/gen/f32-rmax-avx-u8.c diff --git a/gen/avx_microkernels.bzl b/gen/avx_microkernels.bzl index ae1d8916f1a..ee8f9f70cf1 100644 --- a/gen/avx_microkernels.bzl +++ b/gen/avx_microkernels.bzl @@ -181,6 +181,18 @@ NON_PROD_AVX_MICROKERNEL_SRCS = [ "src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u8.c", "src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u16.c", "src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u24.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u4.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8-acc2.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8.c", + 
"src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc2.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc3.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc2.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc4.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc2.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc5.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c16.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c64.c", "src/f32-rminmax/gen/f32-rmax-avx-u8.c", diff --git a/scripts/generate-f32-raddstoreexpminusmax.sh b/scripts/generate-f32-raddstoreexpminusmax.sh index 57ad28ea64e..e789f74146f 100755 --- a/scripts/generate-f32-raddstoreexpminusmax.sh +++ b/scripts/generate-f32-raddstoreexpminusmax.sh @@ -62,18 +62,32 @@ tools/xngen src/f32-raddstoreexpminusmax/rvv-rr2-p6.c.in -D LMUL=2 -o src/f32-ra tools/xngen src/f32-raddstoreexpminusmax/rvv-rr2-p6.c.in -D LMUL=4 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-rvv-rr2-p6-u4v.c & ################################### x86 SSE2 ################################## -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=4 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u4.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8-acc2.c & 
-tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=3 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc3.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=4 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc4.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=5 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc5.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=4 -D ACCUMULATORS=1 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u4.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=1 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8.c & +tools/xngen 
src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=2 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8-acc2.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=1 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=2 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc2.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=3 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc3.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=1 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=2 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc2.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=4 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc4.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=1 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=2 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc2.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=5 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc5.c & + +################################### x86 AVX ################################## +tools/xngen 
src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=4 -D ACCUMULATORS=1 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u4.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=1 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=2 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8-acc2.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=1 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=2 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc2.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=3 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc3.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=1 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=2 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc2.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=4 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc4.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=1 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=2 -D AVX=1 -o 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc2.c & +tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=5 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc5.c & ################################### x86 AVX2 ################################## tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=32 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32.c & diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc2.c new file mode 100644 index 00000000000..01e3bc41ea6 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc2.c @@ -0,0 +1,257 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc2( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); + const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); + const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); + const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); + const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); + const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); + const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); + const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); + const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); + const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m128 vi_max = _mm_load1_ps(max); + + __m128 vacc0 = _mm_setzero_ps(); + __m128 vacc1 = _mm_setzero_ps(); + for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { + // Load 12 (3x4) inputs at a time. + const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); + const __m128 vi2 = _mm_loadu_ps(input + 8); + input += 12; + + // Subtract maximum input x := i - i_max. This implies x <= 0. 
+ const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); + const __m128 vx2 = _mm_sub_ps(vi2, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); + __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); + const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); + vn2 = _mm_sub_ps(vn2, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); + __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); + + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); + __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); + vt2 = _mm_mul_ps(vt2, vs2); + + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); + __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); + vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); + + // Store 12 (3x4) outputs at a time. + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); + _mm_storeu_ps(output + 8, vf2); + output += 12; + + // Accumulate computed exponents. + vacc0 = _mm_add_ps(vacc0, vf0); + vacc1 = _mm_add_ps(vacc1, vf1); + vacc0 = _mm_add_ps(vacc0, vf2); + } + // Add up all accumulators to vacc0 + vacc0 = _mm_add_ps(vacc0, vacc1); + + __m128 vacc = vacc0; + for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + input += 4; + + // Subtract maximum input x := i - i_max. 
This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + // Store 4 outputs at a time. + _mm_storeu_ps(output, vf); + output += 4; + + // Accumulate computed exponents. + vacc = _mm_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 3 * sizeof(float)); + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + + // Subtract maximum input x := i - i_max. This implies x <= 0. 
+ const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + if (batch & (2 * sizeof(float))) { + // Store 2 outputs at a time. + _mm_storel_pi((__m64*) output, vf); + output += 2; + + // Accumulate 2 computed exponents. + vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); + + vf = _mm_movehl_ps(vf, vf); + } + if (batch & (1 * sizeof(float))) { + // Store 1 output at a time. + _mm_store_ss(output, vf); + + // Accumulate 1 computed exponent. 
+ vacc = _mm_add_ss(vacc, vf); + } + } + // Reduce 4 batch in the SIMD register + vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); + vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); + _mm_store_ss(sum, vacc); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc3.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc3.c new file mode 100644 index 00000000000..4d1456328d0 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc3.c @@ -0,0 +1,259 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc3( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); + const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); + const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); + const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); + const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); + const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); + const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); + const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); + const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); + const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + 
XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m128 vi_max = _mm_load1_ps(max); + + __m128 vacc0 = _mm_setzero_ps(); + __m128 vacc1 = _mm_setzero_ps(); + __m128 vacc2 = _mm_setzero_ps(); + for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { + // Load 12 (3x4) inputs at a time. + const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); + const __m128 vi2 = _mm_loadu_ps(input + 8); + input += 12; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); + const __m128 vx2 = _mm_sub_ps(vi2, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); + __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); + const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); + vn2 = _mm_sub_ps(vn2, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); + __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); + + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); + __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); + vt2 = _mm_mul_ps(vt2, vs2); + + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); + __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); + vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); + + // Store 12 (3x4) outputs at a time. 
+ _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); + _mm_storeu_ps(output + 8, vf2); + output += 12; + + // Accumulate computed exponents. + vacc0 = _mm_add_ps(vacc0, vf0); + vacc1 = _mm_add_ps(vacc1, vf1); + vacc2 = _mm_add_ps(vacc2, vf2); + } + // Add up all accumulators to vacc0 + vacc0 = _mm_add_ps(vacc0, vacc1); + vacc0 = _mm_add_ps(vacc0, vacc2); + + __m128 vacc = vacc0; + for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + input += 4; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + // Store 4 outputs at a time. + _mm_storeu_ps(output, vf); + output += 4; + + // Accumulate computed exponents. + vacc = _mm_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 3 * sizeof(float)); + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + if (batch & (2 * sizeof(float))) { + // Store 2 outputs at a time. + _mm_storel_pi((__m64*) output, vf); + output += 2; + + // Accumulate 2 computed exponents. + vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); + + vf = _mm_movehl_ps(vf, vf); + } + if (batch & (1 * sizeof(float))) { + // Store 1 output at a time. + _mm_store_ss(output, vf); + + // Accumulate 1 computed exponent. + vacc = _mm_add_ss(vacc, vf); + } + } + // Reduce 4 batch in the SIMD register + vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); + vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); + _mm_store_ss(sum, vacc); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12.c new file mode 100644 index 00000000000..83b546b8140 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12.c @@ -0,0 +1,254 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); + const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); + const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); + const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); + const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); + const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); + const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); + const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); + const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); + const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m128 vi_max = _mm_load1_ps(max); + + __m128 vacc0 = _mm_setzero_ps(); + for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { + // Load 12 (3x4) inputs at a time. + const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); + const __m128 vi2 = _mm_loadu_ps(input + 8); + input += 12; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); + const __m128 vx2 = _mm_sub_ps(vi2, vi_max); + + // Compute reduced argument batch := round(x / log(2)). 
+ __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); + __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); + const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); + vn2 = _mm_sub_ps(vn2, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); + __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); + + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); + __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); + vt2 = _mm_mul_ps(vt2, vs2); + + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); + __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); + vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); + + // Store 12 (3x4) outputs at a time. + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); + _mm_storeu_ps(output + 8, vf2); + output += 12; + + // Accumulate computed exponents. + vacc0 = _mm_add_ps(vacc0, vf0); + vacc0 = _mm_add_ps(vacc0, vf1); + vacc0 = _mm_add_ps(vacc0, vf2); + } + + __m128 vacc = vacc0; + for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + input += 4; + + // Subtract maximum input x := i - i_max. This implies x <= 0. 
+ const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + // Store 4 outputs at a time. + _mm_storeu_ps(output, vf); + output += 4; + + // Accumulate computed exponents. + vacc = _mm_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 3 * sizeof(float)); + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + + // Subtract maximum input x := i - i_max. This implies x <= 0. 
+ const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + if (batch & (2 * sizeof(float))) { + // Store 2 outputs at a time. + _mm_storel_pi((__m64*) output, vf); + output += 2; + + // Accumulate 2 computed exponents. + vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); + + vf = _mm_movehl_ps(vf, vf); + } + if (batch & (1 * sizeof(float))) { + // Store 1 output at a time. + _mm_store_ss(output, vf); + + // Accumulate 1 computed exponent. 
+ vacc = _mm_add_ss(vacc, vf); + } + } + // Reduce 4 batch in the SIMD register + vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); + vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); + _mm_store_ss(sum, vacc); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc2.c new file mode 100644 index 00000000000..7bb87dc34d5 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc2.c @@ -0,0 +1,273 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc2( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); + const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); + const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); + const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); + const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); + const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); + const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); + const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); + const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); + const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + 
XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m128 vi_max = _mm_load1_ps(max); + + __m128 vacc0 = _mm_setzero_ps(); + __m128 vacc1 = _mm_setzero_ps(); + for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { + // Load 16 (4x4) inputs at a time. + const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); + const __m128 vi2 = _mm_loadu_ps(input + 8); + const __m128 vi3 = _mm_loadu_ps(input + 12); + input += 16; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); + const __m128 vx2 = _mm_sub_ps(vi2, vi_max); + const __m128 vx3 = _mm_sub_ps(vi3, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); + __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); + __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); + const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); + const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). 
+ vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); + vn2 = _mm_sub_ps(vn2, vmagic_bias); + vn3 = _mm_sub_ps(vn3, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); + __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); + __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); + + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); + __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); + __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); + vt2 = _mm_mul_ps(vt2, vs2); + vt3 = _mm_mul_ps(vt3, vs3); + + 
__m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); + __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); + __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); + vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); + vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); + + // Store 16 (4x4) outputs at a time. + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); + _mm_storeu_ps(output + 8, vf2); + _mm_storeu_ps(output + 12, vf3); + output += 16; + + // Accumulate computed exponents. + vacc0 = _mm_add_ps(vacc0, vf0); + vacc1 = _mm_add_ps(vacc1, vf1); + vacc0 = _mm_add_ps(vacc0, vf2); + vacc1 = _mm_add_ps(vacc1, vf3); + } + // Add up all accumulators to vacc0 + vacc0 = _mm_add_ps(vacc0, vacc1); + + __m128 vacc = vacc0; + for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + input += 4; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). 
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + // Store 4 outputs at a time. + _mm_storeu_ps(output, vf); + output += 4; + + // Accumulate computed exponents. + vacc = _mm_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 3 * sizeof(float)); + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). 
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + if (batch & (2 * sizeof(float))) { + // Store 2 outputs at a time. + _mm_storel_pi((__m64*) output, vf); + output += 2; + + // Accumulate 2 computed exponents. + vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); + + vf = _mm_movehl_ps(vf, vf); + } + if (batch & (1 * sizeof(float))) { + // Store 1 output at a time. + _mm_store_ss(output, vf); + + // Accumulate 1 computed exponent. + vacc = _mm_add_ss(vacc, vf); + } + } + // Reduce 4 batch in the SIMD register + vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); + vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); + _mm_store_ss(sum, vacc); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc4.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc4.c new file mode 100644 index 00000000000..fa79b152af6 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc4.c @@ -0,0 +1,277 @@ +// Auto-generated file. 
Do not edit! +// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc4( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); + const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); + const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); + const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); + const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); + const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); + const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); + const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); + const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); + const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m128 vi_max = _mm_load1_ps(max); + + __m128 vacc0 = _mm_setzero_ps(); + __m128 vacc1 = _mm_setzero_ps(); + __m128 vacc2 = _mm_setzero_ps(); + __m128 vacc3 = _mm_setzero_ps(); + for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { + // Load 16 (4x4) inputs at a time. 
+ const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); + const __m128 vi2 = _mm_loadu_ps(input + 8); + const __m128 vi3 = _mm_loadu_ps(input + 12); + input += 16; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); + const __m128 vx2 = _mm_sub_ps(vi2, vi_max); + const __m128 vx3 = _mm_sub_ps(vi3, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); + __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); + __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); + const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); + const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); + vn2 = _mm_sub_ps(vn2, vmagic_bias); + vn3 = _mm_sub_ps(vn3, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); + __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); + __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); + + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); + __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); + __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); + vt2 = _mm_mul_ps(vt2, vs2); + vt3 = _mm_mul_ps(vt3, vs3); + + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); + __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); + __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); + + // For inputs below zero cutoff, replace output with +0.0f. 
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); + vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); + vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); + + // Store 16 (4x4) outputs at a time. + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); + _mm_storeu_ps(output + 8, vf2); + _mm_storeu_ps(output + 12, vf3); + output += 16; + + // Accumulate computed exponents. + vacc0 = _mm_add_ps(vacc0, vf0); + vacc1 = _mm_add_ps(vacc1, vf1); + vacc2 = _mm_add_ps(vacc2, vf2); + vacc3 = _mm_add_ps(vacc3, vf3); + } + // Add up all accumulators to vacc0 + vacc0 = _mm_add_ps(vacc0, vacc1); + vacc2 = _mm_add_ps(vacc2, vacc3); + vacc0 = _mm_add_ps(vacc0, vacc2); + + __m128 vacc = vacc0; + for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + input += 4; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + // Store 4 outputs at a time. + _mm_storeu_ps(output, vf); + output += 4; + + // Accumulate computed exponents. + vacc = _mm_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 3 * sizeof(float)); + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + if (batch & (2 * sizeof(float))) { + // Store 2 outputs at a time. + _mm_storel_pi((__m64*) output, vf); + output += 2; + + // Accumulate 2 computed exponents. + vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); + + vf = _mm_movehl_ps(vf, vf); + } + if (batch & (1 * sizeof(float))) { + // Store 1 output at a time. + _mm_store_ss(output, vf); + + // Accumulate 1 computed exponent. + vacc = _mm_add_ss(vacc, vf); + } + } + // Reduce 4 batch in the SIMD register + vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); + vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); + _mm_store_ss(sum, vacc); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16.c new file mode 100644 index 00000000000..73afcd02a67 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16.c @@ -0,0 +1,270 @@ +// Auto-generated file. Do not edit! 
+// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); + const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); + const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); + const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); + const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); + const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); + const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); + const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); + const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); + const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m128 vi_max = _mm_load1_ps(max); + + __m128 vacc0 = _mm_setzero_ps(); + for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { + // Load 16 (4x4) inputs at a time. 
+ const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); + const __m128 vi2 = _mm_loadu_ps(input + 8); + const __m128 vi3 = _mm_loadu_ps(input + 12); + input += 16; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); + const __m128 vx2 = _mm_sub_ps(vi2, vi_max); + const __m128 vx3 = _mm_sub_ps(vi3, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); + __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); + __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); + const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); + const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); + vn2 = _mm_sub_ps(vn2, vmagic_bias); + vn3 = _mm_sub_ps(vn3, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); + __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); + __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); + + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); + __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); + __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); + vt2 = _mm_mul_ps(vt2, vs2); + vt3 = _mm_mul_ps(vt3, vs3); + + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); + __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); + __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); + + // For inputs below zero cutoff, replace output with +0.0f. 
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); + vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); + vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); + + // Store 16 (4x4) outputs at a time. + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); + _mm_storeu_ps(output + 8, vf2); + _mm_storeu_ps(output + 12, vf3); + output += 16; + + // Accumulate computed exponents. + vacc0 = _mm_add_ps(vacc0, vf0); + vacc0 = _mm_add_ps(vacc0, vf1); + vacc0 = _mm_add_ps(vacc0, vf2); + vacc0 = _mm_add_ps(vacc0, vf3); + } + + __m128 vacc = vacc0; + for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + input += 4; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + // Store 4 outputs at a time. + _mm_storeu_ps(output, vf); + output += 4; + + // Accumulate computed exponents. + vacc = _mm_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 3 * sizeof(float)); + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + if (batch & (2 * sizeof(float))) { + // Store 2 outputs at a time. + _mm_storel_pi((__m64*) output, vf); + output += 2; + + // Accumulate 2 computed exponents. + vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); + + vf = _mm_movehl_ps(vf, vf); + } + if (batch & (1 * sizeof(float))) { + // Store 1 output at a time. + _mm_store_ss(output, vf); + + // Accumulate 1 computed exponent. + vacc = _mm_add_ss(vacc, vf); + } + } + // Reduce 4 batch in the SIMD register + vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); + vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); + _mm_store_ss(sum, vacc); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc2.c new file mode 100644 index 00000000000..dd99ce2cf0d --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc2.c @@ -0,0 +1,289 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc2( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); + const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); + const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); + const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); + const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); + const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); + const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); + const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); + const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); + const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m128 vi_max = _mm_load1_ps(max); + + __m128 vacc0 = _mm_setzero_ps(); + __m128 vacc1 = _mm_setzero_ps(); + for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { + // Load 20 (5x4) inputs at a time. + const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); + const __m128 vi2 = _mm_loadu_ps(input + 8); + const __m128 vi3 = _mm_loadu_ps(input + 12); + const __m128 vi4 = _mm_loadu_ps(input + 16); + input += 20; + + // Subtract maximum input x := i - i_max. This implies x <= 0. 
+ const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); + const __m128 vx2 = _mm_sub_ps(vi2, vi_max); + const __m128 vx3 = _mm_sub_ps(vi3, vi_max); + const __m128 vx4 = _mm_sub_ps(vi4, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); + __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); + __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); + __m128 vn4 = _mm_add_ps(_mm_mul_ps(vx4, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); + const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); + const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); + const __m128 vs4 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); + vn2 = _mm_sub_ps(vn2, vmagic_bias); + vn3 = _mm_sub_ps(vn3, vmagic_bias); + vn4 = _mm_sub_ps(vn4, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); + __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); + __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); + __m128 vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_hi), vx4); + + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); + vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_lo), vt4); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); + __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); + __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); + __m128 vp4 = _mm_add_ps(_mm_mul_ps(vc5, vt4), vc4); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); + vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc3); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); + vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc2); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); + vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); + vt2 = _mm_mul_ps(vt2, vs2); + vt3 = _mm_mul_ps(vt3, vs3); + vt4 
= _mm_mul_ps(vt4, vs4); + + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); + __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); + __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); + __m128 vf4 = _mm_add_ps(_mm_mul_ps(vt4, vp4), vs4); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); + vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); + vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); + vf4 = _mm_andnot_ps(_mm_cmplt_ps(vx4, vdenorm_cutoff), vf4); + + // Store 20 (5x4) outputs at a time. + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); + _mm_storeu_ps(output + 8, vf2); + _mm_storeu_ps(output + 12, vf3); + _mm_storeu_ps(output + 16, vf4); + output += 20; + + // Accumulate computed exponents. + vacc0 = _mm_add_ps(vacc0, vf0); + vacc1 = _mm_add_ps(vacc1, vf1); + vacc0 = _mm_add_ps(vacc0, vf2); + vacc1 = _mm_add_ps(vacc1, vf3); + vacc0 = _mm_add_ps(vacc0, vf4); + } + // Add up all accumulators to vacc0 + vacc0 = _mm_add_ps(vacc0, vacc1); + + __m128 vacc = vacc0; + for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + input += 4; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + // Store 4 outputs at a time. + _mm_storeu_ps(output, vf); + output += 4; + + // Accumulate computed exponents. + vacc = _mm_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 3 * sizeof(float)); + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + if (batch & (2 * sizeof(float))) { + // Store 2 outputs at a time. + _mm_storel_pi((__m64*) output, vf); + output += 2; + + // Accumulate 2 computed exponents. + vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); + + vf = _mm_movehl_ps(vf, vf); + } + if (batch & (1 * sizeof(float))) { + // Store 1 output at a time. + _mm_store_ss(output, vf); + + // Accumulate 1 computed exponent. 
+ vacc = _mm_add_ss(vacc, vf); + } + } + // Reduce 4 batch in the SIMD register + vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); + vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); + _mm_store_ss(sum, vacc); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc5.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc5.c new file mode 100644 index 00000000000..c83c2044873 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc5.c @@ -0,0 +1,295 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <assert.h> + +#include <immintrin.h> + +#include "xnnpack/common.h" +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc5( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); + const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); + const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); + const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); + const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); + const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); + const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); + const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); + const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); + const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + 
XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m128 vi_max = _mm_load1_ps(max); + + __m128 vacc0 = _mm_setzero_ps(); + __m128 vacc1 = _mm_setzero_ps(); + __m128 vacc2 = _mm_setzero_ps(); + __m128 vacc3 = _mm_setzero_ps(); + __m128 vacc4 = _mm_setzero_ps(); + for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { + // Load 20 (5x4) inputs at a time. + const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); + const __m128 vi2 = _mm_loadu_ps(input + 8); + const __m128 vi3 = _mm_loadu_ps(input + 12); + const __m128 vi4 = _mm_loadu_ps(input + 16); + input += 20; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); + const __m128 vx2 = _mm_sub_ps(vi2, vi_max); + const __m128 vx3 = _mm_sub_ps(vi3, vi_max); + const __m128 vx4 = _mm_sub_ps(vi4, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); + __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); + __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); + __m128 vn4 = _mm_add_ps(_mm_mul_ps(vx4, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
+ const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); + const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); + const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); + const __m128 vs4 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); + vn2 = _mm_sub_ps(vn2, vmagic_bias); + vn3 = _mm_sub_ps(vn3, vmagic_bias); + vn4 = _mm_sub_ps(vn4, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); + __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); + __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); + __m128 vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_hi), vx4); + + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); + vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_lo), vt4); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); + __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); + __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); + __m128 vp4 = _mm_add_ps(_mm_mul_ps(vc5, vt4), vc4); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); + vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc3); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); + vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc2); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); + vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); + vt2 = _mm_mul_ps(vt2, vs2); + vt3 = _mm_mul_ps(vt3, vs3); + vt4 = _mm_mul_ps(vt4, vs4); + + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); + __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); + __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); + __m128 vf4 = _mm_add_ps(_mm_mul_ps(vt4, vp4), vs4); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
+ vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); + vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); + vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); + vf4 = _mm_andnot_ps(_mm_cmplt_ps(vx4, vdenorm_cutoff), vf4); + + // Store 20 (5x4) outputs at a time. + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); + _mm_storeu_ps(output + 8, vf2); + _mm_storeu_ps(output + 12, vf3); + _mm_storeu_ps(output + 16, vf4); + output += 20; + + // Accumulate computed exponents. + vacc0 = _mm_add_ps(vacc0, vf0); + vacc1 = _mm_add_ps(vacc1, vf1); + vacc2 = _mm_add_ps(vacc2, vf2); + vacc3 = _mm_add_ps(vacc3, vf3); + vacc4 = _mm_add_ps(vacc4, vf4); + } + // Add up all accumulators to vacc0 + vacc0 = _mm_add_ps(vacc0, vacc1); + vacc2 = _mm_add_ps(vacc2, vacc3); + vacc0 = _mm_add_ps(vacc0, vacc2); + vacc0 = _mm_add_ps(vacc0, vacc4); + + __m128 vacc = vacc0; + for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + input += 4; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + // Store 4 outputs at a time. + _mm_storeu_ps(output, vf); + output += 4; + + // Accumulate computed exponents. + vacc = _mm_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 3 * sizeof(float)); + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + if (batch & (2 * sizeof(float))) { + // Store 2 outputs at a time. + _mm_storel_pi((__m64*) output, vf); + output += 2; + + // Accumulate 2 computed exponents. + vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); + + vf = _mm_movehl_ps(vf, vf); + } + if (batch & (1 * sizeof(float))) { + // Store 1 output at a time. + _mm_store_ss(output, vf); + + // Accumulate 1 computed exponent. + vacc = _mm_add_ss(vacc, vf); + } + } + // Reduce 4 batch in the SIMD register + vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); + vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); + _mm_store_ss(sum, vacc); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20.c new file mode 100644 index 00000000000..d6ee4ad84c4 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20.c @@ -0,0 +1,286 @@ +// Auto-generated file. Do not edit! 
+// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <assert.h> + +#include <immintrin.h> + +#include "xnnpack/common.h" +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); + const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); + const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); + const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); + const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); + const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); + const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); + const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); + const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); + const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m128 vi_max = _mm_load1_ps(max); + + __m128 vacc0 = _mm_setzero_ps(); + for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { + // Load 20 (5x4) inputs at a time. 
+ const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); + const __m128 vi2 = _mm_loadu_ps(input + 8); + const __m128 vi3 = _mm_loadu_ps(input + 12); + const __m128 vi4 = _mm_loadu_ps(input + 16); + input += 20; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); + const __m128 vx2 = _mm_sub_ps(vi2, vi_max); + const __m128 vx3 = _mm_sub_ps(vi3, vi_max); + const __m128 vx4 = _mm_sub_ps(vi4, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); + __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); + __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); + __m128 vn4 = _mm_add_ps(_mm_mul_ps(vx4, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); + const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); + const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); + const __m128 vs4 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); + vn2 = _mm_sub_ps(vn2, vmagic_bias); + vn3 = _mm_sub_ps(vn3, vmagic_bias); + vn4 = _mm_sub_ps(vn4, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); + __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); + __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); + __m128 vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_hi), vx4); + + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); + vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_lo), vt4); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); + __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); + __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); + __m128 vp4 = _mm_add_ps(_mm_mul_ps(vc5, vt4), vc4); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); + vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc3); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); + vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc2); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); + vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); + vt2 = _mm_mul_ps(vt2, vs2); + vt3 = _mm_mul_ps(vt3, vs3); + vt4 
= _mm_mul_ps(vt4, vs4); + + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); + __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); + __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); + __m128 vf4 = _mm_add_ps(_mm_mul_ps(vt4, vp4), vs4); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); + vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); + vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); + vf4 = _mm_andnot_ps(_mm_cmplt_ps(vx4, vdenorm_cutoff), vf4); + + // Store 20 (5x4) outputs at a time. + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); + _mm_storeu_ps(output + 8, vf2); + _mm_storeu_ps(output + 12, vf3); + _mm_storeu_ps(output + 16, vf4); + output += 20; + + // Accumulate computed exponents. + vacc0 = _mm_add_ps(vacc0, vf0); + vacc0 = _mm_add_ps(vacc0, vf1); + vacc0 = _mm_add_ps(vacc0, vf2); + vacc0 = _mm_add_ps(vacc0, vf3); + vacc0 = _mm_add_ps(vacc0, vf4); + } + + __m128 vacc = vacc0; + for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + input += 4; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). 
+ vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + // Store 4 outputs at a time. + _mm_storeu_ps(output, vf); + output += 4; + + // Accumulate computed exponents. + vacc = _mm_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 3 * sizeof(float)); + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). 
+ vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + if (batch & (2 * sizeof(float))) { + // Store 2 outputs at a time. + _mm_storel_pi((__m64*) output, vf); + output += 2; + + // Accumulate 2 computed exponents. + vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); + + vf = _mm_movehl_ps(vf, vf); + } + if (batch & (1 * sizeof(float))) { + // Store 1 output at a time. + _mm_store_ss(output, vf); + + // Accumulate 1 computed exponent. 
+ vacc = _mm_add_ss(vacc, vf); + } + } + // Reduce 4 batch in the SIMD register + vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); + vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); + _mm_store_ss(sum, vacc); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u4.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u4.c new file mode 100644 index 00000000000..ee1ac67db15 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u4.c @@ -0,0 +1,222 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <assert.h> + +#include <immintrin.h> + +#include "xnnpack/common.h" +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u4( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); + const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); + const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); + const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); + const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); + const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); + const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); + const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); + const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); + const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + 
XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m128 vi_max = _mm_load1_ps(max); + + __m128 vacc0 = _mm_setzero_ps(); + for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { + // Load 4 (1x4) inputs at a time. + const __m128 vi0 = _mm_loadu_ps(input); + input += 4; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm_sub_ps(vn0, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm_mul_ps(vt0, vs0); + + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + + // Store 4 (1x4) outputs at a time. + _mm_storeu_ps(output, vf0); + output += 4; + + // Accumulate computed exponents. + vacc0 = _mm_add_ps(vacc0, vf0); + } + + __m128 vacc = vacc0; + for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + input += 4; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + // Store 4 outputs at a time. + _mm_storeu_ps(output, vf); + output += 4; + + // Accumulate computed exponents. + vacc = _mm_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 3 * sizeof(float)); + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + if (batch & (2 * sizeof(float))) { + // Store 2 outputs at a time. + _mm_storel_pi((__m64*) output, vf); + output += 2; + + // Accumulate 2 computed exponents. + vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); + + vf = _mm_movehl_ps(vf, vf); + } + if (batch & (1 * sizeof(float))) { + // Store 1 output at a time. + _mm_store_ss(output, vf); + + // Accumulate 1 computed exponent. + vacc = _mm_add_ss(vacc, vf); + } + } + // Reduce 4 batch in the SIMD register + vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); + vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); + _mm_store_ss(sum, vacc); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8-acc2.c new file mode 100644 index 00000000000..92ca6f83081 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8-acc2.c @@ -0,0 +1,241 @@ +// Auto-generated file. Do not edit! 
+// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8_acc2( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); + const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); + const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); + const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); + const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); + const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); + const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); + const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); + const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); + const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m128 vi_max = _mm_load1_ps(max); + + __m128 vacc0 = _mm_setzero_ps(); + __m128 vacc1 = _mm_setzero_ps(); + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + // Load 8 (2x4) inputs at a time. 
+ const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); + input += 8; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); + + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); + + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); + + // Store 8 (2x4) outputs at a time. + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); + output += 8; + + // Accumulate computed exponents. + vacc0 = _mm_add_ps(vacc0, vf0); + vacc1 = _mm_add_ps(vacc1, vf1); + } + // Add up all accumulators to vacc0 + vacc0 = _mm_add_ps(vacc0, vacc1); + + __m128 vacc = vacc0; + for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + input += 4; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + // Store 4 outputs at a time. + _mm_storeu_ps(output, vf); + output += 4; + + // Accumulate computed exponents. + vacc = _mm_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 3 * sizeof(float)); + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + if (batch & (2 * sizeof(float))) { + // Store 2 outputs at a time. + _mm_storel_pi((__m64*) output, vf); + output += 2; + + // Accumulate 2 computed exponents. + vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); + + vf = _mm_movehl_ps(vf, vf); + } + if (batch & (1 * sizeof(float))) { + // Store 1 output at a time. + _mm_store_ss(output, vf); + + // Accumulate 1 computed exponent. 
+ vacc = _mm_add_ss(vacc, vf); + } + } + // Reduce 4 batch in the SIMD register + vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); + vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); + _mm_store_ss(sum, vacc); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8.c new file mode 100644 index 00000000000..742b25b307a --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8.c @@ -0,0 +1,238 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); + const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); + const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); + const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); + const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); + const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); + const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); + const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); + const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); + const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + 
XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m128 vi_max = _mm_load1_ps(max); + + __m128 vacc0 = _mm_setzero_ps(); + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + // Load 8 (2x4) inputs at a time. + const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); + input += 8; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); + + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); + + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); + + // Store 8 (2x4) outputs at a time. + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); + output += 8; + + // Accumulate computed exponents. + vacc0 = _mm_add_ps(vacc0, vf0); + vacc0 = _mm_add_ps(vacc0, vf1); + } + + __m128 vacc = vacc0; + for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + input += 4; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + // Store 4 outputs at a time. + _mm_storeu_ps(output, vf); + output += 4; + + // Accumulate computed exponents. + vacc = _mm_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 3 * sizeof(float)); + // Load 4 inputs at a time. + const __m128 vi = _mm_loadu_ps(input); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m128 vx = _mm_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); + vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm_mul_ps(vt, vs); + __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); + + if (batch & (2 * sizeof(float))) { + // Store 2 outputs at a time. + _mm_storel_pi((__m64*) output, vf); + output += 2; + + // Accumulate 2 computed exponents. + vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); + + vf = _mm_movehl_ps(vf, vf); + } + if (batch & (1 * sizeof(float))) { + // Store 1 output at a time. + _mm_store_ss(output, vf); + + // Accumulate 1 computed exponent. 
+ vacc = _mm_add_ss(vacc, vf); + } + } + // Reduce 4 batch in the SIMD register + vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); + vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); + _mm_store_ss(sum, vacc); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc2.c index c269417fe1d..2ff1d6bfb9b 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc2.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc2.c @@ -58,87 +58,87 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc2( __m128 vacc1 = _mm_setzero_ps(); for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { // Load 12 (3x4) inputs at a time. - const __m128 vi0123 = _mm_loadu_ps(input); - const __m128 vi4567 = _mm_loadu_ps(input + 4); - const __m128 vi89AB = _mm_loadu_ps(input + 8); + const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); + const __m128 vi2 = _mm_loadu_ps(input + 8); input += 12; // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max); - const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max); - const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max); + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); + const __m128 vx2 = _mm_sub_ps(vi2, vi_max); // Compute reduced argument batch := round(x / log(2)). 
- __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias); - __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias); - __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias); + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); + __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23)); - const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23)); - const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23)); + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); + const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); // Subtract the large number back to get final batch := round(x / log(2)). - vn0123 = _mm_sub_ps(vn0123, vmagic_bias); - vn4567 = _mm_sub_ps(vn4567, vmagic_bias); - vn89AB = _mm_sub_ps(vn89AB, vmagic_bias); + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); + vn2 = _mm_sub_ps(vn2, vmagic_bias); // Compute reduced argument t := x - batch * log(2). // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123); - __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567); - __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB); + __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); + __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); - vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123); - vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567); - vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB); + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4); - __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4); - __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4); + __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); + __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3); + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2); + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1); + vp0 = 
_mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); // Reconstruct the final f value: // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) // = s + (t * s) * p - vt0123 = _mm_mul_ps(vt0123, vs0123); - vt4567 = _mm_mul_ps(vt4567, vs4567); - vt89AB = _mm_mul_ps(vt89AB, vs89AB); + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); + vt2 = _mm_mul_ps(vt2, vs2); - __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123); - __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567); - __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB); + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); + __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); // For inputs below zero cutoff, replace output with +0.0f. // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123); - vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567); - vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB); + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); + vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); // Store 12 (3x4) outputs at a time. - _mm_storeu_ps(output, vf0123); - _mm_storeu_ps(output + 4, vf4567); - _mm_storeu_ps(output + 8, vf89AB); + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); + _mm_storeu_ps(output + 8, vf2); output += 12; // Accumulate computed exponents. 
- vacc0 = _mm_add_ps(vacc0, vf0123); - vacc0 = _mm_add_ps(vacc0, vf4567); - vacc0 = _mm_add_ps(vacc0, vf89AB); + vacc0 = _mm_add_ps(vacc0, vf0); + vacc1 = _mm_add_ps(vacc1, vf1); + vacc0 = _mm_add_ps(vacc0, vf2); } // Add up all accumulators to vacc0 vacc0 = _mm_add_ps(vacc0, vacc1); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc3.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc3.c index 202dff7dab1..d67d9fb7caf 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc3.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc3.c @@ -59,87 +59,87 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc3( __m128 vacc2 = _mm_setzero_ps(); for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { // Load 12 (3x4) inputs at a time. - const __m128 vi0123 = _mm_loadu_ps(input); - const __m128 vi4567 = _mm_loadu_ps(input + 4); - const __m128 vi89AB = _mm_loadu_ps(input + 8); + const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); + const __m128 vi2 = _mm_loadu_ps(input + 8); input += 12; // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max); - const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max); - const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max); + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); + const __m128 vx2 = _mm_sub_ps(vi2, vi_max); // Compute reduced argument batch := round(x / log(2)). 
- __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias); - __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias); - __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias); + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); + __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23)); - const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23)); - const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23)); + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); + const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); // Subtract the large number back to get final batch := round(x / log(2)). - vn0123 = _mm_sub_ps(vn0123, vmagic_bias); - vn4567 = _mm_sub_ps(vn4567, vmagic_bias); - vn89AB = _mm_sub_ps(vn89AB, vmagic_bias); + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); + vn2 = _mm_sub_ps(vn2, vmagic_bias); // Compute reduced argument t := x - batch * log(2). // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123); - __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567); - __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB); + __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); + __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); - vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123); - vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567); - vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB); + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4); - __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4); - __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4); + __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); + __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3); + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2); + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1); + vp0 = 
_mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); // Reconstruct the final f value: // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) // = s + (t * s) * p - vt0123 = _mm_mul_ps(vt0123, vs0123); - vt4567 = _mm_mul_ps(vt4567, vs4567); - vt89AB = _mm_mul_ps(vt89AB, vs89AB); + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); + vt2 = _mm_mul_ps(vt2, vs2); - __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123); - __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567); - __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB); + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); + __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); // For inputs below zero cutoff, replace output with +0.0f. // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123); - vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567); - vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB); + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); + vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); // Store 12 (3x4) outputs at a time. - _mm_storeu_ps(output, vf0123); - _mm_storeu_ps(output + 4, vf4567); - _mm_storeu_ps(output + 8, vf89AB); + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); + _mm_storeu_ps(output + 8, vf2); output += 12; // Accumulate computed exponents. 
- vacc0 = _mm_add_ps(vacc0, vf0123); - vacc1 = _mm_add_ps(vacc1, vf4567); - vacc2 = _mm_add_ps(vacc2, vf89AB); + vacc0 = _mm_add_ps(vacc0, vf0); + vacc1 = _mm_add_ps(vacc1, vf1); + vacc2 = _mm_add_ps(vacc2, vf2); } // Add up all accumulators to vacc0 vacc0 = _mm_add_ps(vacc0, vacc1); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12.c index d90eea0a38f..227262dc565 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12.c @@ -57,87 +57,87 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12( __m128 vacc0 = _mm_setzero_ps(); for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { // Load 12 (3x4) inputs at a time. - const __m128 vi0123 = _mm_loadu_ps(input); - const __m128 vi4567 = _mm_loadu_ps(input + 4); - const __m128 vi89AB = _mm_loadu_ps(input + 8); + const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); + const __m128 vi2 = _mm_loadu_ps(input + 8); input += 12; // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max); - const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max); - const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max); + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); + const __m128 vx2 = _mm_sub_ps(vi2, vi_max); // Compute reduced argument batch := round(x / log(2)). 
- __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias); - __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias); - __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias); + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); + __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23)); - const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23)); - const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23)); + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); + const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); // Subtract the large number back to get final batch := round(x / log(2)). - vn0123 = _mm_sub_ps(vn0123, vmagic_bias); - vn4567 = _mm_sub_ps(vn4567, vmagic_bias); - vn89AB = _mm_sub_ps(vn89AB, vmagic_bias); + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); + vn2 = _mm_sub_ps(vn2, vmagic_bias); // Compute reduced argument t := x - batch * log(2). // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123); - __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567); - __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB); + __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); + __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); - vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123); - vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567); - vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB); + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4); - __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4); - __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4); + __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); + __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3); + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2); + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1); + vp0 = 
_mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); // Reconstruct the final f value: // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) // = s + (t * s) * p - vt0123 = _mm_mul_ps(vt0123, vs0123); - vt4567 = _mm_mul_ps(vt4567, vs4567); - vt89AB = _mm_mul_ps(vt89AB, vs89AB); + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); + vt2 = _mm_mul_ps(vt2, vs2); - __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123); - __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567); - __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB); + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); + __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); // For inputs below zero cutoff, replace output with +0.0f. // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123); - vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567); - vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB); + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); + vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); // Store 12 (3x4) outputs at a time. - _mm_storeu_ps(output, vf0123); - _mm_storeu_ps(output + 4, vf4567); - _mm_storeu_ps(output + 8, vf89AB); + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); + _mm_storeu_ps(output + 8, vf2); output += 12; // Accumulate computed exponents. 
- vacc0 = _mm_add_ps(vacc0, vf0123); - vacc0 = _mm_add_ps(vacc0, vf4567); - vacc0 = _mm_add_ps(vacc0, vf89AB); + vacc0 = _mm_add_ps(vacc0, vf0); + vacc0 = _mm_add_ps(vacc0, vf1); + vacc0 = _mm_add_ps(vacc0, vf2); } __m128 vacc = vacc0; diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc2.c index 922cd0bf075..5f75fb8ea34 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc2.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc2.c @@ -58,103 +58,103 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc2( __m128 vacc1 = _mm_setzero_ps(); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { // Load 16 (4x4) inputs at a time. - const __m128 vi0123 = _mm_loadu_ps(input); - const __m128 vi4567 = _mm_loadu_ps(input + 4); - const __m128 vi89AB = _mm_loadu_ps(input + 8); - const __m128 viCDEF = _mm_loadu_ps(input + 12); + const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); + const __m128 vi2 = _mm_loadu_ps(input + 8); + const __m128 vi3 = _mm_loadu_ps(input + 12); input += 16; // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max); - const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max); - const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max); - const __m128 vxCDEF = _mm_sub_ps(viCDEF, vi_max); + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); + const __m128 vx2 = _mm_sub_ps(vi2, vi_max); + const __m128 vx3 = _mm_sub_ps(vi3, vi_max); // Compute reduced argument batch := round(x / log(2)). 
- __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias); - __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias); - __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias); - __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vxCDEF, vlog2e), vmagic_bias); + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); + __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); + __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23)); - const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23)); - const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23)); - const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23)); + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); + const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); + const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); // Subtract the large number back to get final batch := round(x / log(2)). - vn0123 = _mm_sub_ps(vn0123, vmagic_bias); - vn4567 = _mm_sub_ps(vn4567, vmagic_bias); - vn89AB = _mm_sub_ps(vn89AB, vmagic_bias); - vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias); + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); + vn2 = _mm_sub_ps(vn2, vmagic_bias); + vn3 = _mm_sub_ps(vn3, vmagic_bias); // Compute reduced argument t := x - batch * log(2). // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123); - __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567); - __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB); - __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vxCDEF); + __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); + __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); + __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); - vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123); - vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567); - vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB); - vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF); + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4); - __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4); - __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4); - __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4); - - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3); - vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3); - - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2); - vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2); - - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1); - vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1); + __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); + __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); + __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); // Reconstruct the final f value: // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) // = s + (t * s) * p - vt0123 = _mm_mul_ps(vt0123, vs0123); - vt4567 = _mm_mul_ps(vt4567, vs4567); - vt89AB = 
_mm_mul_ps(vt89AB, vs89AB); - vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF); + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); + vt2 = _mm_mul_ps(vt2, vs2); + vt3 = _mm_mul_ps(vt3, vs3); - __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123); - __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567); - __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB); - __m128 vfCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF); + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); + __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); + __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); // For inputs below zero cutoff, replace output with +0.0f. // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123); - vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567); - vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB); - vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF); + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); + vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); + vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); // Store 16 (4x4) outputs at a time. - _mm_storeu_ps(output, vf0123); - _mm_storeu_ps(output + 4, vf4567); - _mm_storeu_ps(output + 8, vf89AB); - _mm_storeu_ps(output + 12, vfCDEF); + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); + _mm_storeu_ps(output + 8, vf2); + _mm_storeu_ps(output + 12, vf3); output += 16; // Accumulate computed exponents. 
- vacc0 = _mm_add_ps(vacc0, vf0123); - vacc0 = _mm_add_ps(vacc0, vf4567); - vacc0 = _mm_add_ps(vacc0, vf89AB); - vacc0 = _mm_add_ps(vacc0, vfCDEF); + vacc0 = _mm_add_ps(vacc0, vf0); + vacc1 = _mm_add_ps(vacc1, vf1); + vacc0 = _mm_add_ps(vacc0, vf2); + vacc1 = _mm_add_ps(vacc1, vf3); } // Add up all accumulators to vacc0 vacc0 = _mm_add_ps(vacc0, vacc1); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc4.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc4.c index a1d83e65681..21598d75908 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc4.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc4.c @@ -60,103 +60,103 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc4( __m128 vacc3 = _mm_setzero_ps(); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { // Load 16 (4x4) inputs at a time. - const __m128 vi0123 = _mm_loadu_ps(input); - const __m128 vi4567 = _mm_loadu_ps(input + 4); - const __m128 vi89AB = _mm_loadu_ps(input + 8); - const __m128 viCDEF = _mm_loadu_ps(input + 12); + const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); + const __m128 vi2 = _mm_loadu_ps(input + 8); + const __m128 vi3 = _mm_loadu_ps(input + 12); input += 16; // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max); - const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max); - const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max); - const __m128 vxCDEF = _mm_sub_ps(viCDEF, vi_max); + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); + const __m128 vx2 = _mm_sub_ps(vi2, vi_max); + const __m128 vx3 = _mm_sub_ps(vi3, vi_max); // Compute reduced argument batch := round(x / log(2)). 
- __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias); - __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias); - __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias); - __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vxCDEF, vlog2e), vmagic_bias); + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); + __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); + __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23)); - const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23)); - const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23)); - const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23)); + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); + const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); + const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); // Subtract the large number back to get final batch := round(x / log(2)). - vn0123 = _mm_sub_ps(vn0123, vmagic_bias); - vn4567 = _mm_sub_ps(vn4567, vmagic_bias); - vn89AB = _mm_sub_ps(vn89AB, vmagic_bias); - vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias); + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); + vn2 = _mm_sub_ps(vn2, vmagic_bias); + vn3 = _mm_sub_ps(vn3, vmagic_bias); // Compute reduced argument t := x - batch * log(2). // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123); - __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567); - __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB); - __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vxCDEF); + __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); + __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); + __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); - vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123); - vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567); - vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB); - vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF); + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4); - __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4); - __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4); - __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4); - - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3); - vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3); - - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2); - vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2); - - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1); - vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1); + __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); + __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); + __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); // Reconstruct the final f value: // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) // = s + (t * s) * p - vt0123 = _mm_mul_ps(vt0123, vs0123); - vt4567 = _mm_mul_ps(vt4567, vs4567); - vt89AB = 
_mm_mul_ps(vt89AB, vs89AB); - vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF); + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); + vt2 = _mm_mul_ps(vt2, vs2); + vt3 = _mm_mul_ps(vt3, vs3); - __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123); - __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567); - __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB); - __m128 vfCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF); + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); + __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); + __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); // For inputs below zero cutoff, replace output with +0.0f. // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123); - vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567); - vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB); - vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF); + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); + vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); + vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); // Store 16 (4x4) outputs at a time. - _mm_storeu_ps(output, vf0123); - _mm_storeu_ps(output + 4, vf4567); - _mm_storeu_ps(output + 8, vf89AB); - _mm_storeu_ps(output + 12, vfCDEF); + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); + _mm_storeu_ps(output + 8, vf2); + _mm_storeu_ps(output + 12, vf3); output += 16; // Accumulate computed exponents. 
- vacc0 = _mm_add_ps(vacc0, vf0123); - vacc0 = _mm_add_ps(vacc0, vf4567); - vacc0 = _mm_add_ps(vacc0, vf89AB); - vacc0 = _mm_add_ps(vacc0, vfCDEF); + vacc0 = _mm_add_ps(vacc0, vf0); + vacc1 = _mm_add_ps(vacc1, vf1); + vacc2 = _mm_add_ps(vacc2, vf2); + vacc3 = _mm_add_ps(vacc3, vf3); } // Add up all accumulators to vacc0 vacc0 = _mm_add_ps(vacc0, vacc1); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16.c index 7ab3f7a7043..79cabdc774d 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16.c @@ -57,103 +57,103 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16( __m128 vacc0 = _mm_setzero_ps(); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { // Load 16 (4x4) inputs at a time. - const __m128 vi0123 = _mm_loadu_ps(input); - const __m128 vi4567 = _mm_loadu_ps(input + 4); - const __m128 vi89AB = _mm_loadu_ps(input + 8); - const __m128 viCDEF = _mm_loadu_ps(input + 12); + const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); + const __m128 vi2 = _mm_loadu_ps(input + 8); + const __m128 vi3 = _mm_loadu_ps(input + 12); input += 16; // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max); - const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max); - const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max); - const __m128 vxCDEF = _mm_sub_ps(viCDEF, vi_max); + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); + const __m128 vx2 = _mm_sub_ps(vi2, vi_max); + const __m128 vx3 = _mm_sub_ps(vi3, vi_max); // Compute reduced argument batch := round(x / log(2)). 
- __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias); - __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias); - __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias); - __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vxCDEF, vlog2e), vmagic_bias); + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); + __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); + __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23)); - const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23)); - const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23)); - const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23)); + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); + const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); + const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); // Subtract the large number back to get final batch := round(x / log(2)). - vn0123 = _mm_sub_ps(vn0123, vmagic_bias); - vn4567 = _mm_sub_ps(vn4567, vmagic_bias); - vn89AB = _mm_sub_ps(vn89AB, vmagic_bias); - vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias); + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); + vn2 = _mm_sub_ps(vn2, vmagic_bias); + vn3 = _mm_sub_ps(vn3, vmagic_bias); // Compute reduced argument t := x - batch * log(2). // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123); - __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567); - __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB); - __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vxCDEF); + __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); + __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); + __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); - vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123); - vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567); - vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB); - vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF); + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4); - __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4); - __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4); - __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4); - - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3); - vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3); - - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2); - vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2); - - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1); - vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1); + __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); + __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); + __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); // Reconstruct the final f value: // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) // = s + (t * s) * p - vt0123 = _mm_mul_ps(vt0123, vs0123); - vt4567 = _mm_mul_ps(vt4567, vs4567); - vt89AB = 
_mm_mul_ps(vt89AB, vs89AB); - vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF); + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); + vt2 = _mm_mul_ps(vt2, vs2); + vt3 = _mm_mul_ps(vt3, vs3); - __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123); - __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567); - __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB); - __m128 vfCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF); + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); + __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); + __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); // For inputs below zero cutoff, replace output with +0.0f. // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123); - vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567); - vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB); - vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF); + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); + vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); + vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); // Store 16 (4x4) outputs at a time. - _mm_storeu_ps(output, vf0123); - _mm_storeu_ps(output + 4, vf4567); - _mm_storeu_ps(output + 8, vf89AB); - _mm_storeu_ps(output + 12, vfCDEF); + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); + _mm_storeu_ps(output + 8, vf2); + _mm_storeu_ps(output + 12, vf3); output += 16; // Accumulate computed exponents. 
- vacc0 = _mm_add_ps(vacc0, vf0123); - vacc0 = _mm_add_ps(vacc0, vf4567); - vacc0 = _mm_add_ps(vacc0, vf89AB); - vacc0 = _mm_add_ps(vacc0, vfCDEF); + vacc0 = _mm_add_ps(vacc0, vf0); + vacc0 = _mm_add_ps(vacc0, vf1); + vacc0 = _mm_add_ps(vacc0, vf2); + vacc0 = _mm_add_ps(vacc0, vf3); } __m128 vacc = vacc0; diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc2.c index d90b5cff3bf..efc2cc73c8b 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc2.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc2.c @@ -58,119 +58,119 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc2( __m128 vacc1 = _mm_setzero_ps(); for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { // Load 20 (5x4) inputs at a time. - const __m128 vi0123 = _mm_loadu_ps(input); - const __m128 vi4567 = _mm_loadu_ps(input + 4); - const __m128 vi89AB = _mm_loadu_ps(input + 8); - const __m128 viCDEF = _mm_loadu_ps(input + 12); - const __m128 viGHIJ = _mm_loadu_ps(input + 16); + const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); + const __m128 vi2 = _mm_loadu_ps(input + 8); + const __m128 vi3 = _mm_loadu_ps(input + 12); + const __m128 vi4 = _mm_loadu_ps(input + 16); input += 20; // Subtract maximum input x := i - i_max. This implies x <= 0. 
- const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max); - const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max); - const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max); - const __m128 vxCDEF = _mm_sub_ps(viCDEF, vi_max); - const __m128 vxGHIJ = _mm_sub_ps(viGHIJ, vi_max); + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); + const __m128 vx2 = _mm_sub_ps(vi2, vi_max); + const __m128 vx3 = _mm_sub_ps(vi3, vi_max); + const __m128 vx4 = _mm_sub_ps(vi4, vi_max); // Compute reduced argument batch := round(x / log(2)). - __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias); - __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias); - __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias); - __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vxCDEF, vlog2e), vmagic_bias); - __m128 vnGHIJ = _mm_add_ps(_mm_mul_ps(vxGHIJ, vlog2e), vmagic_bias); + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); + __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); + __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); + __m128 vn4 = _mm_add_ps(_mm_mul_ps(vx4, vlog2e), vmagic_bias); // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23)); - const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23)); - const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23)); - const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23)); - const __m128 vsGHIJ = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnGHIJ), 23)); + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); + const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); + const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); + const __m128 vs4 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4), 23)); // Subtract the large number back to get final batch := round(x / log(2)). - vn0123 = _mm_sub_ps(vn0123, vmagic_bias); - vn4567 = _mm_sub_ps(vn4567, vmagic_bias); - vn89AB = _mm_sub_ps(vn89AB, vmagic_bias); - vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias); - vnGHIJ = _mm_sub_ps(vnGHIJ, vmagic_bias); + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); + vn2 = _mm_sub_ps(vn2, vmagic_bias); + vn3 = _mm_sub_ps(vn3, vmagic_bias); + vn4 = _mm_sub_ps(vn4, vmagic_bias); // Compute reduced argument t := x - batch * log(2). // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123); - __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567); - __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB); - __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vxCDEF); - __m128 vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_hi), vxGHIJ); - - vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123); - vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567); - vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB); - vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF); - vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_lo), vtGHIJ); + __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); + __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); + __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); + __m128 vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_hi), vx4); + + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); + vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_lo), vt4); // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4); - __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4); - __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4); - __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4); - __m128 vpGHIJ = _mm_add_ps(_mm_mul_ps(vc5, vtGHIJ), vc4); - - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3); - vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3); - vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc3); - - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2); - vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2); - vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc2); - - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1); - vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1); - vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc1); + __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); + __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); + __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); + __m128 vp4 = _mm_add_ps(_mm_mul_ps(vc5, vt4), vc4); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); + vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc3); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); + vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc2); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); + vp2 = 
_mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); + vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc1); // Reconstruct the final f value: // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) // = s + (t * s) * p - vt0123 = _mm_mul_ps(vt0123, vs0123); - vt4567 = _mm_mul_ps(vt4567, vs4567); - vt89AB = _mm_mul_ps(vt89AB, vs89AB); - vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF); - vtGHIJ = _mm_mul_ps(vtGHIJ, vsGHIJ); - - __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123); - __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567); - __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB); - __m128 vfCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF); - __m128 vfGHIJ = _mm_add_ps(_mm_mul_ps(vtGHIJ, vpGHIJ), vsGHIJ); + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); + vt2 = _mm_mul_ps(vt2, vs2); + vt3 = _mm_mul_ps(vt3, vs3); + vt4 = _mm_mul_ps(vt4, vs4); + + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); + __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); + __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); + __m128 vf4 = _mm_add_ps(_mm_mul_ps(vt4, vp4), vs4); // For inputs below zero cutoff, replace output with +0.0f. // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
- vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123); - vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567); - vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB); - vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF); - vfGHIJ = _mm_andnot_ps(_mm_cmplt_ps(vxGHIJ, vdenorm_cutoff), vfGHIJ); + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); + vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); + vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); + vf4 = _mm_andnot_ps(_mm_cmplt_ps(vx4, vdenorm_cutoff), vf4); // Store 20 (5x4) outputs at a time. - _mm_storeu_ps(output, vf0123); - _mm_storeu_ps(output + 4, vf4567); - _mm_storeu_ps(output + 8, vf89AB); - _mm_storeu_ps(output + 12, vfCDEF); - _mm_storeu_ps(output + 16, vfGHIJ); + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); + _mm_storeu_ps(output + 8, vf2); + _mm_storeu_ps(output + 12, vf3); + _mm_storeu_ps(output + 16, vf4); output += 20; // Accumulate computed exponents. 
- vacc0 = _mm_add_ps(vacc0, vf0123); - vacc0 = _mm_add_ps(vacc0, vf4567); - vacc0 = _mm_add_ps(vacc0, vf89AB); - vacc0 = _mm_add_ps(vacc0, vfCDEF); - vacc0 = _mm_add_ps(vacc0, vfGHIJ); + vacc0 = _mm_add_ps(vacc0, vf0); + vacc1 = _mm_add_ps(vacc1, vf1); + vacc0 = _mm_add_ps(vacc0, vf2); + vacc1 = _mm_add_ps(vacc1, vf3); + vacc0 = _mm_add_ps(vacc0, vf4); } // Add up all accumulators to vacc0 vacc0 = _mm_add_ps(vacc0, vacc1); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc5.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc5.c index 0d55171291f..97aaf1ab143 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc5.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc5.c @@ -61,119 +61,119 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc5( __m128 vacc4 = _mm_setzero_ps(); for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { // Load 20 (5x4) inputs at a time. - const __m128 vi0123 = _mm_loadu_ps(input); - const __m128 vi4567 = _mm_loadu_ps(input + 4); - const __m128 vi89AB = _mm_loadu_ps(input + 8); - const __m128 viCDEF = _mm_loadu_ps(input + 12); - const __m128 viGHIJ = _mm_loadu_ps(input + 16); + const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); + const __m128 vi2 = _mm_loadu_ps(input + 8); + const __m128 vi3 = _mm_loadu_ps(input + 12); + const __m128 vi4 = _mm_loadu_ps(input + 16); input += 20; // Subtract maximum input x := i - i_max. This implies x <= 0. 
- const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max); - const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max); - const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max); - const __m128 vxCDEF = _mm_sub_ps(viCDEF, vi_max); - const __m128 vxGHIJ = _mm_sub_ps(viGHIJ, vi_max); + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); + const __m128 vx2 = _mm_sub_ps(vi2, vi_max); + const __m128 vx3 = _mm_sub_ps(vi3, vi_max); + const __m128 vx4 = _mm_sub_ps(vi4, vi_max); // Compute reduced argument batch := round(x / log(2)). - __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias); - __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias); - __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias); - __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vxCDEF, vlog2e), vmagic_bias); - __m128 vnGHIJ = _mm_add_ps(_mm_mul_ps(vxGHIJ, vlog2e), vmagic_bias); + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); + __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); + __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); + __m128 vn4 = _mm_add_ps(_mm_mul_ps(vx4, vlog2e), vmagic_bias); // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23)); - const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23)); - const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23)); - const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23)); - const __m128 vsGHIJ = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnGHIJ), 23)); + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); + const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); + const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); + const __m128 vs4 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4), 23)); // Subtract the large number back to get final batch := round(x / log(2)). - vn0123 = _mm_sub_ps(vn0123, vmagic_bias); - vn4567 = _mm_sub_ps(vn4567, vmagic_bias); - vn89AB = _mm_sub_ps(vn89AB, vmagic_bias); - vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias); - vnGHIJ = _mm_sub_ps(vnGHIJ, vmagic_bias); + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); + vn2 = _mm_sub_ps(vn2, vmagic_bias); + vn3 = _mm_sub_ps(vn3, vmagic_bias); + vn4 = _mm_sub_ps(vn4, vmagic_bias); // Compute reduced argument t := x - batch * log(2). // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123); - __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567); - __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB); - __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vxCDEF); - __m128 vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_hi), vxGHIJ); - - vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123); - vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567); - vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB); - vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF); - vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_lo), vtGHIJ); + __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); + __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); + __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); + __m128 vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_hi), vx4); + + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); + vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_lo), vt4); // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4); - __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4); - __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4); - __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4); - __m128 vpGHIJ = _mm_add_ps(_mm_mul_ps(vc5, vtGHIJ), vc4); - - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3); - vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3); - vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc3); - - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2); - vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2); - vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc2); - - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1); - vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1); - vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc1); + __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); + __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); + __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); + __m128 vp4 = _mm_add_ps(_mm_mul_ps(vc5, vt4), vc4); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); + vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc3); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); + vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc2); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); + vp2 = 
_mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); + vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc1); // Reconstruct the final f value: // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) // = s + (t * s) * p - vt0123 = _mm_mul_ps(vt0123, vs0123); - vt4567 = _mm_mul_ps(vt4567, vs4567); - vt89AB = _mm_mul_ps(vt89AB, vs89AB); - vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF); - vtGHIJ = _mm_mul_ps(vtGHIJ, vsGHIJ); - - __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123); - __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567); - __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB); - __m128 vfCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF); - __m128 vfGHIJ = _mm_add_ps(_mm_mul_ps(vtGHIJ, vpGHIJ), vsGHIJ); + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); + vt2 = _mm_mul_ps(vt2, vs2); + vt3 = _mm_mul_ps(vt3, vs3); + vt4 = _mm_mul_ps(vt4, vs4); + + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); + __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); + __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); + __m128 vf4 = _mm_add_ps(_mm_mul_ps(vt4, vp4), vs4); // For inputs below zero cutoff, replace output with +0.0f. // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
- vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123); - vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567); - vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB); - vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF); - vfGHIJ = _mm_andnot_ps(_mm_cmplt_ps(vxGHIJ, vdenorm_cutoff), vfGHIJ); + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); + vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); + vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); + vf4 = _mm_andnot_ps(_mm_cmplt_ps(vx4, vdenorm_cutoff), vf4); // Store 20 (5x4) outputs at a time. - _mm_storeu_ps(output, vf0123); - _mm_storeu_ps(output + 4, vf4567); - _mm_storeu_ps(output + 8, vf89AB); - _mm_storeu_ps(output + 12, vfCDEF); - _mm_storeu_ps(output + 16, vfGHIJ); + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); + _mm_storeu_ps(output + 8, vf2); + _mm_storeu_ps(output + 12, vf3); + _mm_storeu_ps(output + 16, vf4); output += 20; // Accumulate computed exponents. 
- vacc0 = _mm_add_ps(vacc0, vf0123); - vacc4 = _mm_add_ps(vacc4, vf4567); - vacc3 = _mm_add_ps(vacc3, vf89AB); - vacc2 = _mm_add_ps(vacc2, vfCDEF); - vacc1 = _mm_add_ps(vacc1, vfGHIJ); + vacc0 = _mm_add_ps(vacc0, vf0); + vacc1 = _mm_add_ps(vacc1, vf1); + vacc2 = _mm_add_ps(vacc2, vf2); + vacc3 = _mm_add_ps(vacc3, vf3); + vacc4 = _mm_add_ps(vacc4, vf4); } // Add up all accumulators to vacc0 vacc0 = _mm_add_ps(vacc0, vacc1); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20.c index 0b032dea122..8d337ced608 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20.c @@ -57,119 +57,119 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20( __m128 vacc0 = _mm_setzero_ps(); for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { // Load 20 (5x4) inputs at a time. - const __m128 vi0123 = _mm_loadu_ps(input); - const __m128 vi4567 = _mm_loadu_ps(input + 4); - const __m128 vi89AB = _mm_loadu_ps(input + 8); - const __m128 viCDEF = _mm_loadu_ps(input + 12); - const __m128 viGHIJ = _mm_loadu_ps(input + 16); + const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); + const __m128 vi2 = _mm_loadu_ps(input + 8); + const __m128 vi3 = _mm_loadu_ps(input + 12); + const __m128 vi4 = _mm_loadu_ps(input + 16); input += 20; // Subtract maximum input x := i - i_max. This implies x <= 0. 
- const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max); - const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max); - const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max); - const __m128 vxCDEF = _mm_sub_ps(viCDEF, vi_max); - const __m128 vxGHIJ = _mm_sub_ps(viGHIJ, vi_max); + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); + const __m128 vx2 = _mm_sub_ps(vi2, vi_max); + const __m128 vx3 = _mm_sub_ps(vi3, vi_max); + const __m128 vx4 = _mm_sub_ps(vi4, vi_max); // Compute reduced argument batch := round(x / log(2)). - __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias); - __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias); - __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias); - __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vxCDEF, vlog2e), vmagic_bias); - __m128 vnGHIJ = _mm_add_ps(_mm_mul_ps(vxGHIJ, vlog2e), vmagic_bias); + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); + __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); + __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); + __m128 vn4 = _mm_add_ps(_mm_mul_ps(vx4, vlog2e), vmagic_bias); // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23)); - const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23)); - const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23)); - const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23)); - const __m128 vsGHIJ = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnGHIJ), 23)); + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); + const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); + const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); + const __m128 vs4 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4), 23)); // Subtract the large number back to get final batch := round(x / log(2)). - vn0123 = _mm_sub_ps(vn0123, vmagic_bias); - vn4567 = _mm_sub_ps(vn4567, vmagic_bias); - vn89AB = _mm_sub_ps(vn89AB, vmagic_bias); - vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias); - vnGHIJ = _mm_sub_ps(vnGHIJ, vmagic_bias); + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); + vn2 = _mm_sub_ps(vn2, vmagic_bias); + vn3 = _mm_sub_ps(vn3, vmagic_bias); + vn4 = _mm_sub_ps(vn4, vmagic_bias); // Compute reduced argument t := x - batch * log(2). // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123); - __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567); - __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB); - __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vxCDEF); - __m128 vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_hi), vxGHIJ); - - vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123); - vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567); - vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB); - vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF); - vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_lo), vtGHIJ); + __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); + __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); + __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); + __m128 vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_hi), vx4); + + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); + vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_lo), vt4); // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4); - __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4); - __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4); - __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4); - __m128 vpGHIJ = _mm_add_ps(_mm_mul_ps(vc5, vtGHIJ), vc4); - - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3); - vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3); - vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc3); - - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2); - vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2); - vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc2); - - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1); - vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1); - vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1); - vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc1); + __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); + __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); + __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); + __m128 vp4 = _mm_add_ps(_mm_mul_ps(vc5, vt4), vc4); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); + vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc3); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); + vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); + vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc2); + + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); + vp2 = 
_mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); + vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); + vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc1); // Reconstruct the final f value: // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) // = s + (t * s) * p - vt0123 = _mm_mul_ps(vt0123, vs0123); - vt4567 = _mm_mul_ps(vt4567, vs4567); - vt89AB = _mm_mul_ps(vt89AB, vs89AB); - vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF); - vtGHIJ = _mm_mul_ps(vtGHIJ, vsGHIJ); - - __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123); - __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567); - __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB); - __m128 vfCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF); - __m128 vfGHIJ = _mm_add_ps(_mm_mul_ps(vtGHIJ, vpGHIJ), vsGHIJ); + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); + vt2 = _mm_mul_ps(vt2, vs2); + vt3 = _mm_mul_ps(vt3, vs3); + vt4 = _mm_mul_ps(vt4, vs4); + + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); + __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); + __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); + __m128 vf4 = _mm_add_ps(_mm_mul_ps(vt4, vp4), vs4); // For inputs below zero cutoff, replace output with +0.0f. // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
- vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123); - vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567); - vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB); - vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF); - vfGHIJ = _mm_andnot_ps(_mm_cmplt_ps(vxGHIJ, vdenorm_cutoff), vfGHIJ); + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); + vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); + vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); + vf4 = _mm_andnot_ps(_mm_cmplt_ps(vx4, vdenorm_cutoff), vf4); // Store 20 (5x4) outputs at a time. - _mm_storeu_ps(output, vf0123); - _mm_storeu_ps(output + 4, vf4567); - _mm_storeu_ps(output + 8, vf89AB); - _mm_storeu_ps(output + 12, vfCDEF); - _mm_storeu_ps(output + 16, vfGHIJ); + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); + _mm_storeu_ps(output + 8, vf2); + _mm_storeu_ps(output + 12, vf3); + _mm_storeu_ps(output + 16, vf4); output += 20; // Accumulate computed exponents. 
- vacc0 = _mm_add_ps(vacc0, vf0123); - vacc0 = _mm_add_ps(vacc0, vf4567); - vacc0 = _mm_add_ps(vacc0, vf89AB); - vacc0 = _mm_add_ps(vacc0, vfCDEF); - vacc0 = _mm_add_ps(vacc0, vfGHIJ); + vacc0 = _mm_add_ps(vacc0, vf0); + vacc0 = _mm_add_ps(vacc0, vf1); + vacc0 = _mm_add_ps(vacc0, vf2); + vacc0 = _mm_add_ps(vacc0, vf3); + vacc0 = _mm_add_ps(vacc0, vf4); } __m128 vacc = vacc0; diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u4.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u4.c index 41135d41d1b..4d68b94b95c 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u4.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u4.c @@ -57,55 +57,55 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u4( __m128 vacc0 = _mm_setzero_ps(); for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { // Load 4 (1x4) inputs at a time. - const __m128 vi0123 = _mm_loadu_ps(input); + const __m128 vi0 = _mm_loadu_ps(input); input += 4; // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max); + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); // Compute reduced argument batch := round(x / log(2)). - __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias); + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23)); + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); // Subtract the large number back to get final batch := round(x / log(2)). - vn0123 = _mm_sub_ps(vn0123, vmagic_bias); + vn0 = _mm_sub_ps(vn0, vmagic_bias); // Compute reduced argument t := x - batch * log(2). 
// Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123); + __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123); + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4); + __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3); + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2); + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1); + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); // Reconstruct the final f value: // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) // = s + (t * s) * p - vt0123 = _mm_mul_ps(vt0123, vs0123); + vt0 = _mm_mul_ps(vt0, vs0); - __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123); + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); // For inputs below zero cutoff, replace output with +0.0f. // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123); + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); // Store 4 (1x4) outputs at a time. - _mm_storeu_ps(output, vf0123); + _mm_storeu_ps(output, vf0); output += 4; // Accumulate computed exponents. 
- vacc0 = _mm_add_ps(vacc0, vf0123); + vacc0 = _mm_add_ps(vacc0, vf0); } __m128 vacc = vacc0; diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8-acc2.c index aa819940ff4..25314dfb936 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8-acc2.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8-acc2.c @@ -58,71 +58,71 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8_acc2( __m128 vacc1 = _mm_setzero_ps(); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { // Load 8 (2x4) inputs at a time. - const __m128 vi0123 = _mm_loadu_ps(input); - const __m128 vi4567 = _mm_loadu_ps(input + 4); + const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); input += 8; // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max); - const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max); + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); // Compute reduced argument batch := round(x / log(2)). - __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias); - __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias); + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23)); - const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23)); + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); // Subtract the large number back to get final batch := round(x / log(2)). - vn0123 = _mm_sub_ps(vn0123, vmagic_bias); - vn4567 = _mm_sub_ps(vn4567, vmagic_bias); + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); // Compute reduced argument t := x - batch * log(2). // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123); - __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567); + __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123); - vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567); + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4); - __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4); + __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3); + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2); + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1); + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); // Reconstruct the final f value: // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) // = s + (t * s) * p - vt0123 = _mm_mul_ps(vt0123, vs0123); - vt4567 = _mm_mul_ps(vt4567, vs4567); + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); - __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123); - __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567); + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); // For inputs below zero cutoff, replace output with +0.0f. // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123); - vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567); + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); // Store 8 (2x4) outputs at a time. 
- _mm_storeu_ps(output, vf0123); - _mm_storeu_ps(output + 4, vf4567); + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); output += 8; // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0123); - vacc0 = _mm_add_ps(vacc0, vf4567); + vacc0 = _mm_add_ps(vacc0, vf0); + vacc1 = _mm_add_ps(vacc1, vf1); } // Add up all accumulators to vacc0 vacc0 = _mm_add_ps(vacc0, vacc1); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8.c index 1aa1bed08ab..f645a973b79 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8.c @@ -57,71 +57,71 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8( __m128 vacc0 = _mm_setzero_ps(); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { // Load 8 (2x4) inputs at a time. - const __m128 vi0123 = _mm_loadu_ps(input); - const __m128 vi4567 = _mm_loadu_ps(input + 4); + const __m128 vi0 = _mm_loadu_ps(input); + const __m128 vi1 = _mm_loadu_ps(input + 4); input += 8; // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max); - const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max); + const __m128 vx0 = _mm_sub_ps(vi0, vi_max); + const __m128 vx1 = _mm_sub_ps(vi1, vi_max); // Compute reduced argument batch := round(x / log(2)). - __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias); - __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias); + __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); + __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23)); - const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23)); + const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); + const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); // Subtract the large number back to get final batch := round(x / log(2)). - vn0123 = _mm_sub_ps(vn0123, vmagic_bias); - vn4567 = _mm_sub_ps(vn4567, vmagic_bias); + vn0 = _mm_sub_ps(vn0, vmagic_bias); + vn1 = _mm_sub_ps(vn1, vmagic_bias); // Compute reduced argument t := x - batch * log(2). // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123); - __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567); + __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); + __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123); - vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567); + vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4); - __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4); + __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); + __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3); + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2); + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1); - vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1); + vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); + vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); // Reconstruct the final f value: // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) // = s + (t * s) * p - vt0123 = _mm_mul_ps(vt0123, vs0123); - vt4567 = _mm_mul_ps(vt4567, vs4567); + vt0 = _mm_mul_ps(vt0, vs0); + vt1 = _mm_mul_ps(vt1, vs1); - __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123); - __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567); + __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); + __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); // For inputs below zero cutoff, replace output with +0.0f. // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123); - vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567); + vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); + vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); // Store 8 (2x4) outputs at a time. 
- _mm_storeu_ps(output, vf0123); - _mm_storeu_ps(output + 4, vf4567); + _mm_storeu_ps(output, vf0); + _mm_storeu_ps(output + 4, vf1); output += 8; // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0123); - vacc0 = _mm_add_ps(vacc0, vf4567); + vacc0 = _mm_add_ps(vacc0, vf0); + vacc0 = _mm_add_ps(vacc0, vf1); } __m128 vacc = vacc0; diff --git a/src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in b/src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in index ca71175d829..b5a6f9de3da 100644 --- a/src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in +++ b/src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in @@ -6,7 +6,6 @@ $assert BATCH_TILE % 4 == 0 $assert BATCH_TILE >= 4 $SIMD_TILE = BATCH_TILE // 4 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include @@ -15,7 +14,8 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include "xnnpack/raddstoreexpminusmax.h" -void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u${BATCH_TILE}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}( +$ISA = {0: "sse2", 1: "avx"}[AVX] +void xnn_f32_raddstoreexpminusmax_ukernel__${ISA}_rr2_p5_u${BATCH_TILE}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}( size_t batch, const float* input, const float* max, @@ -58,73 +58,73 @@ void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u${BATCH_TILE}${"" if ACC __m128 vacc${K} = _mm_setzero_ps(); for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { // Load ${BATCH_TILE} (${SIMD_TILE}x4) inputs at a time. - const __m128 vi${ABC[0:4]} = _mm_loadu_ps(input); - $for N in range(4, BATCH_TILE, 4): - const __m128 vi${ABC[N:N+4]} = _mm_loadu_ps(input + ${N}); + const __m128 vi0 = _mm_loadu_ps(input); + $for N in range(1, SIMD_TILE): + const __m128 vi${N} = _mm_loadu_ps(input + ${N*4}); input += ${BATCH_TILE}; // Subtract maximum input x := i - i_max. This implies x <= 0. 
- $for N in range(0, BATCH_TILE, 4): - const __m128 vx${ABC[N:N+4]} = _mm_sub_ps(vi${ABC[N:N+4]}, vi_max); + $for N in range(SIMD_TILE): + const __m128 vx${N} = _mm_sub_ps(vi${N}, vi_max); // Compute reduced argument batch := round(x / log(2)). - $for N in range(0, BATCH_TILE, 4): - __m128 vn${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vx${ABC[N:N+4]}, vlog2e), vmagic_bias); + $for N in range(SIMD_TILE): + __m128 vn${N} = _mm_add_ps(_mm_mul_ps(vx${N}, vlog2e), vmagic_bias); // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - $for N in range(0, BATCH_TILE, 4): - const __m128 vs${ABC[N:N+4]} = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn${ABC[N:N+4]}), 23)); + $for N in range(SIMD_TILE): + const __m128 vs${N} = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn${N}), 23)); // Subtract the large number back to get final batch := round(x / log(2)). - $for N in range(0, BATCH_TILE, 4): - vn${ABC[N:N+4]} = _mm_sub_ps(vn${ABC[N:N+4]}, vmagic_bias); + $for N in range(SIMD_TILE): + vn${N} = _mm_sub_ps(vn${N}, vmagic_bias); // Compute reduced argument t := x - batch * log(2). // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - $for N in range(0, BATCH_TILE, 4): - __m128 vt${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vn${ABC[N:N+4]}, vminus_ln2_hi), vx${ABC[N:N+4]}); + $for N in range(SIMD_TILE): + __m128 vt${N} = _mm_add_ps(_mm_mul_ps(vn${N}, vminus_ln2_hi), vx${N}); - $for N in range(0, BATCH_TILE, 4): - vt${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vn${ABC[N:N+4]}, vminus_ln2_lo), vt${ABC[N:N+4]}); + $for N in range(SIMD_TILE): + vt${N} = _mm_add_ps(_mm_mul_ps(vn${N}, vminus_ln2_lo), vt${N}); // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- $for N in range(0, BATCH_TILE, 4): - __m128 vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vc5, vt${ABC[N:N+4]}), vc4); + $for N in range(SIMD_TILE): + __m128 vp${N} = _mm_add_ps(_mm_mul_ps(vc5, vt${N}), vc4); - $for N in range(0, BATCH_TILE, 4): - vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc3); + $for N in range(SIMD_TILE): + vp${N} = _mm_add_ps(_mm_mul_ps(vp${N}, vt${N}), vc3); - $for N in range(0, BATCH_TILE, 4): - vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc2); + $for N in range(SIMD_TILE): + vp${N} = _mm_add_ps(_mm_mul_ps(vp${N}, vt${N}), vc2); - $for N in range(0, BATCH_TILE, 4): - vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc1); + $for N in range(SIMD_TILE): + vp${N} = _mm_add_ps(_mm_mul_ps(vp${N}, vt${N}), vc1); // Reconstruct the final f value: // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) // = s + (t * s) * p - $for N in range(0, BATCH_TILE, 4): - vt${ABC[N:N+4]} = _mm_mul_ps(vt${ABC[N:N+4]}, vs${ABC[N:N+4]}); + $for N in range(SIMD_TILE): + vt${N} = _mm_mul_ps(vt${N}, vs${N}); - $for N in range(0, BATCH_TILE, 4): - __m128 vf${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vt${ABC[N:N+4]}, vp${ABC[N:N+4]}), vs${ABC[N:N+4]}); + $for N in range(SIMD_TILE): + __m128 vf${N} = _mm_add_ps(_mm_mul_ps(vt${N}, vp${N}), vs${N}); // For inputs below zero cutoff, replace output with +0.0f. // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - $for N in range(0, BATCH_TILE, 4): - vf${ABC[N:N+4]} = _mm_andnot_ps(_mm_cmplt_ps(vx${ABC[N:N+4]}, vdenorm_cutoff), vf${ABC[N:N+4]}); + $for N in range(SIMD_TILE): + vf${N} = _mm_andnot_ps(_mm_cmplt_ps(vx${N}, vdenorm_cutoff), vf${N}); // Store ${BATCH_TILE} (${SIMD_TILE}x4) outputs at a time. 
- _mm_storeu_ps(output, vf${ABC[0:4]}); - $for N in range(4, BATCH_TILE, 4): - _mm_storeu_ps(output + ${N}, vf${ABC[N:N+4]}); + _mm_storeu_ps(output, vf0); + $for N in range(1, SIMD_TILE): + _mm_storeu_ps(output + ${N*4}, vf${N}); output += ${BATCH_TILE}; // Accumulate computed exponents. - $for N in range(0, BATCH_TILE, 4): - vacc${N % ACCUMULATORS} = _mm_add_ps(vacc${N % ACCUMULATORS}, vf${ABC[N:N+4]}); + $for N in range(SIMD_TILE): + vacc${N % ACCUMULATORS} = _mm_add_ps(vacc${N % ACCUMULATORS}, vf${N}); } $if ACCUMULATORS > 1: // Add up all accumulators to vacc0 diff --git a/src/xnnpack/raddstoreexpminusmax.h b/src/xnnpack/raddstoreexpminusmax.h index 414484a7a0a..4a645c57232 100644 --- a/src/xnnpack/raddstoreexpminusmax.h +++ b/src/xnnpack/raddstoreexpminusmax.h @@ -148,6 +148,19 @@ DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_u DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc2) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc5) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u4) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8_acc2) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc2) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc3) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc2) 
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc4) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc2) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc5) + DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc2) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc4) diff --git a/test/f32-raddstoreexpminusmax.cc b/test/f32-raddstoreexpminusmax.cc index cd487ba722f..ebf89e24b97 100644 --- a/test/f32-raddstoreexpminusmax.cc +++ b/test/f32-raddstoreexpminusmax.cc @@ -2322,6 +2322,450 @@ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U4, elements_eq_4) { + TEST_REQUIRES_X86_AVX; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(4) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u4, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U4, elements_div_4) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 8; elements < 40; elements += 4) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u4, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U4, elements_lt_4) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 1; elements < 4; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u4, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U4, elements_gt_4) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 5; elements 
< 8; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u4, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U8, elements_eq_8) { + TEST_REQUIRES_X86_AVX; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(8) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U8, elements_div_8) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 16; elements < 80; elements += 8) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U8, elements_lt_8) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 1; elements < 8; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U8, elements_gt_8) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 9; elements < 16; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U8_ACC2, elements_eq_8) { + TEST_REQUIRES_X86_AVX; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(8) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8_acc2, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U8_ACC2, elements_div_8) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 16; elements < 80; elements += 8) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8_acc2, nullptr); + } + } + + 
TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U8_ACC2, elements_lt_8) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 1; elements < 8; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8_acc2, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U8_ACC2, elements_gt_8) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 9; elements < 16; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8_acc2, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12, elements_eq_12) { + TEST_REQUIRES_X86_AVX; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(12) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12, elements_div_12) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 24; elements < 120; elements += 12) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12, elements_lt_12) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 1; elements < 12; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12, elements_gt_12) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 13; elements < 24; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12_ACC2, elements_eq_12) { + TEST_REQUIRES_X86_AVX; + 
RAddStoreExpMinusMaxMicrokernelTester() + .elements(12) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc2, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12_ACC2, elements_div_12) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 24; elements < 120; elements += 12) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc2, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12_ACC2, elements_lt_12) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 1; elements < 12; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc2, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12_ACC2, elements_gt_12) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 13; elements < 24; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc2, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12_ACC3, elements_eq_12) { + TEST_REQUIRES_X86_AVX; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(12) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc3, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12_ACC3, elements_div_12) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 24; elements < 120; elements += 12) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc3, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12_ACC3, elements_lt_12) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 1; elements < 12; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc3, nullptr); + } + } + 
+ TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12_ACC3, elements_gt_12) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 13; elements < 24; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc3, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16, elements_eq_16) { + TEST_REQUIRES_X86_AVX; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(16) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16, elements_div_16) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 32; elements < 160; elements += 16) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16, elements_lt_16) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 1; elements < 16; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16, elements_gt_16) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 17; elements < 32; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16_ACC2, elements_eq_16) { + TEST_REQUIRES_X86_AVX; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(16) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc2, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16_ACC2, elements_div_16) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 32; elements < 160; elements += 16) { + 
RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc2, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16_ACC2, elements_lt_16) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 1; elements < 16; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc2, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16_ACC2, elements_gt_16) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 17; elements < 32; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc2, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16_ACC4, elements_eq_16) { + TEST_REQUIRES_X86_AVX; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(16) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc4, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16_ACC4, elements_div_16) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 32; elements < 160; elements += 16) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc4, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16_ACC4, elements_lt_16) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 1; elements < 16; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc4, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16_ACC4, elements_gt_16) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 17; elements < 32; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc4, nullptr); + } + 
} +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20, elements_eq_20) { + TEST_REQUIRES_X86_AVX; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(20) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20, elements_div_20) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 40; elements < 200; elements += 20) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20, elements_lt_20) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 1; elements < 20; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20, elements_gt_20) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 21; elements < 40; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20_ACC2, elements_eq_20) { + TEST_REQUIRES_X86_AVX; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(20) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc2, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20_ACC2, elements_div_20) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 40; elements < 200; elements += 20) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc2, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20_ACC2, elements_lt_20) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 1; elements < 20; elements++) { + 
RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc2, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20_ACC2, elements_gt_20) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 21; elements < 40; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc2, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20_ACC5, elements_eq_20) { + TEST_REQUIRES_X86_AVX; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(20) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc5, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20_ACC5, elements_div_20) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 40; elements < 200; elements += 20) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc5, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20_ACC5, elements_lt_20) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 1; elements < 20; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc5, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20_ACC5, elements_gt_20) { + TEST_REQUIRES_X86_AVX; + for (size_t elements = 21; elements < 40; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc5, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32, elements_eq_32) { TEST_REQUIRES_X86_AVX2; diff --git a/test/f32-raddstoreexpminusmax.yaml b/test/f32-raddstoreexpminusmax.yaml index f4e3394c447..26cb25c310d 100644 --- 
a/test/f32-raddstoreexpminusmax.yaml +++ b/test/f32-raddstoreexpminusmax.yaml @@ -57,7 +57,7 @@ - name: xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u2v - name: xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u4v -# x86 SSE +# x86 SSE2 - name: xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u4 - name: xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8 - name: xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8_acc2 @@ -72,6 +72,20 @@ - name: xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc5 # x86 AVX +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u4 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8_acc2 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc2 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc3 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc2 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc4 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc2 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc5 + +# x86 AVX2 - name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32 - name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc2 - name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc4 From dc3070f381ec4fa4d37f060aae683534333aadcc Mon Sep 17 00:00:00 2001 From: Alan Kelly Date: Sat, 21 Sep 2024 00:20:59 -0700 Subject: [PATCH 23/50] Add variant of FC which can take fp16 weights and biases and f32 inputs and outputs. Some models have float16 weights to reduce model size. Currently we have to convert float16 weights to float32 within the XNNPack delegate and this memory is not freed until program termination. 
Also, if we are doing float 16 inference, we convert the f16 weights to f32 inside the delegate, and then back again in XNNPack. So this will reduce memory usage for f32 inference since only one copy of f32 weights is active at any time. And for f16 inference, no conversion to f32 is ever done. PiperOrigin-RevId: 677119973 --- include/xnnpack.h | 24 +++ src/operators/fully-connected-nc.c | 49 ++++++ test/fully-connected-nc.cc | 22 --- test/fully-connected-operator-tester.h | 203 ++++++++++++++++++------- 4 files changed, 220 insertions(+), 78 deletions(-) diff --git a/include/xnnpack.h b/include/xnnpack.h index 242dd477dd5..c4df36b1be8 100644 --- a/include/xnnpack.h +++ b/include/xnnpack.h @@ -4300,6 +4300,20 @@ enum xnn_status xnn_setup_fully_connected_nc_f16( const void* input, void* output); +enum xnn_status xnn_create_fully_connected_nc_f32_f16( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + const void* kernel, + const void* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + enum xnn_status xnn_create_fully_connected_nc_f32( size_t input_channels, size_t output_channels, @@ -4314,11 +4328,21 @@ enum xnn_status xnn_create_fully_connected_nc_f32( xnn_weights_cache_t weights_cache, xnn_operator_t* fully_connected_op_out); +enum xnn_status xnn_reshape_fully_connected_nc_f32_f16( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + enum xnn_status xnn_reshape_fully_connected_nc_f32( xnn_operator_t fully_connected_op, size_t batch_size, pthreadpool_t threadpool); +enum xnn_status xnn_setup_fully_connected_nc_f32_f16( + xnn_operator_t fully_connected_op, + const float* input, + float* output); + enum xnn_status xnn_setup_fully_connected_nc_f32( xnn_operator_t fully_connected_op, const float* input, diff --git a/src/operators/fully-connected-nc.c 
b/src/operators/fully-connected-nc.c index 47c40ebfae9..fe5a4f4d26f 100644 --- a/src/operators/fully-connected-nc.c +++ b/src/operators/fully-connected-nc.c @@ -1112,6 +1112,39 @@ enum xnn_status xnn_create_fully_connected_nc_qd8_f16_qc8w( fully_connected_op_out); } +enum xnn_status xnn_create_fully_connected_nc_f32_f16( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + const void* kernel, + const void* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out) +{ + float *fp32_kernel_buffer = (float*) malloc(input_channels * output_channels * sizeof(float)); + float *fp32_bias_buffer = NULL; + const xnn_float16 *f16_kernel = (const xnn_float16*) kernel; + const xnn_float16 *f16_bias = (const xnn_float16*) bias; + for (size_t i = 0; i < input_channels * output_channels; ++i) { + fp32_kernel_buffer[i] = xnn_float16_to_float(f16_kernel[i]); + } + if (bias) { + fp32_bias_buffer = (float*) malloc(output_channels * sizeof(float)); + for (size_t i = 0; i < output_channels; ++i) { + fp32_bias_buffer[i] = xnn_float16_to_float(f16_bias[i]); + } + } + enum xnn_status status = xnn_create_fully_connected_nc_f32(input_channels, output_channels, input_stride, output_stride, fp32_kernel_buffer, fp32_bias_buffer, output_min, output_max, flags, code_cache, weights_cache, fully_connected_op_out); + free(fp32_kernel_buffer); + free(fp32_bias_buffer); + return status; +} + enum xnn_status xnn_create_fully_connected_nc_f32( size_t input_channels, size_t output_channels, @@ -1836,6 +1869,14 @@ enum xnn_status xnn_reshape_fully_connected_nc_f16( threadpool); } +enum xnn_status xnn_reshape_fully_connected_nc_f32_f16( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool) +{ + return xnn_reshape_fully_connected_nc_f32(fully_connected_op, batch_size, threadpool); +} + enum xnn_status 
xnn_reshape_fully_connected_nc_f32( xnn_operator_t fully_connected_op, size_t batch_size, @@ -2129,6 +2170,14 @@ enum xnn_status xnn_setup_fully_connected_nc_f16( input, output, /*quantization_params=*/NULL); } +enum xnn_status xnn_setup_fully_connected_nc_f32_f16( + xnn_operator_t fully_connected_op, + const float* input, + float* output) +{ + return xnn_setup_fully_connected_nc_f32(fully_connected_op, input, output); +} + enum xnn_status xnn_setup_fully_connected_nc_f32( xnn_operator_t fully_connected_op, const float* input, diff --git a/test/fully-connected-nc.cc b/test/fully-connected-nc.cc index d5cb04ec7e6..88aa9b9ddac 100644 --- a/test/fully-connected-nc.cc +++ b/test/fully-connected-nc.cc @@ -708,28 +708,6 @@ TEST(FULLY_CONNECTED_NC_F32, weights_cache_unit_batch_transpose_weights) { .TestF32(); } -#if !XNN_ARCH_WASM && XNN_ENABLE_JIT // TODO(b/290880274) -TEST(FULLY_CONNECTED_NC_F32, unit_batch_with_jit) { - FullyConnectedOperatorTester() - .batch_size(1) - .input_channels(22) - .output_channels(19) - .use_jit(true) - .iterations(3) - .TestF32(); -} - -TEST(FULLY_CONNECTED_NC_F32, small_batch_with_jit) { - FullyConnectedOperatorTester() - .batch_size(12) - .input_channels(22) - .output_channels(19) - .use_jit(true) - .iterations(3) - .TestF32(); -} -#endif // !XNN_ARCH_WASM && XNN_ENABLE_JIT - TEST(FULLY_CONNECTED_NC_F32_QC4W, unit_batch) { FullyConnectedOperatorTester() .batch_size(1) diff --git a/test/fully-connected-operator-tester.h b/test/fully-connected-operator-tester.h index da7778bf3df..5fa9c7e570c 100644 --- a/test/fully-connected-operator-tester.h +++ b/test/fully-connected-operator-tester.h @@ -44,6 +44,7 @@ class FullyConnectedOperatorTester { public: enum class WeightsType { Default, + FP16, FP32, }; @@ -2165,24 +2166,42 @@ class FullyConnectedOperatorTester { } } - void TestF32() const { - ASSERT_EQ(weights_type(), WeightsType::Default); + void TestF32() { + weights_type_ = WeightsType::FP32; + TestF32WeightsType(); + weights_type_ = 
WeightsType::FP16; + TestF32WeightsType(); + } + + void TestF32WeightsType() const { + switch (weights_type()) { + case WeightsType::FP16: + break; + case WeightsType::FP32: + break; + default: + GTEST_FAIL() << "unexpected weights type"; + } xnnpack::ReplicableRandomDevice rng; std::uniform_real_distribution f32dist(0.1f, 1.0f); std::vector input(XNN_EXTRA_BYTES / sizeof(float) + (batch_size() - 1) * input_stride() + input_channels()); - std::vector kernel(output_channels() * input_channels()); - std::vector bias(output_channels()); + std::vector kernel(output_channels() * input_channels()); + std::vector kernel_as_float(kernel.size()); + std::vector bias(output_channels()); + std::vector bias_as_float(bias.size()); std::vector output((batch_size() - 1) * output_stride() + output_channels()); std::vector output_ref(batch_size() * output_channels()); for (size_t iteration = 0; iteration < iterations(); iteration++) { std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); std::generate(kernel.begin(), kernel.end(), [&]() { return f32dist(rng); }); + std::copy(kernel.cbegin(), kernel.cend(), kernel_as_float.begin()); std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); }); - std::fill(output.begin(), output.end(), nanf("")); + std::copy(bias.cbegin(), bias.cend(), bias_as_float.begin()); + std::fill(output.begin(), output.end(), std::nanf("")); // Compute reference results, without renormalization. if (has_bias()) { @@ -2245,14 +2264,33 @@ class FullyConnectedOperatorTester { } } - const xnn_status status = xnn_create_fully_connected_nc_f32( - input_channels(), output_channels(), - input_stride(), output_stride(), - kernel.data(), has_bias() ? bias.data() : nullptr, - output_min, output_max, - transpose_weights() ? 
XNN_FLAG_TRANSPOSE_WEIGHTS : 0, - auto_code_cache, auto_weights_cache.get(), - &fully_connected_op); + xnn_status status; + + switch (weights_type()) { + case WeightsType::FP32: + status = xnn_create_fully_connected_nc_f32( + input_channels(), output_channels(), + input_stride(), output_stride(), + kernel_as_float.data(), has_bias() ? bias_as_float.data() : nullptr, + output_min, output_max, + transpose_weights() ? XNN_FLAG_TRANSPOSE_WEIGHTS : 0, + auto_code_cache, auto_weights_cache.get(), + &fully_connected_op); + break; + case WeightsType::FP16: + status = xnn_create_fully_connected_nc_f32_f16( + input_channels(), output_channels(), + input_stride(), output_stride(), + kernel.data(), has_bias() ? bias.data() : nullptr, + output_min, output_max, + transpose_weights() ? XNN_FLAG_TRANSPOSE_WEIGHTS : 0, + auto_code_cache, auto_weights_cache.get(), + &fully_connected_op); + break; + default: + GTEST_FAIL() <<"unexpected weights type"; + } + if (status == xnn_status_unsupported_hardware) { GTEST_SKIP(); } @@ -2266,24 +2304,42 @@ class FullyConnectedOperatorTester { // Smart pointer to automatically delete fully_connected_op. std::unique_ptr auto_fully_connected_op(fully_connected_op, xnn_delete_operator); - #if XNN_PLATFORM_JIT - if (use_jit()) { - // Check that we actually generated code. 
- ASSERT_GT(code_cache.cache.code.size, 0); - xnn_finalize_code_memory(&code_cache.cache.code); - } - #endif - - ASSERT_EQ(xnn_status_success, - xnn_reshape_fully_connected_nc_f32( - fully_connected_op, - batch_size(), - /*threadpool=*/nullptr)); - - ASSERT_EQ(xnn_status_success, - xnn_setup_fully_connected_nc_f32( - fully_connected_op, - input.data(), output.data())); + switch (weights_type()) { + case WeightsType::FP32: + ASSERT_EQ(xnn_status_success, + xnn_reshape_fully_connected_nc_f32( + fully_connected_op, + batch_size(), + /*threadpool=*/nullptr)); + break; + case WeightsType::FP16: + ASSERT_EQ(xnn_status_success, + xnn_reshape_fully_connected_nc_f32_f16( + fully_connected_op, + batch_size(), + /*threadpool=*/nullptr)); + break; + default: + GTEST_FAIL() <<"unexpected weights type"; + } + + + switch (weights_type()) { + case WeightsType::FP32: + ASSERT_EQ(xnn_status_success, + xnn_setup_fully_connected_nc_f32( + fully_connected_op, + input.data(), output.data())); + break; + case WeightsType::FP16: + ASSERT_EQ(xnn_status_success, + xnn_setup_fully_connected_nc_f32_f16( + fully_connected_op, + input.data(), output.data())); + break; + default: + GTEST_FAIL() <<"unexpected weights type"; + } ASSERT_EQ(xnn_status_success, xnn_run_operator(fully_connected_op, /*threadpool=*/nullptr)); @@ -2293,40 +2349,75 @@ class FullyConnectedOperatorTester { if (use_weights_cache()) { // We already finalized the code cache, so create a new code cache if we are testing JIT. xnn_code_cache_t auto_inner_code_cache = nullptr; - #if XNN_PLATFORM_JIT - xnn_code_cache inner_code_cache; - if (use_jit()) { - xnn_init_code_cache(&inner_code_cache); - auto_inner_code_cache.reset(&inner_code_cache); - } - #endif // Create another operator with the same weights cache. 
xnn_operator_t fully_connected_op2 = nullptr; size_t old_weights_cache_size = internal_weights_cache->cache.weights.size; - ASSERT_EQ(xnn_status_success, - xnn_create_fully_connected_nc_f32( - input_channels(), output_channels(), input_stride(), - output_stride(), kernel.data(), - has_bias() ? bias.data() : nullptr, output_min, - output_max, - transpose_weights() ? XNN_FLAG_TRANSPOSE_WEIGHTS : 0, - auto_inner_code_cache, auto_weights_cache.get(), - &fully_connected_op2)); + switch (weights_type()) { + case WeightsType::FP32: + ASSERT_EQ(xnn_status_success, + xnn_create_fully_connected_nc_f32( + input_channels(), output_channels(), input_stride(), + output_stride(), kernel_as_float.data(), + has_bias() ? bias_as_float.data() : nullptr, output_min, + output_max, + transpose_weights() ? XNN_FLAG_TRANSPOSE_WEIGHTS : 0, + auto_inner_code_cache, auto_weights_cache.get(), + &fully_connected_op2)); + break; + case WeightsType::FP16: + ASSERT_EQ(xnn_status_success, + xnn_create_fully_connected_nc_f32_f16( + input_channels(), output_channels(), + input_stride(), output_stride(), + kernel.data(), has_bias() ? bias.data() : nullptr, + output_min, output_max, + transpose_weights() ? 
XNN_FLAG_TRANSPOSE_WEIGHTS : 0, + auto_code_cache, auto_weights_cache.get(), + &fully_connected_op2)); + break; + default: + GTEST_FAIL() <<"unexpected weights type"; + } ASSERT_NE(nullptr, fully_connected_op2); std::unique_ptr auto_fully_connected_op(fully_connected_op2, xnn_delete_operator); - ASSERT_EQ(xnn_status_success, - xnn_reshape_fully_connected_nc_f32( - fully_connected_op2, - batch_size(), - /*threadpool=*/nullptr)); + switch (weights_type()) { + case WeightsType::FP32: + ASSERT_EQ(xnn_status_success, + xnn_reshape_fully_connected_nc_f32( + fully_connected_op2, + batch_size(), + /*threadpool=*/nullptr)); + break; + case WeightsType::FP16: + ASSERT_EQ(xnn_status_success, + xnn_reshape_fully_connected_nc_f32_f16( + fully_connected_op2, + batch_size(), + /*threadpool=*/nullptr)); + break; + default: + GTEST_FAIL() <<"unexpected weights type"; + } std::vector output2(output.size(), nanf("")); - ASSERT_EQ(xnn_status_success, - xnn_setup_fully_connected_nc_f32( - fully_connected_op2, - input.data(), output2.data())); + switch (weights_type()) { + case WeightsType::FP32: + ASSERT_EQ(xnn_status_success, + xnn_setup_fully_connected_nc_f32( + fully_connected_op2, + input.data(), output2.data())); + break; + case WeightsType::FP16: + ASSERT_EQ(xnn_status_success, + xnn_setup_fully_connected_nc_f32_f16( + fully_connected_op2, + input.data(), output2.data())); + break; + default: + GTEST_FAIL() <<"unexpected weights type"; + } ASSERT_EQ(xnn_status_success, xnn_run_operator(fully_connected_op2, /*threadpool=*/nullptr)); From f09f91adeedf88dfe03f97a2e8d848072d2ae187 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Sat, 21 Sep 2024 00:51:24 -0700 Subject: [PATCH 24/50] Replace (some) end2end benchmarks with subgraph benchmarks The existing "end2end" benchmarks are not actually end-to-end, because they skip the subgraph API, which contains nontrivial optimizations and work. This change removes these existing end2end benchmarks, and adds subgraph benchmarks instead. 
Currently, only qs8, fp16, and fp32 mobilenet benchmarks are included. More datatypes and kinds of models are still TODO. This also allows closing a const correctness loophole in the configs that existed specifically to enable these benchmarks to hack the configs. There is also coverage of the individual ops at the microkernel level. PiperOrigin-RevId: 677129403 --- BUILD.bazel | 16 - CMakeLists.txt | 70 +- bench/BUILD.bazel | 177 - bench/end2end.cc | 201 - bench/end2end.h | 33 - bench/f16-dwconv-e2e.cc | 736 --- bench/f16-gemm-e2e.cc | 452 -- bench/f32-dwconv-e2e.cc | 2148 -------- bench/f32-gemm-e2e.cc | 2338 -------- bench/models/BUILD | 42 + bench/models/benchmark.cc | 179 + bench/models/fp32-mobilenet-v1.cc | 1747 ++++++ bench/models/fp32-mobilenet-v2.cc | 3537 +++++++++++++ bench/models/fp32-mobilenet-v3-large.cc | 5407 +++++++++++++++++++ bench/models/fp32-mobilenet-v3-small.cc | 4757 +++++++++++++++++ bench/models/models.h | 19 + bench/models/qs8-mobilenet-v2.cc | 3542 +++++++++++++ bench/qs8-dwconv-e2e.cc | 1937 ------- bench/qs8-gemm-e2e.cc | 2541 --------- bench/qu8-dwconv-e2e.cc | 1453 ----- bench/qu8-gemm-e2e.cc | 1171 ---- models/BUILD | 314 -- models/fp16-mobilenet-v1.cc | 1543 ------ models/fp16-mobilenet-v2.cc | 3183 ----------- models/fp16-mobilenet-v3-large.cc | 5023 ------------------ models/fp16-mobilenet-v3-small.cc | 4373 --------------- models/fp16-sparse-mobilenet-v1.cc | 1448 ----- models/fp16-sparse-mobilenet-v2.cc | 3032 ----------- models/fp16-sparse-mobilenet-v3-large.cc | 4814 ----------------- models/fp16-sparse-mobilenet-v3-small.cc | 4189 --------------- models/fp32-mobilenet-v1.cc | 1536 ------ models/fp32-mobilenet-v2.cc | 3240 ------------ models/fp32-mobilenet-v3-large.cc | 5080 ------------------ models/fp32-mobilenet-v3-small.cc | 4430 ---------------- models/fp32-sparse-mobilenet-v1.cc | 1448 ----- models/fp32-sparse-mobilenet-v2.cc | 3031 ----------- models/fp32-sparse-mobilenet-v3-large.cc | 4814 ----------------- 
models/fp32-sparse-mobilenet-v3-small.cc | 4189 --------------- models/qs8-mobilenet-v1.cc | 1564 ------ models/qs8-mobilenet-v2.cc | 3248 ------------ models/qs8-qc8w-mobilenet-v1.cc | 1621 ------ models/qs8-qc8w-mobilenet-v2.cc | 3355 ------------ models/qu8-mobilenet-v1.cc | 1774 ------- models/qu8-mobilenet-v2.cc | 3559 ------------- models/qu8-mobilenet-v3-large.cc | 6166 ---------------------- models/qu8-mobilenet-v3-small.cc | 5527 ------------------- src/configs/dwconv-config.c | 10 +- src/configs/gemm-config.c | 28 +- src/xnnpack/config.h | 38 +- src/xnnpack/models.h | 77 - 50 files changed, 19285 insertions(+), 95872 deletions(-) delete mode 100644 bench/end2end.cc delete mode 100644 bench/end2end.h delete mode 100644 bench/f16-dwconv-e2e.cc delete mode 100644 bench/f16-gemm-e2e.cc delete mode 100644 bench/f32-dwconv-e2e.cc delete mode 100644 bench/f32-gemm-e2e.cc create mode 100644 bench/models/BUILD create mode 100644 bench/models/benchmark.cc create mode 100644 bench/models/fp32-mobilenet-v1.cc create mode 100644 bench/models/fp32-mobilenet-v2.cc create mode 100644 bench/models/fp32-mobilenet-v3-large.cc create mode 100644 bench/models/fp32-mobilenet-v3-small.cc create mode 100644 bench/models/models.h create mode 100644 bench/models/qs8-mobilenet-v2.cc delete mode 100644 bench/qs8-dwconv-e2e.cc delete mode 100644 bench/qs8-gemm-e2e.cc delete mode 100644 bench/qu8-dwconv-e2e.cc delete mode 100644 bench/qu8-gemm-e2e.cc delete mode 100644 models/BUILD delete mode 100644 models/fp16-mobilenet-v1.cc delete mode 100644 models/fp16-mobilenet-v2.cc delete mode 100644 models/fp16-mobilenet-v3-large.cc delete mode 100644 models/fp16-mobilenet-v3-small.cc delete mode 100644 models/fp16-sparse-mobilenet-v1.cc delete mode 100644 models/fp16-sparse-mobilenet-v2.cc delete mode 100644 models/fp16-sparse-mobilenet-v3-large.cc delete mode 100644 models/fp16-sparse-mobilenet-v3-small.cc delete mode 100644 models/fp32-mobilenet-v1.cc delete mode 100644 
models/fp32-mobilenet-v2.cc delete mode 100644 models/fp32-mobilenet-v3-large.cc delete mode 100644 models/fp32-mobilenet-v3-small.cc delete mode 100644 models/fp32-sparse-mobilenet-v1.cc delete mode 100644 models/fp32-sparse-mobilenet-v2.cc delete mode 100644 models/fp32-sparse-mobilenet-v3-large.cc delete mode 100644 models/fp32-sparse-mobilenet-v3-small.cc delete mode 100644 models/qs8-mobilenet-v1.cc delete mode 100644 models/qs8-mobilenet-v2.cc delete mode 100644 models/qs8-qc8w-mobilenet-v1.cc delete mode 100644 models/qs8-qc8w-mobilenet-v2.cc delete mode 100644 models/qu8-mobilenet-v1.cc delete mode 100644 models/qu8-mobilenet-v2.cc delete mode 100644 models/qu8-mobilenet-v3-large.cc delete mode 100644 models/qu8-mobilenet-v3-small.cc delete mode 100644 src/xnnpack/models.h diff --git a/BUILD.bazel b/BUILD.bazel index 2da97e46eff..0ee742076cf 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -17,7 +17,6 @@ load( "xnnpack_min_size_copts", "xnnpack_slinky_deps", "xnnpack_slinky_srcs", - "xnnpack_std_cxxopts", "xnnpack_transitive_source_list", "xnnpack_visibility", ) @@ -1243,21 +1242,6 @@ xnnpack_cc_library( ], ) -############################# End-to-end benchmarks ############################ - -# Helper library for benchmarks to depend on. -xnnpack_cc_library( - name = "models_h", - hdrs = ["src/xnnpack/models.h"], - copts = xnnpack_std_cxxopts(), - deps = [ - ":XNNPACK", - ":aligned_allocator", - ":common", - "@FP16", - ], -) - ############################# Build configurations ############################# # Enables usage of ARM FP16 (FP16 arithmetics) scalar kernels. 
diff --git a/CMakeLists.txt b/CMakeLists.txt index b2b413f1749..7c56cbe3019 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1923,60 +1923,24 @@ IF(XNNPACK_BUILD_BENCHMARKS) IF(XNNPACK_BUILD_LIBRARY) # ---[ Build end-to-end microbenchmarks - ADD_LIBRARY(bench-models STATIC - models/fp16-mobilenet-v1.cc - models/fp16-mobilenet-v2.cc - models/fp16-mobilenet-v3-large.cc - models/fp16-mobilenet-v3-small.cc - models/fp16-sparse-mobilenet-v1.cc - models/fp16-sparse-mobilenet-v2.cc - models/fp16-sparse-mobilenet-v3-large.cc - models/fp16-sparse-mobilenet-v3-small.cc - models/fp32-mobilenet-v1.cc - models/fp32-mobilenet-v2.cc - models/fp32-mobilenet-v3-large.cc - models/fp32-mobilenet-v3-small.cc - models/fp32-sparse-mobilenet-v1.cc - models/fp32-sparse-mobilenet-v2.cc - models/fp32-sparse-mobilenet-v3-large.cc - models/fp32-sparse-mobilenet-v3-small.cc - models/qs8-qc8w-mobilenet-v1.cc - models/qs8-qc8w-mobilenet-v2.cc - models/qs8-mobilenet-v1.cc - models/qs8-mobilenet-v2.cc - models/qu8-mobilenet-v1.cc - models/qu8-mobilenet-v2.cc - models/qu8-mobilenet-v3-large.cc - models/qu8-mobilenet-v3-small.cc) - SET_TARGET_PROPERTIES(bench-models PROPERTIES CXX_EXTENSIONS YES) + ADD_LIBRARY(models STATIC + bench/models/fp32-mobilenet-v1.cc + bench/models/fp32-mobilenet-v2.cc + bench/models/fp32-mobilenet-v3-large.cc + bench/models/fp32-mobilenet-v3-small.cc + bench/models/qs8-mobilenet-v2.cc) + SET_TARGET_PROPERTIES(models PROPERTIES CXX_EXTENSIONS YES) + TARGET_INCLUDE_DIRECTORIES(models PRIVATE .) + TARGET_LINK_LIBRARIES(models PRIVATE XNNPACK) + + ADD_EXECUTABLE(bench-models bench/models/benchmark.cc) TARGET_INCLUDE_DIRECTORIES(bench-models PRIVATE .) 
- TARGET_LINK_LIBRARIES(bench-models PRIVATE fp16 benchmark::benchmark) - TARGET_LINK_LIBRARIES(bench-models PRIVATE XNNPACK bench-utils) - - SET(LIBRARY_END2END_BENCHMARKS - end2end - f16-gemm-e2e - f16-dwconv-e2e - f32-dwconv-e2e - f32-gemm-e2e - qs8-dwconv-e2e - qs8-gemm-e2e - qu8-gemm-e2e - qu8-dwconv-e2e) - FOREACH(BENCH ${LIBRARY_END2END_BENCHMARKS}) - ADD_EXECUTABLE(${BENCH}-bench bench/${BENCH}.cc) - TARGET_INCLUDE_DIRECTORIES(${BENCH}-bench PRIVATE .) - TARGET_LINK_LIBRARIES(${BENCH}-bench PRIVATE - bench-models - bench-utils - benchmark::benchmark - fp16 - hardware-config - logging - microkernels-all - microparams-init - XNNPACK) - ENDFOREACH() + TARGET_LINK_LIBRARIES(bench-models PRIVATE + bench-utils + benchmark::benchmark + fp16 + models + XNNPACK) # ---[ Build operator-level microbenchmarks SET(LIBRARY_OPERATOR_BENCHMARKS diff --git a/bench/BUILD.bazel b/bench/BUILD.bazel index 5e9006dd815..a8758ea9539 100644 --- a/bench/BUILD.bazel +++ b/bench/BUILD.bazel @@ -654,180 +654,3 @@ xnnpack_benchmark( ], deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps(), ) - -############################### E2E benchmarks ############################### - -xnnpack_benchmark( - name = "f16_dwconv_e2e_bench", - srcs = [ - "end2end.h", - "f16-dwconv-e2e.cc", - ], - tags = xnnpack_slow_benchmark_tags(), - deps = MICROKERNEL_BENCHMARK_DEPS + [ - "//:XNNPACK", - "//:microkernel_configs", - "//:models_h", - "//models:fp16_mobilenet_v1", - "//models:fp16_mobilenet_v2", - "//models:fp16_mobilenet_v3_large", - "//models:fp16_mobilenet_v3_small", - ], -) - -xnnpack_benchmark( - name = "f16_gemm_e2e_bench", - srcs = [ - "end2end.h", - "f16-gemm-e2e.cc", - ], - tags = xnnpack_slow_benchmark_tags(), - deps = MICROKERNEL_BENCHMARK_DEPS + [ - "//:XNNPACK", - "//:microkernel_configs", - "//:models_h", - "//models:fp16_mobilenet_v1", - "//models:fp16_mobilenet_v2", - "//models:fp16_mobilenet_v3_large", - "//models:fp16_mobilenet_v3_small", - ], -) - -xnnpack_benchmark( - name 
= "f32_dwconv_e2e_bench", - srcs = [ - "end2end.h", - "f32-dwconv-e2e.cc", - ], - tags = xnnpack_slow_benchmark_tags(), - deps = MICROKERNEL_BENCHMARK_DEPS + [ - "//:XNNPACK", - "//:microkernel_configs", - "//:models_h", - "//models:fp32_mobilenet_v1", - "//models:fp32_mobilenet_v2", - "//models:fp32_mobilenet_v3_large", - "//models:fp32_mobilenet_v3_small", - ], -) - -xnnpack_benchmark( - name = "f32_gemm_e2e_bench", - srcs = [ - "end2end.h", - "f32-gemm-e2e.cc", - ], - tags = xnnpack_slow_benchmark_tags(), - deps = MICROKERNEL_BENCHMARK_DEPS + [ - "//:XNNPACK", - "//:microkernel_configs", - "//:models_h", - "//models:fp32_mobilenet_v1", - "//models:fp32_mobilenet_v2", - "//models:fp32_mobilenet_v3_large", - "//models:fp32_mobilenet_v3_small", - ], -) - -xnnpack_benchmark( - name = "qs8_dwconv_e2e_bench", - srcs = [ - "end2end.h", - "qs8-dwconv-e2e.cc", - ], - tags = xnnpack_slow_benchmark_tags(), - deps = MICROKERNEL_BENCHMARK_DEPS + [ - "//:XNNPACK", - "//:microkernel_configs", - "//:models_h", - "//models:qs8_mobilenet_v1", - "//models:qs8_mobilenet_v2", - ], -) - -xnnpack_benchmark( - name = "qs8_gemm_e2e_bench", - srcs = [ - "end2end.h", - "qs8-gemm-e2e.cc", - ], - tags = xnnpack_slow_benchmark_tags(), - deps = MICROKERNEL_BENCHMARK_DEPS + [ - "//:XNNPACK", - "//:microkernel_configs", - "//:models_h", - "//models:qs8_mobilenet_v1", - "//models:qs8_mobilenet_v2", - ], -) - -xnnpack_benchmark( - name = "qu8_gemm_e2e_bench", - srcs = [ - "end2end.h", - "qu8-gemm-e2e.cc", - ], - tags = xnnpack_slow_benchmark_tags(), - deps = MICROKERNEL_BENCHMARK_DEPS + [ - "//:XNNPACK", - "//:microkernel_configs", - "//:models_h", - "//models:qu8_mobilenet_v1", - "//models:qu8_mobilenet_v2", - "//models:qu8_mobilenet_v3_large", - "//models:qu8_mobilenet_v3_small", - ], -) - -xnnpack_benchmark( - name = "qu8_dwconv_e2e_bench", - srcs = [ - "end2end.h", - "qu8-dwconv-e2e.cc", - ], - tags = xnnpack_slow_benchmark_tags(), - deps = MICROKERNEL_BENCHMARK_DEPS + [ - "//:XNNPACK", - 
"//:microkernel_configs", - "//:models_h", - "//models:qu8_mobilenet_v1", - "//models:qu8_mobilenet_v2", - "//models:qu8_mobilenet_v3_large", - "//models:qu8_mobilenet_v3_small", - ], -) - -xnnpack_benchmark( - name = "end2end_bench", - srcs = ["end2end.cc"], - tags = xnnpack_slow_benchmark_tags(), - deps = [ - ":bench_utils", - "//:XNNPACK", - "//:models_h", - "//models:fp16_mobilenet_v1", - "//models:fp16_mobilenet_v2", - "//models:fp16_mobilenet_v3_large", - "//models:fp16_mobilenet_v3_small", - "//models:fp16_sparse_mobilenet_v1", - "//models:fp16_sparse_mobilenet_v2", - "//models:fp16_sparse_mobilenet_v3_large", - "//models:fp16_sparse_mobilenet_v3_small", - "//models:fp32_mobilenet_v1", - "//models:fp32_mobilenet_v2", - "//models:fp32_mobilenet_v3_large", - "//models:fp32_mobilenet_v3_small", - "//models:fp32_sparse_mobilenet_v1", - "//models:fp32_sparse_mobilenet_v2", - "//models:fp32_sparse_mobilenet_v3_large", - "//models:fp32_sparse_mobilenet_v3_small", - "//models:qs8_mobilenet_v1", - "//models:qs8_mobilenet_v2", - "//models:qs8_qc8w_mobilenet_v1", - "//models:qs8_qc8w_mobilenet_v2", - "//models:qu8_mobilenet_v1", - "//models:qu8_mobilenet_v2", - "//models:qu8_mobilenet_v3_large", - "//models:qu8_mobilenet_v3_small", - ], -) diff --git a/bench/end2end.cc b/bench/end2end.cc deleted file mode 100644 index ed70d67b955..00000000000 --- a/bench/end2end.cc +++ /dev/null @@ -1,201 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include -#include - -#include - -#include "bench/utils.h" - -#include "xnnpack.h" -#include "xnnpack/models.h" - - -static void End2EndBenchmark( - benchmark::State& state, - models::ExecutionPlanFactory model_factory) -{ - if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) { - state.SkipWithError("failed to initialize XNNPACK"); - return; - } - - const size_t num_threads = state.range(0); - std::unique_ptr threadpool( - pthreadpool_create(num_threads), pthreadpool_destroy); - - auto execution_plan = model_factory(threadpool.get()); - if (execution_plan.empty()) { - state.SkipWithError("failed to create a model"); - return; - } - - for (auto _ : state) { - for (const std::unique_ptr& op : execution_plan) { - xnn_status status = xnn_run_operator(op.get(), threadpool.get()); - if (status != xnn_status_success) { - state.SkipWithError("failed to run a model"); - return; - } - } - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } -} - -static void FP32MobileNetV1(benchmark::State& state) { - End2EndBenchmark(state, models::FP32MobileNetV1); -} - -static void FP32MobileNetV2(benchmark::State& state) { - End2EndBenchmark(state, models::FP32MobileNetV2); -} - -static void FP32MobileNetV3Large(benchmark::State& state) { - End2EndBenchmark(state, models::FP32MobileNetV3Large); -} - -static void FP32MobileNetV3Small(benchmark::State& state) { - End2EndBenchmark(state, models::FP32MobileNetV3Small); -} - -static void FP32Sparse80MobileNetV1(benchmark::State& state) { - End2EndBenchmark(state, [](pthreadpool_t threadpool) { - return models::FP32SparseMobileNetV1(0.8f, threadpool); - }); -} - -static void FP32Sparse80MobileNetV2(benchmark::State& state) { - End2EndBenchmark(state, [](pthreadpool_t threadpool) { - return models::FP32SparseMobileNetV2(0.8f, threadpool); - }); -} - -static void 
FP32Sparse80MobileNetV3Large(benchmark::State& state) { - End2EndBenchmark(state, [](pthreadpool_t threadpool) { - return models::FP32SparseMobileNetV3Large(0.8f, threadpool); - }); -} - -static void FP32Sparse80MobileNetV3Small(benchmark::State& state) { - End2EndBenchmark(state, [](pthreadpool_t threadpool) { - return models::FP32SparseMobileNetV3Small(0.8f, threadpool); - }); -} - -static void FP16MobileNetV1(benchmark::State& state) { - End2EndBenchmark(state, models::FP16MobileNetV1); -} - -static void FP16MobileNetV2(benchmark::State& state) { - End2EndBenchmark(state, models::FP16MobileNetV2); -} - -static void FP16MobileNetV3Large(benchmark::State& state) { - End2EndBenchmark(state, models::FP16MobileNetV3Large); -} - -static void FP16MobileNetV3Small(benchmark::State& state) { - End2EndBenchmark(state, models::FP16MobileNetV3Small); -} - -static void FP16Sparse80MobileNetV1(benchmark::State& state) { - End2EndBenchmark(state, [](pthreadpool_t threadpool) { - return models::FP16SparseMobileNetV1(0.8f, threadpool); - }); -} - -static void FP16Sparse80MobileNetV2(benchmark::State& state) { - End2EndBenchmark(state, [](pthreadpool_t threadpool) { - return models::FP16SparseMobileNetV2(0.8f, threadpool); - }); -} - -static void FP16Sparse80MobileNetV3Large(benchmark::State& state) { - End2EndBenchmark(state, [](pthreadpool_t threadpool) { - return models::FP16SparseMobileNetV3Large(0.8f, threadpool); - }); -} - -static void FP16Sparse80MobileNetV3Small(benchmark::State& state) { - End2EndBenchmark(state, [](pthreadpool_t threadpool) { - return models::FP16SparseMobileNetV3Small(0.8f, threadpool); - }); -} - -static void QC8MobileNetV1(benchmark::State& state) { - End2EndBenchmark(state, models::QC8MobileNetV1); -} - -static void QC8MobileNetV2(benchmark::State& state) { - End2EndBenchmark(state, models::QC8MobileNetV2); -} - -static void QS8MobileNetV1(benchmark::State& state) { - End2EndBenchmark(state, models::QS8MobileNetV1); -} - -static void 
QS8MobileNetV2(benchmark::State& state) { - End2EndBenchmark(state, models::QS8MobileNetV2); -} - -static void QU8MobileNetV1(benchmark::State& state) { - End2EndBenchmark(state, models::QU8MobileNetV1); -} - -static void QU8MobileNetV2(benchmark::State& state) { - End2EndBenchmark(state, models::QU8MobileNetV2); -} - -static void QU8MobileNetV3Large(benchmark::State& state) { - End2EndBenchmark(state, models::QU8MobileNetV3Large); -} - -static void QU8MobileNetV3Small(benchmark::State& state) { - End2EndBenchmark(state, models::QU8MobileNetV3Small); -} - -BENCHMARK(FP32MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(FP32MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(FP32MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(FP32MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); - -BENCHMARK(FP32Sparse80MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(FP32Sparse80MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(FP32Sparse80MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(FP32Sparse80MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); - -BENCHMARK(FP16MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(FP16MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); 
-BENCHMARK(FP16MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(FP16MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); - -BENCHMARK(FP16Sparse80MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(FP16Sparse80MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(FP16Sparse80MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(FP16Sparse80MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); - -BENCHMARK(QC8MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(QC8MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); - -BENCHMARK(QS8MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(QS8MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); - -BENCHMARK(QU8MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(QU8MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(QU8MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(QU8MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/end2end.h b/bench/end2end.h deleted file mode 100644 index 5a05071e15e..00000000000 --- 
a/bench/end2end.h +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#pragma once - -#include - -#include "xnnpack/models.h" - - -#define BENCHMARK_FP16_END2END(benchmark_fn) \ - BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v1, models::FP16MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime(); \ - BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::FP16MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime(); \ - BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_large, models::FP16MobileNetV3Large)->Unit(benchmark::kMicrosecond)->UseRealTime(); \ - BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_small, models::FP16MobileNetV3Small)->Unit(benchmark::kMicrosecond)->UseRealTime(); - -#define BENCHMARK_FP32_END2END(benchmark_fn) \ - BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v1, models::FP32MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime(); \ - BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::FP32MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime(); \ - BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_large, models::FP32MobileNetV3Large)->Unit(benchmark::kMicrosecond)->UseRealTime(); \ - BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_small, models::FP32MobileNetV3Small)->Unit(benchmark::kMicrosecond)->UseRealTime(); - -#define BENCHMARK_QS8_END2END(benchmark_fn) \ - BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v1, models::QS8MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime(); \ - BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::QS8MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime(); - -#define BENCHMARK_QU8_END2END(benchmark_fn) \ - BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v1, models::QU8MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime(); \ - BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::QU8MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime(); \ - 
BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3, models::QU8MobileNetV3Large)->Unit(benchmark::kMicrosecond)->UseRealTime(); \ - BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3, models::QU8MobileNetV3Small)->Unit(benchmark::kMicrosecond)->UseRealTime(); diff --git a/bench/f16-dwconv-e2e.cc b/bench/f16-dwconv-e2e.cc deleted file mode 100644 index f2985af7eee..00000000000 --- a/bench/f16-dwconv-e2e.cc +++ /dev/null @@ -1,736 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include -#include - -#include "bench/end2end.h" -#include "bench/utils.h" -#include - -#include "xnnpack.h" -#include "xnnpack/config.h" -#include "xnnpack/dwconv.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/models.h" - - -static void DWConvEnd2EndBenchmark( - benchmark::State& state, - models::ExecutionPlanFactory model_factory, - xnn_f16_dwconv_minmax_unipass_ukernel_fn dwconv_minmax, - xnn_init_f16_minmax_params_fn init_params, - uint8_t channel_tile, uint8_t primary_tile, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if (isa_check != nullptr && !isa_check(state)) { - return; - } - if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) { - state.SkipWithError("failed to initialize XNNPACK"); - return; - } - - struct xnn_dwconv_config* dwconv_config = xnn_init_f16_dwconv_config(); - if (dwconv_config == nullptr) { - state.SkipWithError("hardware does not support F16 DWCONV"); - return; - } - - // Save dwconv_config so that we can modify it for the benchmark and later restore it. 
- struct xnn_dwconv_config saved_dwconv_params[XNN_MAX_F16_DWCONV_UKERNELS]; - memcpy(saved_dwconv_params, dwconv_config, sizeof(saved_dwconv_params)); - - // Override microkernels chosen in xnn_initialize - for (size_t i = 0; i < XNN_MAX_F16_DWCONV_UKERNELS; i++) { - // Replace only the microkernel with the matching kernel size. - if (dwconv_config[i].primary_tile == primary_tile) { - std::memset(&dwconv_config[i], 0, sizeof(dwconv_config[i])); - - // Note: do not directly assign to dwconv_config[i] because it breaks older gcc. - dwconv_config[i].minmax.unipass = xnn_dwconv_unipass_ukernel_fn(dwconv_minmax); - dwconv_config[i].channel_tile = channel_tile; - dwconv_config[i].channel_subtile = channel_tile; - dwconv_config[i].channel_round = 1; - dwconv_config[i].primary_tile = primary_tile; - dwconv_config[i].init.f16 = init_params; - break; - } - } - - auto execution_plan = model_factory(nullptr); - if (execution_plan.empty()) { - state.SkipWithError("failed to create a model"); - return; - } - - for (auto _ : state) { - for (const std::unique_ptr& op : execution_plan) { - xnn_status status = xnn_run_operator(op.get(), nullptr); - if (status != xnn_status_success) { - state.SkipWithError("failed to run a model"); - return; - } - } - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - // Restore dwconv_config to original state as defined in init.c. 
- memcpy(dwconv_config, saved_dwconv_params, sizeof(saved_dwconv_params)); -} - -static void DWConvEnd2EndBenchmark( - benchmark::State& state, - models::ExecutionPlanFactory model_factory, - xnn_f16_dwconv_minmax_multipass_ukernel_fn dwconv_minmax, - xnn_init_f16_minmax_params_fn init_params, - uint8_t channel_tile, uint8_t channel_subtile, uint8_t channel_round, - uint8_t primary_tile, uint8_t middle_tile, uint8_t last_tile, - uint8_t primary_tile_to_replace, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if (isa_check != nullptr && !isa_check(state)) { - return; - } - - struct xnn_dwconv_config* dwconv_config = xnn_init_f16_dwconv_config(); - if (dwconv_config == nullptr) { - state.SkipWithError("failed to initialize f16 DWCONV config"); - return; - } - - // Save dwconv_convig so that we can modify it for the benchmark and later restore it. - struct xnn_dwconv_config saved_dwconv_params[XNN_MAX_F16_DWCONV_UKERNELS]; - memcpy(saved_dwconv_params, dwconv_config, sizeof(saved_dwconv_params)); - - bool found = false; - for (size_t i = 0; i < XNN_MAX_F16_DWCONV_UKERNELS; i++) { - if (dwconv_config[i].primary_tile == primary_tile_to_replace) { - found = true; - } else if (dwconv_config[i].last_tile != 0) { - // Found a multipass microkernel, replace it. - found = true; - } - } - - if (!found) { - state.SkipWithError("can't replace with multipass"); - return; - } - - // Override microkernels chosen in xnn_initialize - for (size_t i = 0; i < XNN_MAX_F16_DWCONV_UKERNELS; i++) { - // Replace only the microkernel with the matching kernel size. - if (dwconv_config[i].primary_tile == primary_tile_to_replace || - dwconv_config[i].last_tile != 0) { - // Replace either when the primary_tile_to_replace matches, or replace the - // first multipass dwconv microkernel we find. - // TODO(zhin): support specifying target multipass dwconv to replace. 
- std::memset(&dwconv_config[i], 0, sizeof(dwconv_config[i])); - - // Note: do not directly assign to dwconv_config[i] because it breaks older gcc. - dwconv_config[i].minmax.multipass = xnn_dwconv_multipass_ukernel_fn(dwconv_minmax); - dwconv_config[i].channel_tile = channel_tile; - dwconv_config[i].channel_subtile = channel_subtile; - dwconv_config[i].channel_round = channel_round; - dwconv_config[i].primary_tile = primary_tile; - dwconv_config[i].middle_tile = middle_tile; - dwconv_config[i].last_tile = last_tile; - dwconv_config[i].init.f16 = init_params; - break; - } - } - - auto execution_plan = model_factory(nullptr); - if (execution_plan.empty()) { - state.SkipWithError("failed to create a model"); - return; - } - - for (auto _ : state) { - for (const std::unique_ptr& op : execution_plan) { - xnn_status status = xnn_run_operator(op.get(), nullptr); - if (status != xnn_status_success) { - state.SkipWithError("failed to run a model"); - return; - } - } - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - memcpy(dwconv_config, saved_dwconv_params, sizeof(saved_dwconv_params)); -} - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - static void f16_dwconv_4p8c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/8, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_4p8c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/8, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void 
f16_dwconv_4p16c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/16, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_4p16c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/16, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_4p32c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/32, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_4p32c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/32, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - - static void f16_dwconv_9p8c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/8, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_9p8c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/8, /*primary_tile=*/9, 
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_9p16c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/16, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_9p16c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/16, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_9p32c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/32, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_9p32c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/32, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - - static void f16_dwconv_25p8c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_25p8c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith_acc2, 
xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_25p16c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_25p16c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_25p32c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_25p32c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - - static void f16_dwconv_5f5m5l8c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } 
- static void f16_dwconv_5f5m5l8c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_5f5m5l16c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_5f5m5l16c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_5f5m5l32c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_5f5m5l32c8s4r__neonfp16arith_acc2(benchmark::State& state, 
models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - - static void f16_dwconv_6f6m7l8c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_6f6m7l8c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_6f6m7l16c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_6f6m7l16c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - 
xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_6f6m7l32c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_6f6m7l32c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - - static void f16_dwconv_8f8m9l8c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_8f8m9l8c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith_acc2, 
xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_8f8m9l16c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_8f8m9l16c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_8f8m9l32c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - static void f16_dwconv_8f8m9l32c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, 
/*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH); - } - - BENCHMARK_FP16_END2END(f16_dwconv_4p8c__neonfp16arith); - BENCHMARK_FP16_END2END(f16_dwconv_4p8c__neonfp16arith_acc2); - BENCHMARK_FP16_END2END(f16_dwconv_4p16c__neonfp16arith); - BENCHMARK_FP16_END2END(f16_dwconv_4p16c__neonfp16arith_acc2); - BENCHMARK_FP16_END2END(f16_dwconv_4p32c__neonfp16arith); - BENCHMARK_FP16_END2END(f16_dwconv_4p32c__neonfp16arith_acc2); - - BENCHMARK_FP16_END2END(f16_dwconv_9p8c__neonfp16arith); - BENCHMARK_FP16_END2END(f16_dwconv_9p8c__neonfp16arith_acc2); - BENCHMARK_FP16_END2END(f16_dwconv_9p16c__neonfp16arith); - BENCHMARK_FP16_END2END(f16_dwconv_9p16c__neonfp16arith_acc2); - BENCHMARK_FP16_END2END(f16_dwconv_9p32c__neonfp16arith); - BENCHMARK_FP16_END2END(f16_dwconv_9p32c__neonfp16arith_acc2); - - BENCHMARK_FP16_END2END(f16_dwconv_25p8c__neonfp16arith); - BENCHMARK_FP16_END2END(f16_dwconv_25p8c__neonfp16arith_acc2); - BENCHMARK_FP16_END2END(f16_dwconv_25p16c__neonfp16arith); - BENCHMARK_FP16_END2END(f16_dwconv_25p16c__neonfp16arith_acc2); - BENCHMARK_FP16_END2END(f16_dwconv_25p32c__neonfp16arith); - BENCHMARK_FP16_END2END(f16_dwconv_25p32c__neonfp16arith_acc2); - - BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l8c8s4r__neonfp16arith) - BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l8c8s4r__neonfp16arith_acc2) - BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l16c8s4r__neonfp16arith) - BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l16c8s4r__neonfp16arith_acc2) - BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l32c8s4r__neonfp16arith) - BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l32c8s4r__neonfp16arith_acc2) - - BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l8c8s4r__neonfp16arith) - BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l8c8s4r__neonfp16arith_acc2) - BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l16c8s4r__neonfp16arith) - BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l16c8s4r__neonfp16arith_acc2) - 
BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l32c8s4r__neonfp16arith) - BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l32c8s4r__neonfp16arith_acc2) - - BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l8c8s4r__neonfp16arith) - BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l8c8s4r__neonfp16arith_acc2) - BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l16c8s4r__neonfp16arith) - BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l16c8s4r__neonfp16arith_acc2) - BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l32c8s4r__neonfp16arith) - BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l32c8s4r__neonfp16arith_acc2) - -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - static void f16_dwconv_25p8c__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_25p8c__fma3, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3); - } - static void f16_dwconv_25p8c__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_25p8c__fma3_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3); - } - static void f16_dwconv_25p16c__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_25p16c__fma3, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3); - } - static void f16_dwconv_25p16c__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_25p16c__fma3_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3); - } - static void f16_dwconv_25p32c__fma3(benchmark::State& 
state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_25p32c__fma3, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3); - } - static void f16_dwconv_25p32c__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_25p32c__fma3_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3); - } - - static void f16_dwconv_5f5m5l8c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckFMA3); - } - static void f16_dwconv_5f5m5l8c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckFMA3); - } - static void f16_dwconv_5f5m5l16c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckFMA3); - } - static void 
f16_dwconv_5f5m5l16c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckFMA3); - } - static void f16_dwconv_5f5m5l32c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckFMA3); - } - static void f16_dwconv_5f5m5l32c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckFMA3); - } - - static void f16_dwconv_6f6m7l8c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckFMA3); - } - static void f16_dwconv_6f6m7l8c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - 
xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckFMA3); - } - static void f16_dwconv_6f6m7l16c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckFMA3); - } - static void f16_dwconv_6f6m7l16c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckFMA3); - } - static void f16_dwconv_6f6m7l32c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckFMA3); - } - static void f16_dwconv_6f6m7l32c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/6, 
/*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckFMA3); - } - - static void f16_dwconv_8f8m9l8c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckFMA3); - } - static void f16_dwconv_8f8m9l8c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckFMA3); - } - static void f16_dwconv_8f8m9l16c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckFMA3); - } - static void f16_dwconv_8f8m9l16c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckFMA3); - } - static void f16_dwconv_8f8m9l32c8s4r__fma3(benchmark::State& state, 
models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckFMA3); - } - static void f16_dwconv_8f8m9l32c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark( - state, model, - xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3_acc2, xnn_init_f16_minmax_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, - /*isa_check=*/benchmark::utils::CheckFMA3); - } - - BENCHMARK_FP16_END2END(f16_dwconv_25p8c__fma3) - BENCHMARK_FP16_END2END(f16_dwconv_25p8c__fma3_acc2) - BENCHMARK_FP16_END2END(f16_dwconv_25p16c__fma3) - BENCHMARK_FP16_END2END(f16_dwconv_25p16c__fma3_acc2) - BENCHMARK_FP16_END2END(f16_dwconv_25p32c__fma3) - BENCHMARK_FP16_END2END(f16_dwconv_25p32c__fma3_acc2) - - BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l8c8s4r__fma3) - BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l8c8s4r__fma3_acc2) - BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l16c8s4r__fma3) - BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l16c8s4r__fma3_acc2) - BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l32c8s4r__fma3) - BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l32c8s4r__fma3_acc2) - - BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l8c8s4r__fma3) - BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l8c8s4r__fma3_acc2) - BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l16c8s4r__fma3) - BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l16c8s4r__fma3_acc2) - BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l32c8s4r__fma3) - BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l32c8s4r__fma3_acc2) - - BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l8c8s4r__fma3) - BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l8c8s4r__fma3_acc2) - 
BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l16c8s4r__fma3) - BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l16c8s4r__fma3_acc2) - BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l32c8s4r__fma3) - BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l32c8s4r__fma3_acc2) - -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/f16-gemm-e2e.cc b/bench/f16-gemm-e2e.cc deleted file mode 100644 index e6b10550550..00000000000 --- a/bench/f16-gemm-e2e.cc +++ /dev/null @@ -1,452 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include -#include - -#include "bench/end2end.h" -#include "bench/utils.h" -#include - -#include "xnnpack.h" -#include "xnnpack/config.h" -#include "xnnpack/gemm.h" -#include "xnnpack/igemm.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/models.h" -#include "xnnpack/pack.h" - - -static void GEMMEnd2EndBenchmark( - benchmark::State& state, - models::ExecutionPlanFactory model_factory, - xnn_f16_gemm_minmax_ukernel_fn gemm_minmax, - xnn_f16_igemm_minmax_ukernel_fn igemm_minmax, - xnn_f16_gemm_minmax_ukernel_fn gemm1_minmax, - xnn_f16_igemm_minmax_ukernel_fn igemm1_minmax, - xnn_init_f16_minmax_params_fn init_params, - uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if (isa_check != nullptr && !isa_check(state)) { - return; - } - if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) { - state.SkipWithError("failed to initialize XNNPACK"); - return; - } - - struct xnn_gemm_config* gemm_config = xnn_init_f16_gemm_config(); - if (gemm_config == nullptr) { - state.SkipWithError("hardware does not support F16 gemm"); - return; - } - - // Override microkernels chosen in xnn_initialize - 
std::memset(gemm_config, 0, sizeof(struct xnn_gemm_config)); - gemm_config->minmax.gemm[mr-1] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_fn(gemm_minmax)); - gemm_config->minmax.igemm[mr-1] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_fn(igemm_minmax)); - gemm_config->minmax.gemm[0] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_fn(gemm1_minmax)); - gemm_config->minmax.igemm[0] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_fn(igemm1_minmax)); - gemm_config->init.f16 = init_params; - gemm_config->mr = mr; - gemm_config->nr = nr; - gemm_config->log2_kr = log2_kr; - gemm_config->log2_sr = log2_sr; - gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f16_gemm_goi_w; - - auto execution_plan = model_factory(nullptr); - if (execution_plan.empty()) { - state.SkipWithError("failed to create a model"); - return; - } - - for (auto _ : state) { - for (const std::unique_ptr& op : execution_plan) { - xnn_status status = xnn_run_operator(op.get(), nullptr); - if (status != xnn_status_success) { - state.SkipWithError("failed to run a model"); - return; - } - } - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } -} - -#if XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64 & XNN_ENABLE_ASSEMBLY - static void f16_gemm_4x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_4x8__asm_aarch64_neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, - xnn_f16_gemm_minmax_ukernel_1x8__asm_aarch64_neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, - xnn_init_f16_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEONFP16ARITH); - } - - static void f16_gemm_6x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - 
GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_6x8__asm_aarch64_neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, - xnn_f16_gemm_minmax_ukernel_1x8__asm_aarch64_neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, - xnn_init_f16_minmax_scalar_params, - /*mr=*/6, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEONFP16ARITH); - } - - static void f16_gemm_8x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_8x8__asm_aarch64_neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, - xnn_f16_gemm_minmax_ukernel_1x8__asm_aarch64_neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, - xnn_init_f16_minmax_scalar_params, - /*mr=*/8, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEONFP16ARITH); - } - - static void f16_gemm_4x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld32, - xnn_f16_igemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld32, - xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32, - xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32, - xnn_init_f16_minmax_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEONFP16ARITH); - } - - static void f16_gemm_4x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64, - xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64, - xnn_init_f16_minmax_scalar_params, 
- /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEONFP16ARITH); - } - - static void f16_gemm_6x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld32, - xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld32, - xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32, - xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32, - xnn_init_f16_minmax_scalar_params, - /*mr=*/6, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEONFP16ARITH); - } - - static void f16_gemm_6x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64, - xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64, - xnn_init_f16_minmax_scalar_params, - /*mr=*/6, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEONFP16ARITH); - } - - static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55, - xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55, - xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64, - xnn_init_f16_minmax_scalar_params, - /*mr=*/6, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEONFP16ARITH); - } - - static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55r0(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, 
- xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0, - xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0, - xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64, - xnn_init_f16_minmax_scalar_params, - /*mr=*/6, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEONFP16ARITH); - } - - static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75, - xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75, - xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64, - xnn_init_f16_minmax_scalar_params, - /*mr=*/6, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEONFP16ARITH); - } - - BENCHMARK_FP16_END2END(f16_gemm_4x8__asm_aarch64_neonfp16arith_ld64); - BENCHMARK_FP16_END2END(f16_gemm_6x8__asm_aarch64_neonfp16arith_ld64); - BENCHMARK_FP16_END2END(f16_gemm_8x8__asm_aarch64_neonfp16arith_ld64); - BENCHMARK_FP16_END2END(f16_gemm_4x16__asm_aarch64_neonfp16arith_ld32); - BENCHMARK_FP16_END2END(f16_gemm_4x16__asm_aarch64_neonfp16arith_ld64); - BENCHMARK_FP16_END2END(f16_gemm_6x16__asm_aarch64_neonfp16arith_ld32); - BENCHMARK_FP16_END2END(f16_gemm_6x16__asm_aarch64_neonfp16arith_ld64); - BENCHMARK_FP16_END2END(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55); - BENCHMARK_FP16_END2END(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55r0); - BENCHMARK_FP16_END2END(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a75); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64 & XNN_ENABLE_ASSEMBLY - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - static void f16_gemm_4x8__neonfp16arith_ld64(benchmark::State& state, 
models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, - xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, - xnn_init_f16_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEONFP16ARITH); - } - - static void f16_gemm_6x8__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, - xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, - xnn_init_f16_minmax_scalar_params, - /*mr=*/6, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEONFP16ARITH); - } - - static void f16_gemm_8x8__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, - xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, - xnn_init_f16_minmax_scalar_params, - /*mr=*/8, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEONFP16ARITH); - } - - static void f16_gemm_4x16__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, - xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, - xnn_init_f16_minmax_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEONFP16ARITH); - } - - static void 
f16_gemm_6x16__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, - xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, - xnn_init_f16_minmax_scalar_params, - /*mr=*/6, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEONFP16ARITH); - } - - static void f16_gemm_8x16__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, - xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, - xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, - xnn_init_f16_minmax_scalar_params, - /*mr=*/8, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEONFP16ARITH); - } - - BENCHMARK_FP16_END2END(f16_gemm_4x8__neonfp16arith_ld64); - BENCHMARK_FP16_END2END(f16_gemm_6x8__neonfp16arith_ld64); - BENCHMARK_FP16_END2END(f16_gemm_8x8__neonfp16arith_ld64); - BENCHMARK_FP16_END2END(f16_gemm_4x16__neonfp16arith_ld64); - BENCHMARK_FP16_END2END(f16_gemm_6x16__neonfp16arith_ld64); - BENCHMARK_FP16_END2END(f16_gemm_8x16__neonfp16arith_ld64); -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - static void f16_gemm_4x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, - xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, - xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, - xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, - xnn_init_f16_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX2); - } - static void 
f16_gemm_5x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, - xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, - xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, - xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, - xnn_init_f16_minmax_scalar_params, - /*mr=*/5, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX2); - } - static void f16_gemm_6x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, - xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, - xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, - xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, - xnn_init_f16_minmax_scalar_params, - /*mr=*/6, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX2); - } - static void f16_gemm_7x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, - xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, - xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, - xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, - xnn_init_f16_minmax_scalar_params, - /*mr=*/7, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX2); - } - - static void f16_gemm_3x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, - xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, - xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, - xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, - xnn_init_f16_minmax_scalar_params, - /*mr=*/3, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX2); - } - static void f16_gemm_4x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - 
GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, - xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, - xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, - xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, - xnn_init_f16_minmax_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX2); - } - static void f16_gemm_5x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, - xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, - xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, - xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, - xnn_init_f16_minmax_scalar_params, - /*mr=*/5, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX2); - } - - static void f16_f32acc_gemm_4x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_f32acc_gemm_minmax_ukernel_4x8__avx2_broadcast, - xnn_f16_f32acc_igemm_minmax_ukernel_4x8__avx2_broadcast, - xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast, - xnn_f16_f32acc_igemm_minmax_ukernel_1x8__avx2_broadcast, - xnn_init_f16_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX2); - } - static void f16_f32acc_gemm_5x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_f32acc_gemm_minmax_ukernel_5x8__avx2_broadcast, - xnn_f16_f32acc_igemm_minmax_ukernel_5x8__avx2_broadcast, - xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast, - xnn_f16_f32acc_igemm_minmax_ukernel_1x8__avx2_broadcast, - xnn_init_f16_minmax_scalar_params, - /*mr=*/5, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX2); - } - static void f16_f32acc_gemm_6x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - 
GEMMEnd2EndBenchmark(state, model, - xnn_f16_f32acc_gemm_minmax_ukernel_6x8__avx2_broadcast, - xnn_f16_f32acc_igemm_minmax_ukernel_6x8__avx2_broadcast, - xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast, - xnn_f16_f32acc_igemm_minmax_ukernel_1x8__avx2_broadcast, - xnn_init_f16_minmax_scalar_params, - /*mr=*/6, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX2); - } - static void f16_f32acc_gemm_7x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_f32acc_gemm_minmax_ukernel_7x8__avx2_broadcast, - xnn_f16_f32acc_igemm_minmax_ukernel_7x8__avx2_broadcast, - xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast, - xnn_f16_f32acc_igemm_minmax_ukernel_1x8__avx2_broadcast, - xnn_init_f16_minmax_scalar_params, - /*mr=*/7, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX2); - } - - static void f16_f32acc_gemm_3x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_f32acc_gemm_minmax_ukernel_3x16__avx2_broadcast, - xnn_f16_f32acc_igemm_minmax_ukernel_3x16__avx2_broadcast, - xnn_f16_f32acc_gemm_minmax_ukernel_1x16__avx2_broadcast, - xnn_f16_f32acc_igemm_minmax_ukernel_1x16__avx2_broadcast, - xnn_init_f16_minmax_scalar_params, - /*mr=*/3, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX2); - } - static void f16_f32acc_gemm_4x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_f32acc_gemm_minmax_ukernel_4x16__avx2_broadcast, - xnn_f16_f32acc_igemm_minmax_ukernel_4x16__avx2_broadcast, - xnn_f16_f32acc_gemm_minmax_ukernel_1x16__avx2_broadcast, - xnn_f16_f32acc_igemm_minmax_ukernel_1x16__avx2_broadcast, - xnn_init_f16_minmax_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX2); - } - static void 
f16_f32acc_gemm_5x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f16_f32acc_gemm_minmax_ukernel_5x16__avx2_broadcast, - xnn_f16_f32acc_igemm_minmax_ukernel_5x16__avx2_broadcast, - xnn_f16_f32acc_gemm_minmax_ukernel_1x16__avx2_broadcast, - xnn_f16_f32acc_igemm_minmax_ukernel_1x16__avx2_broadcast, - xnn_init_f16_minmax_scalar_params, - /*mr=*/5, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX2); - } - - BENCHMARK_FP16_END2END(f16_gemm_4x8__avx2_broadcast); - BENCHMARK_FP16_END2END(f16_gemm_5x8__avx2_broadcast); - BENCHMARK_FP16_END2END(f16_gemm_6x8__avx2_broadcast); - BENCHMARK_FP16_END2END(f16_gemm_7x8__avx2_broadcast); - - BENCHMARK_FP16_END2END(f16_gemm_3x16__avx2_broadcast); - BENCHMARK_FP16_END2END(f16_gemm_4x16__avx2_broadcast); - BENCHMARK_FP16_END2END(f16_gemm_5x16__avx2_broadcast); - - BENCHMARK_FP16_END2END(f16_f32acc_gemm_4x8__avx2_broadcast); - BENCHMARK_FP16_END2END(f16_f32acc_gemm_5x8__avx2_broadcast); - BENCHMARK_FP16_END2END(f16_f32acc_gemm_6x8__avx2_broadcast); - BENCHMARK_FP16_END2END(f16_f32acc_gemm_7x8__avx2_broadcast); - - BENCHMARK_FP16_END2END(f16_f32acc_gemm_3x16__avx2_broadcast); - BENCHMARK_FP16_END2END(f16_f32acc_gemm_4x16__avx2_broadcast); - BENCHMARK_FP16_END2END(f16_f32acc_gemm_5x16__avx2_broadcast); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/f32-dwconv-e2e.cc b/bench/f32-dwconv-e2e.cc deleted file mode 100644 index 670e5cff3ea..00000000000 --- a/bench/f32-dwconv-e2e.cc +++ /dev/null @@ -1,2148 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include -#include -#include - -#include -#include "bench/end2end.h" -#include "bench/utils.h" - -#include "xnnpack.h" -#include "xnnpack/config.h" -#include "xnnpack/dwconv.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/models.h" - - -static void DWConvEnd2EndBenchmark( - benchmark::State& state, - models::ExecutionPlanFactory model_factory, - xnn_f32_dwconv_minmax_unipass_ukernel_fn dwconv_minmax, - xnn_f32_dwconv_unipass_ukernel_fn dwconv, - xnn_init_f32_minmax_params_fn init_params, - uint8_t channel_tile, uint8_t primary_tile, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if (isa_check != nullptr && !isa_check(state)) { - return; - } - if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) { - state.SkipWithError("failed to initialize XNNPACK"); - return; - } - - struct xnn_dwconv_config* dwconv_config = xnn_init_f32_dwconv_config(); - if (dwconv_config == nullptr) { - state.SkipWithError("hardware does not support F32 DWCONV"); - return; - } - - // Save dwconv_config so that we can modify it for the benchmark and later restore it. - struct xnn_dwconv_config saved_dwconv_params[XNN_MAX_F32_DWCONV_UKERNELS]; - memcpy(saved_dwconv_params, dwconv_config, sizeof(saved_dwconv_params)); - - // Override microkernels chosen in xnn_initialize - for (size_t i = 0; i < XNN_MAX_F32_DWCONV_UKERNELS; i++) { - // Replace only the microkernel with the matching kernel size. - if (dwconv_config[i].primary_tile == primary_tile) { - std::memset(&dwconv_config[i], 0, sizeof(dwconv_config[i])); - - // Note: do not directly assign to dwconv_config[i] because it breaks older gcc. 
- dwconv_config[i].minmax.unipass = xnn_dwconv_unipass_ukernel_fn(dwconv_minmax); - dwconv_config[i].linear.unipass = xnn_dwconv_unipass_ukernel_fn(dwconv); - dwconv_config[i].channel_tile = channel_tile; - dwconv_config[i].channel_subtile = channel_tile; - dwconv_config[i].channel_round = 1; - dwconv_config[i].primary_tile = primary_tile; - dwconv_config[i].init.f32 = init_params; - break; - } - } - - auto execution_plan = model_factory(nullptr); - if (execution_plan.empty()) { - state.SkipWithError("failed to create a model"); - return; - } - - for (auto _ : state) { - for (const std::unique_ptr& op : execution_plan) { - xnn_status status = xnn_run_operator(op.get(), nullptr); - if (status != xnn_status_success) { - state.SkipWithError("failed to run a model"); - return; - } - } - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - // Restore dwconv_config to original state as defined in init.c. 
- memcpy(dwconv_config, saved_dwconv_params, sizeof(saved_dwconv_params)); -} - -static void DWConvEnd2EndBenchmark( - benchmark::State& state, - models::ExecutionPlanFactory model_factory, - xnn_f32_dwconv_minmax_multipass_ukernel_fn dwconv_minmax, - xnn_f32_dwconv_multipass_ukernel_fn dwconv, - xnn_init_f32_minmax_params_fn init_params, - uint8_t channel_tile, uint8_t channel_subtile, uint8_t channel_round, - uint8_t primary_tile, uint8_t middle_tile, uint8_t last_tile, - uint8_t primary_tile_to_replace, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if (isa_check != nullptr && !isa_check(state)) { - return; - } - if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) { - state.SkipWithError("failed to initialize XNNPACK"); - return; - } - - struct xnn_dwconv_config* dwconv_config = xnn_init_f32_dwconv_config(); - if (dwconv_config == nullptr) { - state.SkipWithError("failed to initialize f32 DWCONV config"); - return; - } - - // Save dwconv_config so that we can modify it for the benchmark and later restore it. - struct xnn_dwconv_config saved_dwconv_params[XNN_MAX_F32_DWCONV_UKERNELS]; - memcpy(saved_dwconv_params, dwconv_config, sizeof(saved_dwconv_params)); - - bool found = false; - for (size_t i = 0; i < XNN_MAX_F32_DWCONV_UKERNELS; i++) { - if (dwconv_config[i].primary_tile == primary_tile_to_replace) { - found = true; - } else if (dwconv_config[i].last_tile != 0) { - // Found a multipass microkernel, replace it. - found = true; - } - } - - if (!found) { - state.SkipWithError("can't replace with multipass"); - return; - } - - // Override microkernels chosen in xnn_initialize - for (size_t i = 0; i < XNN_MAX_F32_DWCONV_UKERNELS; i++) { - // Replace only the microkernel with the matching kernel size. - if (dwconv_config[i].primary_tile == primary_tile_to_replace || - dwconv_config[i].last_tile != 0) { - // Replace either when the primary_tile_to_replace matches, or replace the - // first multipass dwconv microkernel we find. 
- // TODO(zhin): support specifying target multipass dwconv to replace. - std::memset(&dwconv_config[i], 0, sizeof(dwconv_config[i])); - - // Note: do not directly assign to dwconv_config[i] because it breaks older gcc. - dwconv_config[i].minmax.multipass = xnn_dwconv_multipass_ukernel_fn(dwconv_minmax); - dwconv_config[i].linear.multipass = xnn_dwconv_multipass_ukernel_fn(dwconv); - dwconv_config[i].channel_tile = channel_tile; - dwconv_config[i].channel_subtile = channel_subtile; - dwconv_config[i].channel_round = channel_round; - dwconv_config[i].primary_tile = primary_tile; - dwconv_config[i].middle_tile = middle_tile; - dwconv_config[i].last_tile = last_tile; - dwconv_config[i].init.f32 = init_params; - break; - } - } - - auto execution_plan = model_factory(nullptr); - if (execution_plan.empty()) { - state.SkipWithError("failed to create a model"); - return; - } - - for (auto _ : state) { - for (const std::unique_ptr& op : execution_plan) { - xnn_status status = xnn_run_operator(op.get(), nullptr); - if (status != xnn_status_success) { - state.SkipWithError("failed to run a model"); - return; - } - } - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - // Restore dwconv_config to original state as defined in init.c. 
- memcpy(dwconv_config, saved_dwconv_params, sizeof(saved_dwconv_params)); -} - -#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY - static void f32_dwconv_9p4c__asm_aarch64_neonfma(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p4c__asm_aarch64_neonfma, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 9 /* primary tile */); - } - - static void f32_dwconv_9p4c__asm_aarch64_neonfma_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p4c__asm_aarch64_neonfma_cortex_a55, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 9 /* primary tile */); - } - - BENCHMARK_FP32_END2END(f32_dwconv_9p4c__asm_aarch64_neonfma); - BENCHMARK_FP32_END2END(f32_dwconv_9p4c__asm_aarch64_neonfma_cortex_a55); -#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - static void f32_dwconv_9p4c__neon(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p4c__neon, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON); - } - - static void f32_dwconv_9p4c__neon_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p4c__neon_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON); - } - - static void f32_dwconv_9p8c__neon(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p8c__neon, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, 
benchmark::utils::CheckNEON); - } - - static void f32_dwconv_9p8c__neon_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p8c__neon_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON); - } - - static void f32_dwconv_9p16c__neon(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p16c__neon, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON); - } - - static void f32_dwconv_9p16c__neon_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p16c__neon_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON); - } - - static void f32_dwconv_9p4c__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p4c__neonfma, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEONFMA); - } - - static void f32_dwconv_9p4c__neonfma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p4c__neonfma_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEONFMA); - } - - static void f32_dwconv_9p8c__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p8c__neonfma, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 9 /* 
primary tile */, benchmark::utils::CheckNEONFMA); - } - - static void f32_dwconv_9p8c__neonfma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p8c__neonfma_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEONFMA); - } - - static void f32_dwconv_9p16c__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p16c__neonfma, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEONFMA); - } - - static void f32_dwconv_9p16c__neonfma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p16c__neonfma_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEONFMA); - } - - static void f32_dwconv_25p8c__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p8c__neonfma, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEONFMA); - } - - static void f32_dwconv_25p8c__neonfma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p8c__neonfma_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEONFMA); - } - - static void f32_dwconv_5f5m5l4c4s4r__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__neonfma, - nullptr /* dwconv */, - 
xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEONFMA); - } - static void f32_dwconv_5f5m5l4c4s4r__neonfma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__neonfma_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEONFMA); - } - static void f32_dwconv_5f5m5l8c4s4r__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neonfma, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEONFMA); - } - static void f32_dwconv_5f5m5l8c4s4r__neonfma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neonfma_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEONFMA); - } - - static void f32_dwconv_6f6m7l4c4s4r__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__neonfma, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, 
/*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEONFMA); - } - static void f32_dwconv_6f6m7l4c4s4r__neonfma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__neonfma_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEONFMA); - } - static void f32_dwconv_6f6m7l8c4s4r__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__neonfma, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEONFMA); - } - static void f32_dwconv_6f6m7l8c4s4r__neonfma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__neonfma_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEONFMA); - } - - static void f32_dwconv_8f8m9l4c4s4r__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neonfma, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEONFMA); - } - static void 
f32_dwconv_8f8m9l4c4s4r__neonfma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neonfma_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEONFMA); - } - static void f32_dwconv_8f8m9l8c4s4r__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__neonfma, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEONFMA); - } - static void f32_dwconv_8f8m9l8c4s4r__neonfma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__neonfma_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEONFMA); - } - - static void f32_dwconv_25p8c__neon(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p8c__neon, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON); - } - - static void f32_dwconv_25p8c__neon_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p8c__neon_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile 
*/, 25 /* primary tile */, benchmark::utils::CheckNEON); - } - - static void f32_dwconv_5f5m5l4c4s4r__neon(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__neon, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEON); - } - static void f32_dwconv_5f5m5l4c4s4r__neon_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__neon_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEON); - } - static void f32_dwconv_5f5m5l8c4s4r__neon(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neon, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEON); - } - static void f32_dwconv_5f5m5l8c4s4r__neon_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neon_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEON); - } - - static void f32_dwconv_6f6m7l4c4s4r__neon(benchmark::State& state, models::ExecutionPlanFactory 
model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__neon, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEON); - } - static void f32_dwconv_6f6m7l4c4s4r__neon_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__neon_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEON); - } - static void f32_dwconv_6f6m7l8c4s4r__neon(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__neon, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEON); - } - static void f32_dwconv_6f6m7l8c4s4r__neon_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__neon_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEON); - } - - static void f32_dwconv_8f8m9l4c4s4r__neon(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neon, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 
/*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/9, /*last_tile=*/8, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEON); - } - static void f32_dwconv_8f8m9l4c4s4r__neon_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neon_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/9, /*last_tile=*/8, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEON); - } - static void f32_dwconv_8f8m9l8c4s4r__neon(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__neon, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/9, /*last_tile=*/8, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEON); - } - static void f32_dwconv_8f8m9l8c4s4r__neon_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__neon_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/9, /*last_tile=*/8, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckNEON); - } - - BENCHMARK_FP32_END2END(f32_dwconv_9p4c__neonfma); - BENCHMARK_FP32_END2END(f32_dwconv_9p4c__neonfma_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_9p8c__neonfma); - BENCHMARK_FP32_END2END(f32_dwconv_9p8c__neonfma_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_9p16c__neonfma); - BENCHMARK_FP32_END2END(f32_dwconv_9p16c__neonfma_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_25p8c__neonfma); - 
BENCHMARK_FP32_END2END(f32_dwconv_25p8c__neonfma_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l4c4s4r__neonfma); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l4c4s4r__neonfma_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l8c4s4r__neonfma); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l8c4s4r__neonfma_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l4c4s4r__neonfma); - BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l4c4s4r__neonfma_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l8c4s4r__neonfma); - BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l8c4s4r__neonfma_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l4c4s4r__neonfma); - BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l4c4s4r__neonfma_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l8c4s4r__neonfma); - BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l8c4s4r__neonfma_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_9p4c__neon); - BENCHMARK_FP32_END2END(f32_dwconv_9p4c__neon_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_9p8c__neon); - BENCHMARK_FP32_END2END(f32_dwconv_9p8c__neon_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_9p16c__neon); - BENCHMARK_FP32_END2END(f32_dwconv_9p16c__neon_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_25p8c__neon); - BENCHMARK_FP32_END2END(f32_dwconv_25p8c__neon_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l4c4s4r__neon); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l4c4s4r__neon_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l8c4s4r__neon); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l8c4s4r__neon_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l4c4s4r__neon); - BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l4c4s4r__neon_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l8c4s4r__neon); - BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l8c4s4r__neon_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l4c4s4r__neon); - BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l4c4s4r__neon_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l8c4s4r__neon); - BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l8c4s4r__neon_acc2); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - 
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - static void f32_dwconv_9p4c__sse(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p4c__sse, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 9 /* primary tile */); - } - static void f32_dwconv_9p4c__sse_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p4c__sse_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 9 /* primary tile */); - } - static void f32_dwconv_9p8c__sse(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p8c__sse, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 9 /* primary tile */); - } - static void f32_dwconv_9p8c__sse_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p8c__sse_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 9 /* primary tile */); - } - - static void f32_dwconv_25p4c__sse(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p4c__sse, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 25 /* primary tile */); - } - static void f32_dwconv_25p4c__sse_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p4c__sse_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 25 /* primary tile */); - } - static void f32_dwconv_25p8c__sse(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - 
xnn_f32_dwconv_minmax_ukernel_25p8c__sse, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 25 /* primary tile */); - } - static void f32_dwconv_25p8c__sse_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p8c__sse_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 25 /* primary tile */); - } - - static void f32_dwconv_5f5m5l4c4s4r__sse(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__sse, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void f32_dwconv_5f5m5l4c4s4r__sse_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__sse_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void f32_dwconv_5f5m5l8c4s4r__sse(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__sse, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void f32_dwconv_5f5m5l8c4s4r__sse_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__sse_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, 
- /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void f32_dwconv_5f5m5l16c4s4r__sse(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l16c4s4r__sse, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void f32_dwconv_5f5m5l16c4s4r__sse_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l16c4s4r__sse_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - - static void f32_dwconv_6f6m7l4c4s4r__sse(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__sse, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void f32_dwconv_6f6m7l4c4s4r__sse_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l4c4s4r__sse_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void f32_dwconv_6f6m7l8c4s4r__sse(benchmark::State& state, models::ExecutionPlanFactory model) { - 
DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__sse, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void f32_dwconv_6f6m7l8c4s4r__sse_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l8c4s4r__sse_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void f32_dwconv_6f6m7l16c4s4r__sse(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l16c4s4r__sse, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void f32_dwconv_6f6m7l16c4s4r__sse_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l16c4s4r__sse_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - - static void f32_dwconv_8f8m9l4c4s4r__sse(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__sse, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - 
/*primary_tile_to_replace=*/25); - } - static void f32_dwconv_8f8m9l4c4s4r__sse_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__sse_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void f32_dwconv_8f8m9l8c4s4r__sse(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__sse, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void f32_dwconv_8f8m9l8c4s4r__sse_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l8c4s4r__sse_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void f32_dwconv_8f8m9l16c4s4r__sse(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l16c4s4r__sse, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void f32_dwconv_8f8m9l16c4s4r__sse_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l16c4s4r__sse_acc2, - nullptr /* dwconv */, - 
xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - - static void f32_dwconv_9p8c__avx(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p8c__avx, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX); - } - static void f32_dwconv_9p8c__avx_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p8c__avx_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX); - } - static void f32_dwconv_9p16c__avx(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p16c__avx, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX); - } - static void f32_dwconv_9p16c__avx_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p16c__avx_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX); - } - - static void f32_dwconv_25p8c__avx(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p8c__avx, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX); - } - static void f32_dwconv_25p8c__avx_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - 
xnn_f32_dwconv_minmax_ukernel_25p8c__avx_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX); - } - static void f32_dwconv_25p16c__avx(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p16c__avx, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX); - } - static void f32_dwconv_25p16c__avx_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p16c__avx_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX); - } - - static void f32_dwconv_5f5m5l8c8s4r__avx(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l8c8s4r__avx, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckAVX); - } - static void f32_dwconv_5f5m5l8c8s4r__avx_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l8c8s4r__avx_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckAVX); - } - static void f32_dwconv_5f5m5l16c8s4r__avx(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l16c8s4r__avx, - nullptr /* dwconv */, - 
xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckAVX); - } - static void f32_dwconv_5f5m5l16c8s4r__avx_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l16c8s4r__avx_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckAVX); - } - - static void f32_dwconv_6f6m7l8c8s4r__avx(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l8c8s4r__avx, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckAVX); - } - static void f32_dwconv_6f6m7l8c8s4r__avx_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l8c8s4r__avx_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckAVX); - } - static void f32_dwconv_6f6m7l16c8s4r__avx(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l16c8s4r__avx, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - 
/*primary_tile_to_replace=*/25, - benchmark::utils::CheckAVX); - } - static void f32_dwconv_6f6m7l16c8s4r__avx_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l16c8s4r__avx_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckAVX); - } - - static void f32_dwconv_8f8m9l8c8s4r__avx(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l8c8s4r__avx, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckAVX); - } - static void f32_dwconv_8f8m9l8c8s4r__avx_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l8c8s4r__avx_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckAVX); - } - static void f32_dwconv_8f8m9l16c8s4r__avx(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l16c8s4r__avx, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckAVX); - } - static void f32_dwconv_8f8m9l16c8s4r__avx_acc2(benchmark::State& state, models::ExecutionPlanFactory 
model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l16c8s4r__avx_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckAVX); - } - - static void f32_dwconv_3p8c__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_3p8c__fma3, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 3 /* primary tile */, benchmark::utils::CheckFMA3); - } - static void f32_dwconv_3p8c__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_3p8c__fma3_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 3 /* primary tile */, benchmark::utils::CheckFMA3); - } - static void f32_dwconv_3p16c__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_3p16c__fma3, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 3 /* primary tile */, benchmark::utils::CheckFMA3); - } - static void f32_dwconv_3p16c__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_3p16c__fma3_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 3 /* primary tile */, benchmark::utils::CheckFMA3); - } - static void f32_dwconv_4p8c__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_4p8c__fma3, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 4 /* primary tile */, benchmark::utils::CheckFMA3); 
- } - static void f32_dwconv_4p8c__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_4p8c__fma3_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 4 /* primary tile */, benchmark::utils::CheckFMA3); - } - static void f32_dwconv_4p16c__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_4p16c__fma3, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 4 /* primary tile */, benchmark::utils::CheckFMA3); - } - static void f32_dwconv_4p16c__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_4p16c__fma3_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 4 /* primary tile */, benchmark::utils::CheckFMA3); - } - static void f32_dwconv_9p8c__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p8c__fma3, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckFMA3); - } - static void f32_dwconv_9p8c__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p8c__fma3_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckFMA3); - } - static void f32_dwconv_9p16c__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p16c__fma3, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckFMA3); - } - static void 
f32_dwconv_9p16c__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p16c__fma3_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckFMA3); - } - static void f32_dwconv_25p8c__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p8c__fma3, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckFMA3); - } - static void f32_dwconv_25p8c__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p8c__fma3_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckFMA3); - } - static void f32_dwconv_25p16c__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p16c__fma3, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckFMA3); - } - static void f32_dwconv_25p16c__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p16c__fma3_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckFMA3); - } - - static void f32_dwconv_5f5m5l8c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - 
/*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckFMA3); - } - static void f32_dwconv_5f5m5l8c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckFMA3); - } - static void f32_dwconv_5f5m5l16c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckFMA3); - } - static void f32_dwconv_5f5m5l16c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckFMA3); - } - static void f32_dwconv_5f5m5l32c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckFMA3); - } - static void 
f32_dwconv_5f5m5l32c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckFMA3); - } - static void f32_dwconv_7f6m6l8c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_7f6m6l8c8s4r__fma3, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/7, /*middle_tile=*/6, /*last_tile=*/6, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckFMA3); - } - static void f32_dwconv_7f6m6l8c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_7f6m6l8c8s4r__fma3_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/7, /*middle_tile=*/6, /*last_tile=*/6, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckFMA3); - } - static void f32_dwconv_7f6m6l16c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_7f6m6l16c8s4r__fma3, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/7, /*middle_tile=*/6, /*last_tile=*/6, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckFMA3); - } - static void f32_dwconv_7f6m6l16c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - 
xnn_f32_dwconv_minmax_ukernel_7f6m6l16c8s4r__fma3_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/7, /*middle_tile=*/6, /*last_tile=*/6, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckFMA3); - } - static void f32_dwconv_7f6m6l32c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_7f6m6l32c8s4r__fma3, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/7, /*middle_tile=*/6, /*last_tile=*/6, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckFMA3); - } - static void f32_dwconv_7f6m6l32c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_7f6m6l32c8s4r__fma3_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4, - /*primary_tile=*/7, /*middle_tile=*/6, /*last_tile=*/6, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckFMA3); - } - - static void f32_dwconv_9p16c__avx512f(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p16c__avx512f, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_9p16c__avx512f_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p16c__avx512f_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_9p32c__avx512f(benchmark::State& state, 
models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p32c__avx512f, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_9p32c__avx512f_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p32c__avx512f_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX512F); - } - - static void f32_dwconv_25p16c__avx512f(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_25p16c__avx512f_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_25p32c__avx512f(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p32c__avx512f, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 32 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_25p32c__avx512f_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p32c__avx512f_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - 32 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX512F); - } - static 
void f32_dwconv_5f5m5l16c16s1r__avx512f(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/16, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_5f5m5l16c16s1r__avx512f_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/16, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_5f5m5l32c16s1r__avx512f(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/16, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_5f5m5l32c16s1r__avx512f_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f_acc2, - nullptr /* dwconv */, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/16, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, - benchmark::utils::CheckAVX512F); - } - - BENCHMARK_FP32_END2END(f32_dwconv_9p16c__avx512f); - BENCHMARK_FP32_END2END(f32_dwconv_9p16c__avx512f_acc2); - 
BENCHMARK_FP32_END2END(f32_dwconv_9p32c__avx512f); - BENCHMARK_FP32_END2END(f32_dwconv_9p32c__avx512f_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_25p16c__avx512f); - BENCHMARK_FP32_END2END(f32_dwconv_25p16c__avx512f_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_25p32c__avx512f); - BENCHMARK_FP32_END2END(f32_dwconv_25p32c__avx512f_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l16c16s1r__avx512f); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l16c16s1r__avx512f_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l32c16s1r__avx512f); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l32c16s1r__avx512f_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_3p8c__fma3); - BENCHMARK_FP32_END2END(f32_dwconv_3p8c__fma3_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_3p16c__fma3); - BENCHMARK_FP32_END2END(f32_dwconv_3p16c__fma3_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_4p8c__fma3); - BENCHMARK_FP32_END2END(f32_dwconv_4p8c__fma3_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_4p16c__fma3); - BENCHMARK_FP32_END2END(f32_dwconv_4p16c__fma3_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_9p8c__fma3); - BENCHMARK_FP32_END2END(f32_dwconv_9p8c__fma3_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_9p16c__fma3); - BENCHMARK_FP32_END2END(f32_dwconv_9p16c__fma3_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_25p8c__fma3); - BENCHMARK_FP32_END2END(f32_dwconv_25p8c__fma3_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_25p16c__fma3); - BENCHMARK_FP32_END2END(f32_dwconv_25p16c__fma3_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l8c8s4r__fma3); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l8c8s4r__fma3_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l16c8s4r__fma3); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l16c8s4r__fma3_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l32c8s4r__fma3); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l32c8s4r__fma3_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_7f6m6l8c8s4r__fma3); - BENCHMARK_FP32_END2END(f32_dwconv_7f6m6l8c8s4r__fma3_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_7f6m6l16c8s4r__fma3); - 
BENCHMARK_FP32_END2END(f32_dwconv_7f6m6l16c8s4r__fma3_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_7f6m6l32c8s4r__fma3); - BENCHMARK_FP32_END2END(f32_dwconv_7f6m6l32c8s4r__fma3_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_9p8c__avx); - BENCHMARK_FP32_END2END(f32_dwconv_9p8c__avx_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_9p16c__avx); - BENCHMARK_FP32_END2END(f32_dwconv_9p16c__avx_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_25p8c__avx); - BENCHMARK_FP32_END2END(f32_dwconv_25p8c__avx_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_25p16c__avx); - BENCHMARK_FP32_END2END(f32_dwconv_25p16c__avx_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l8c8s4r__avx); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l8c8s4r__avx_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l16c8s4r__avx); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l16c8s4r__avx_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l8c8s4r__avx); - BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l8c8s4r__avx_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l16c8s4r__avx); - BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l16c8s4r__avx_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l8c8s4r__avx); - BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l8c8s4r__avx_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l16c8s4r__avx); - BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l16c8s4r__avx_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_9p4c__sse); - BENCHMARK_FP32_END2END(f32_dwconv_9p4c__sse_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_9p8c__sse); - BENCHMARK_FP32_END2END(f32_dwconv_9p8c__sse_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_25p4c__sse); - BENCHMARK_FP32_END2END(f32_dwconv_25p4c__sse_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_25p8c__sse); - BENCHMARK_FP32_END2END(f32_dwconv_25p8c__sse_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l4c4s4r__sse); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l4c4s4r__sse_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l8c4s4r__sse); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l8c4s4r__sse_acc2); - 
BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l16c4s4r__sse); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l16c4s4r__sse_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l4c4s4r__sse); - BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l4c4s4r__sse_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l8c4s4r__sse); - BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l8c4s4r__sse_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l16c4s4r__sse); - BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l16c4s4r__sse_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l4c4s4r__sse); - BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l4c4s4r__sse_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l8c4s4r__sse); - BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l8c4s4r__sse_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l16c4s4r__sse); - BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l16c4s4r__sse_acc2); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#if XNN_ARCH_WASM - static void f32_dwconv_9p1c__wasm(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p1c__wasm, - xnn_f32_dwconv_ukernel_9p1c__scalar, - xnn_init_f32_minmax_scalar_params, - 1 /* channel tile */, 9 /* primary tile */); - } - static void f32_dwconv_9p1c__wasm_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p1c__wasm_acc2, - xnn_f32_dwconv_ukernel_9p1c__scalar_acc2, - xnn_init_f32_minmax_scalar_params, - 1 /* channel tile */, 9 /* primary tile */); - } - static void f32_dwconv_25p1c__wasm(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p1c__wasm, - xnn_f32_dwconv_ukernel_25p1c__scalar, - xnn_init_f32_minmax_scalar_params, - 1 /* channel tile */, 25 /* primary tile */); - } - static void f32_dwconv_25p1c__wasm_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - 
xnn_f32_dwconv_minmax_ukernel_25p1c__wasm_acc2, - xnn_f32_dwconv_ukernel_25p1c__scalar_acc2, - xnn_init_f32_minmax_scalar_params, - 1 /* channel tile */, 25 /* primary tile */); - } - - static void f32_dwconv_3f3m3l1c1s1r__wasm(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_3f3m3l1c1s1r__wasm, - xnn_f32_dwconv_ukernel_3f3m3l1c1s1r__scalar, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/3, /*middle_tile=*/3, /*last_tile=*/3, - /*primary_tile_to_replace=*/9); - } -static void f32_dwconv_3f3m3l1c1s1r__wasm_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_3f3m3l1c1s1r__wasm_acc2, - xnn_f32_dwconv_ukernel_3f3m3l1c1s1r__scalar_acc2, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/3, /*middle_tile=*/3, /*last_tile=*/3, - /*primary_tile_to_replace=*/9); - } - - static void f32_dwconv_5f5m5l1c1s1r__wasm(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__wasm, - xnn_f32_dwconv_ukernel_5f5m5l1c1s1r__scalar, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void f32_dwconv_5f5m5l1c1s1r__wasm_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__wasm_acc2, - xnn_f32_dwconv_ukernel_5f5m5l1c1s1r__scalar_acc2, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - 
static void f32_dwconv_6f6m7l1c1s1r__wasm(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l1c1s1r__wasm, - xnn_f32_dwconv_ukernel_6f6m7l1c1s1r__scalar, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void f32_dwconv_6f6m7l1c1s1r__wasm_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l1c1s1r__wasm_acc2, - xnn_f32_dwconv_ukernel_6f6m7l1c1s1r__scalar_acc2, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void f32_dwconv_8f8m9l1c1s1r__wasm(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l1c1s1r__wasm, - xnn_f32_dwconv_ukernel_8f8m9l1c1s1r__scalar, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void f32_dwconv_8f8m9l1c1s1r__wasm_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l1c1s1r__wasm_acc2, - xnn_f32_dwconv_ukernel_8f8m9l1c1s1r__scalar_acc2, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - - BENCHMARK_FP32_END2END(f32_dwconv_9p1c__wasm); - BENCHMARK_FP32_END2END(f32_dwconv_9p1c__wasm_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_25p1c__wasm); - 
BENCHMARK_FP32_END2END(f32_dwconv_25p1c__wasm_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_3f3m3l1c1s1r__wasm); - BENCHMARK_FP32_END2END(f32_dwconv_3f3m3l1c1s1r__wasm_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l1c1s1r__wasm); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l1c1s1r__wasm_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l1c1s1r__wasm); - BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l1c1s1r__wasm_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l1c1s1r__wasm); - BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l1c1s1r__wasm_acc2); -#endif // XNN_ARCH_WASM - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - static void f32_dwconv_9p4c__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_arm, - xnn_f32_dwconv_ukernel_9p4c__wasmsimd, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 9 /* primary tile */); - } - - static void f32_dwconv_9p4c__wasmsimd_arm_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_arm_acc2, - xnn_f32_dwconv_ukernel_9p4c__wasmsimd_acc2, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 9 /* primary tile */); - } - - static void f32_dwconv_9p8c__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_arm, - xnn_f32_dwconv_ukernel_9p8c__wasmsimd, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 9 /* primary tile */); - } - - static void f32_dwconv_9p8c__wasmsimd_arm_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_arm_acc2, - xnn_f32_dwconv_ukernel_9p8c__wasmsimd_acc2, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 9 /* primary tile */); - } - - static void 
f32_dwconv_9p4c__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_x86, - xnn_f32_dwconv_ukernel_9p4c__wasmsimd, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 9 /* primary tile */); - } - - static void f32_dwconv_9p4c__wasmsimd_x86_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_x86_acc2, - xnn_f32_dwconv_ukernel_9p4c__wasmsimd_acc2, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 9 /* primary tile */); - } - - static void f32_dwconv_9p8c__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_x86, - xnn_f32_dwconv_ukernel_9p8c__wasmsimd, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 9 /* primary tile */); - } - - static void f32_dwconv_9p8c__wasmsimd_x86_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_x86_acc2, - xnn_f32_dwconv_ukernel_9p8c__wasmsimd_acc2, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 9 /* primary tile */); - } - - static void f32_dwconv_25p4c__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_arm, - xnn_f32_dwconv_ukernel_25p4c__wasmsimd, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 25 /* primary tile */); - } - - static void f32_dwconv_25p4c__wasmsimd_arm_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_arm_acc2, - xnn_f32_dwconv_ukernel_25p4c__wasmsimd, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 25 /* primary tile 
*/); - } - - static void f32_dwconv_25p8c__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p8c__wasmsimd_arm, - xnn_f32_dwconv_ukernel_25p8c__wasmsimd, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 25 /* primary tile */); - } - - static void f32_dwconv_25p8c__wasmsimd_arm_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p8c__wasmsimd_arm_acc2, - xnn_f32_dwconv_ukernel_25p8c__wasmsimd, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 25 /* primary tile */); - } - - static void f32_dwconv_25p4c__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_x86, - xnn_f32_dwconv_ukernel_25p4c__wasmsimd, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 25 /* primary tile */); - } - - static void f32_dwconv_25p4c__wasmsimd_x86_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_x86_acc2, - xnn_f32_dwconv_ukernel_25p4c__wasmsimd, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 25 /* primary tile */); - } - - static void f32_dwconv_25p8c__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p8c__wasmsimd_x86, - xnn_f32_dwconv_ukernel_25p8c__wasmsimd, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 25 /* primary tile */); - } - - static void f32_dwconv_25p8c__wasmsimd_x86_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p8c__wasmsimd_x86_acc2, - xnn_f32_dwconv_ukernel_25p8c__wasmsimd, - xnn_init_f32_minmax_scalar_params, - 8 /* channel 
tile */, 25 /* primary tile */); - } - - static void f32_dwconv_3f3m3l4c4s4r__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_3f3m3l4c4s4r__wasmsimd_arm, - xnn_f32_dwconv_ukernel_3f3m3l4c4s4r__wasmsimd, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/3, /*middle_tile=*/3, /*last_tile=*/3, - /*primary_tile_to_replace=*/9); - } - static void f32_dwconv_3f3m3l4c4s4r__wasmsimd_arm_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_3f3m3l4c4s4r__wasmsimd_arm_acc2, - xnn_f32_dwconv_ukernel_3f3m3l4c4s4r__wasmsimd_acc2, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/3, /*middle_tile=*/3, /*last_tile=*/3, - /*primary_tile_to_replace=*/9); - } - static void f32_dwconv_3f3m3l8c4s4r__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_3f3m3l8c4s4r__wasmsimd_arm, - xnn_f32_dwconv_ukernel_3f3m3l8c4s4r__wasmsimd, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/3, /*middle_tile=*/3, /*last_tile=*/3, - /*primary_tile_to_replace=*/9); - } - static void f32_dwconv_3f3m3l8c4s4r__wasmsimd_arm_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_3f3m3l8c4s4r__wasmsimd_arm_acc2, - xnn_f32_dwconv_ukernel_3f3m3l8c4s4r__wasmsimd_acc2, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/3, /*middle_tile=*/3, /*last_tile=*/3, - /*primary_tile_to_replace=*/9); - } - - static void f32_dwconv_3f3m3l4c4s4r__wasmsimd_x86(benchmark::State& state, 
models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_3f3m3l4c4s4r__wasmsimd_x86, - xnn_f32_dwconv_ukernel_3f3m3l4c4s4r__wasmsimd, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/3, /*middle_tile=*/3, /*last_tile=*/3, - /*primary_tile_to_replace=*/9); - } - static void f32_dwconv_3f3m3l4c4s4r__wasmsimd_x86_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_3f3m3l4c4s4r__wasmsimd_x86_acc2, - xnn_f32_dwconv_ukernel_3f3m3l4c4s4r__wasmsimd_acc2, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/3, /*middle_tile=*/3, /*last_tile=*/3, - /*primary_tile_to_replace=*/9); - } - static void f32_dwconv_3f3m3l8c4s4r__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_3f3m3l8c4s4r__wasmsimd_x86, - xnn_f32_dwconv_ukernel_3f3m3l8c4s4r__wasmsimd, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/3, /*middle_tile=*/3, /*last_tile=*/3, - /*primary_tile_to_replace=*/9); - } - static void f32_dwconv_3f3m3l8c4s4r__wasmsimd_x86_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_3f3m3l8c4s4r__wasmsimd_x86_acc2, - xnn_f32_dwconv_ukernel_3f3m3l8c4s4r__wasmsimd_acc2, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/3, /*middle_tile=*/3, /*last_tile=*/3, - /*primary_tile_to_replace=*/9); - } - - static void f32_dwconv_5f5m5l4c4s4r__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - 
xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_arm, - xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmsimd, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - - static void f32_dwconv_5f5m5l4c4s4r__wasmsimd_arm_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_arm_acc2, - xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmsimd_acc2, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - - static void f32_dwconv_5f5m5l4c4s4r__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_x86, - xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmsimd, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - - static void f32_dwconv_5f5m5l4c4s4r__wasmsimd_x86_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_x86_acc2, - xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmsimd_acc2, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - - BENCHMARK_FP32_END2END(f32_dwconv_9p4c__wasmsimd_arm); - BENCHMARK_FP32_END2END(f32_dwconv_9p4c__wasmsimd_arm_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_9p8c__wasmsimd_arm); - BENCHMARK_FP32_END2END(f32_dwconv_9p8c__wasmsimd_arm_acc2); - - 
BENCHMARK_FP32_END2END(f32_dwconv_9p4c__wasmsimd_x86); - BENCHMARK_FP32_END2END(f32_dwconv_9p4c__wasmsimd_x86_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_9p8c__wasmsimd_x86); - BENCHMARK_FP32_END2END(f32_dwconv_9p8c__wasmsimd_x86_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_25p4c__wasmsimd_arm); - BENCHMARK_FP32_END2END(f32_dwconv_25p4c__wasmsimd_arm_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_25p8c__wasmsimd_arm); - BENCHMARK_FP32_END2END(f32_dwconv_25p8c__wasmsimd_arm_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_25p4c__wasmsimd_x86); - BENCHMARK_FP32_END2END(f32_dwconv_25p4c__wasmsimd_x86_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_25p8c__wasmsimd_x86); - BENCHMARK_FP32_END2END(f32_dwconv_25p8c__wasmsimd_x86_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_3f3m3l4c4s4r__wasmsimd_arm); - BENCHMARK_FP32_END2END(f32_dwconv_3f3m3l4c4s4r__wasmsimd_arm); - BENCHMARK_FP32_END2END(f32_dwconv_3f3m3l8c4s4r__wasmsimd_arm_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_3f3m3l8c4s4r__wasmsimd_arm_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l4c4s4r__wasmsimd_arm); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l4c4s4r__wasmsimd_arm_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_3f3m3l4c4s4r__wasmsimd_x86); - BENCHMARK_FP32_END2END(f32_dwconv_3f3m3l4c4s4r__wasmsimd_x86_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_3f3m3l8c4s4r__wasmsimd_x86); - BENCHMARK_FP32_END2END(f32_dwconv_3f3m3l8c4s4r__wasmsimd_x86_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l4c4s4r__wasmsimd_x86); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l4c4s4r__wasmsimd_x86_acc2); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -#if XNN_ARCH_WASMRELAXEDSIMD - static void f32_dwconv_25p4c__wasmrelaxedsimd(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p4c__wasmrelaxedsimd, - xnn_f32_dwconv_ukernel_25p4c__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 25 /* primary tile */); - } - static void 
f32_dwconv_25p4c__wasmrelaxedsimd_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p4c__wasmrelaxedsimd_acc2, - xnn_f32_dwconv_ukernel_25p4c__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 25 /* primary tile */); - } - static void f32_dwconv_25p4c__wasmrelaxedsimd_fma(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p4c__wasmrelaxedsimd_fma, - xnn_f32_dwconv_ukernel_25p4c__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 25 /* primary tile */); - } - static void f32_dwconv_25p4c__wasmrelaxedsimd_fma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p4c__wasmrelaxedsimd_fma_acc2, - xnn_f32_dwconv_ukernel_25p4c__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params, - 4 /* channel tile */, 25 /* primary tile */); - } - - static void f32_dwconv_25p8c__wasmrelaxedsimd(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd, - xnn_f32_dwconv_ukernel_25p8c__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 25 /* primary tile */); - } - static void f32_dwconv_25p8c__wasmrelaxedsimd_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd_acc2, - xnn_f32_dwconv_ukernel_25p8c__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 25 /* primary tile */); - } - static void f32_dwconv_25p8c__wasmrelaxedsimd_fma(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd_fma, - 
xnn_f32_dwconv_ukernel_25p8c__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 25 /* primary tile */); - } - static void f32_dwconv_25p8c__wasmrelaxedsimd_fma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd_fma_acc2, - xnn_f32_dwconv_ukernel_25p8c__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params, - 8 /* channel tile */, 25 /* primary tile */); - } - - static void f32_dwconv_5f5m5l4c4s4r__wasmrelaxedsimd(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd, - /*dwconv=*/xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void f32_dwconv_5f5m5l4c4s4r__wasmrelaxedsimd_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_acc2, - /*dwconv=*/xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma_acc2, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void f32_dwconv_5f5m5l4c4s4r__wasmrelaxedsimd_fma(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma, - /*dwconv=*/xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); 
- } - static void f32_dwconv_5f5m5l4c4s4r__wasmrelaxedsimd_fma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma_acc2, - /*dwconv=*/xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma_acc2, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - - BENCHMARK_FP32_END2END(f32_dwconv_25p4c__wasmrelaxedsimd); - BENCHMARK_FP32_END2END(f32_dwconv_25p4c__wasmrelaxedsimd_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_25p8c__wasmrelaxedsimd); - BENCHMARK_FP32_END2END(f32_dwconv_25p8c__wasmrelaxedsimd_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_25p4c__wasmrelaxedsimd_fma); - BENCHMARK_FP32_END2END(f32_dwconv_25p4c__wasmrelaxedsimd_fma_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_25p8c__wasmrelaxedsimd_fma); - BENCHMARK_FP32_END2END(f32_dwconv_25p8c__wasmrelaxedsimd_fma_acc2); - - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l4c4s4r__wasmrelaxedsimd); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l4c4s4r__wasmrelaxedsimd_acc2); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l4c4s4r__wasmrelaxedsimd_fma); - BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l4c4s4r__wasmrelaxedsimd_fma_acc2); -#endif // XNN_ARCH_WASMRELAXEDSIMD - -static void f32_dwconv_9p1c__scalar(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p1c__scalar, - xnn_f32_dwconv_ukernel_9p1c__scalar, - xnn_init_f32_minmax_scalar_params, - 1 /* channel tile */, 9 /* primary tile */); -} - -static void f32_dwconv_9p1c__scalar_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p1c__scalar_acc2, - xnn_f32_dwconv_ukernel_9p1c__scalar_acc2, - xnn_init_f32_minmax_scalar_params, - 1 /* channel tile */, 9 /* 
primary tile */); -} - -static void f32_dwconv_9p2c__scalar(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p2c__scalar, - xnn_f32_dwconv_ukernel_9p2c__scalar, - xnn_init_f32_minmax_scalar_params, - 2 /* channel tile */, 9 /* primary tile */); -} - -static void f32_dwconv_9p2c__scalar_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_9p2c__scalar_acc2, - xnn_f32_dwconv_ukernel_9p2c__scalar_acc2, - xnn_init_f32_minmax_scalar_params, - 2 /* channel tile */, 9 /* primary tile */); -} - -static void f32_dwconv_25p1c__scalar(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p1c__scalar, - xnn_f32_dwconv_ukernel_25p1c__scalar, - xnn_init_f32_minmax_scalar_params, - 1 /* channel tile */, 25 /* primary tile */); -} -static void f32_dwconv_25p1c__scalar_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_25p1c__scalar_acc2, - xnn_f32_dwconv_ukernel_25p1c__scalar, - xnn_init_f32_minmax_scalar_params, - 1 /* channel tile */, 25 /* primary tile */); -} - -static void f32_dwconv_2f2m2l1c1s1r__scalar(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_2f2m2l1c1s1r__scalar, - /*dwconv=*/xnn_f32_dwconv_ukernel_2f2m2l1c1s1r__scalar, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/2, /*middle_tile=*/2, /*last_tile=*/2, - /*primary_tile_to_replace=*/25); -} -static void f32_dwconv_2f2m2l1c1s1r__scalar_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_2f2m2l1c1s1r__scalar_acc2, - 
/*dwconv=*/xnn_f32_dwconv_ukernel_2f2m2l1c1s1r__scalar_acc2, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/2, /*middle_tile=*/2, /*last_tile=*/2, - /*primary_tile_to_replace=*/25); -} -static void f32_dwconv_2f2m2l4c1s1r__scalar(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_2f2m2l4c1s1r__scalar, - /*dwconv=*/xnn_f32_dwconv_ukernel_2f2m2l4c1s1r__scalar, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/2, /*middle_tile=*/2, /*last_tile=*/2, - /*primary_tile_to_replace=*/25); -} -static void f32_dwconv_2f2m2l4c1s1r__scalar_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_2f2m2l4c1s1r__scalar_acc2, - /*dwconv=*/xnn_f32_dwconv_ukernel_2f2m2l4c1s1r__scalar_acc2, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/2, /*middle_tile=*/2, /*last_tile=*/2, - /*primary_tile_to_replace=*/25); -} -static void f32_dwconv_5f5m5l1c1s1r__scalar(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__scalar, - /*dwconv=*/xnn_f32_dwconv_ukernel_5f5m5l1c1s1r__scalar, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); -} -static void f32_dwconv_5f5m5l1c1s1r__scalar_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__scalar_acc2, - /*dwconv=*/xnn_f32_dwconv_ukernel_5f5m5l1c1s1r__scalar_acc2, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/1, 
/*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); -} - -static void f32_dwconv_6f6m7l1c1s1r__scalar(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l1c1s1r__scalar, - /*dwconv=*/xnn_f32_dwconv_ukernel_6f6m7l1c1s1r__scalar, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); -} -static void f32_dwconv_6f6m7l1c1s1r__scalar_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_6f6m7l1c1s1r__scalar_acc2, - /*dwconv=*/xnn_f32_dwconv_ukernel_6f6m7l1c1s1r__scalar_acc2, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); -} - -static void f32_dwconv_8f8m9l1c1s1r__scalar(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l1c1s1r__scalar, - /*dwconv=*/xnn_f32_dwconv_ukernel_8f8m9l1c1s1r__scalar, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); -} -static void f32_dwconv_8f8m9l1c1s1r__scalar_acc2(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_f32_dwconv_minmax_ukernel_8f8m9l1c1s1r__scalar_acc2, - /*dwconv=*/xnn_f32_dwconv_ukernel_8f8m9l1c1s1r__scalar_acc2, - xnn_init_f32_minmax_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - 
/*primary_tile_to_replace=*/25); -} - -BENCHMARK_FP32_END2END(f32_dwconv_9p1c__scalar); -BENCHMARK_FP32_END2END(f32_dwconv_9p1c__scalar_acc2); -BENCHMARK_FP32_END2END(f32_dwconv_9p2c__scalar); -BENCHMARK_FP32_END2END(f32_dwconv_9p2c__scalar_acc2); -BENCHMARK_FP32_END2END(f32_dwconv_25p1c__scalar); -BENCHMARK_FP32_END2END(f32_dwconv_25p1c__scalar_acc2); - -BENCHMARK_FP32_END2END(f32_dwconv_2f2m2l1c1s1r__scalar); -BENCHMARK_FP32_END2END(f32_dwconv_2f2m2l1c1s1r__scalar_acc2); -BENCHMARK_FP32_END2END(f32_dwconv_2f2m2l4c1s1r__scalar); -BENCHMARK_FP32_END2END(f32_dwconv_2f2m2l4c1s1r__scalar_acc2); -BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l1c1s1r__scalar); -BENCHMARK_FP32_END2END(f32_dwconv_5f5m5l1c1s1r__scalar_acc2); -BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l1c1s1r__scalar); -BENCHMARK_FP32_END2END(f32_dwconv_6f6m7l1c1s1r__scalar_acc2); -BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l1c1s1r__scalar); -BENCHMARK_FP32_END2END(f32_dwconv_8f8m9l1c1s1r__scalar_acc2); - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/f32-gemm-e2e.cc b/bench/f32-gemm-e2e.cc deleted file mode 100644 index 80c19c8606f..00000000000 --- a/bench/f32-gemm-e2e.cc +++ /dev/null @@ -1,2338 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include -#include -#include - -#include "bench/end2end.h" -#include "bench/utils.h" -#include - -#include "xnnpack.h" -#include "xnnpack/config.h" -#include "xnnpack/gemm.h" -#include "xnnpack/igemm.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/models.h" -#include "xnnpack/pack.h" - - -static void GEMMEnd2EndBenchmark( - benchmark::State& state, - models::ExecutionPlanFactory model_factory, - xnn_f32_gemm_minmax_ukernel_fn gemm_minmax, - xnn_f32_igemm_minmax_ukernel_fn igemm_minmax, - xnn_f32_gemm_minmax_ukernel_fn gemm1_minmax, - xnn_f32_igemm_minmax_ukernel_fn igemm1_minmax, - xnn_f32_gemm_relu_ukernel_fn gemm_relu, - xnn_f32_igemm_relu_ukernel_fn igemm_relu, - xnn_f32_gemm_relu_ukernel_fn gemm1_relu, - xnn_f32_igemm_relu_ukernel_fn igemm1_relu, - xnn_f32_gemm_ukernel_fn gemm, - xnn_f32_igemm_ukernel_fn igemm, - xnn_f32_gemm_ukernel_fn gemm1, - xnn_f32_igemm_ukernel_fn igemm1, - xnn_init_f32_minmax_params_fn init_params, - uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if (isa_check != nullptr && !isa_check(state)) { - return; - } - if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) { - state.SkipWithError("failed to initialize XNNPACK"); - return; - } - - struct xnn_gemm_config* gemm_config = xnn_init_f32_gemm_config(); - if (gemm_config == nullptr) { - state.SkipWithError("hardware does not support F32 gemm"); - return; - } - - struct xnn_gemm_config* gemm_nr2_config = xnn_init_f32_gemm_nr2_config(); - if (gemm_nr2_config == nullptr) { - state.SkipWithError("hardware does not support F32 gemm"); - return; - } - - // Override microkernels chosen in xnn_initialize - std::memset(gemm_config, 0, sizeof(struct xnn_gemm_config)); - std::memset(gemm_nr2_config, 0, sizeof(struct xnn_gemm_config)); - gemm_config->minmax.gemm[mr-1] = 
xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_fn(gemm_minmax)); - gemm_config->minmax.igemm[mr-1] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_fn(igemm_minmax)); - gemm_config->minmax.gemm[0] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_fn(gemm1_minmax)); - gemm_config->minmax.igemm[0] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_fn(igemm1_minmax)); - gemm_config->relu.gemm[mr-1] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_fn(gemm_relu)); - gemm_config->relu.igemm[mr-1] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_fn(igemm_relu)); - gemm_config->relu.gemm[0] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_fn(gemm1_relu)); - gemm_config->relu.igemm[0] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_fn(igemm1_relu)); - gemm_config->linear.gemm[mr-1] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_fn(gemm)); - gemm_config->linear.igemm[mr-1] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_fn(igemm)); - gemm_config->linear.gemm[0] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_fn(gemm1)); - gemm_config->linear.igemm[0] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_fn(igemm1)); - gemm_config->init.f32 = init_params; - gemm_config->mr = mr; - gemm_config->nr = nr; - gemm_config->log2_kr = log2_kr; - gemm_config->log2_sr = log2_sr; - gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f32_gemm_goi_w; - - auto execution_plan = model_factory(nullptr); - if (execution_plan.empty()) { - state.SkipWithError("failed to create a model"); - return; - } - - for (auto _ : state) { - for (const std::unique_ptr& op : execution_plan) { - xnn_status status = xnn_run_operator(op.get(), nullptr); - if (status != xnn_status_success) { - state.SkipWithError("failed to run a model"); - return; - } - } - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } -} - -#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY - static void 
f32_gemm_4x2__asm_aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x2__asm_aarch64_neonfma_cortex_a75, - xnn_f32_igemm_minmax_ukernel_4x2__asm_aarch64_neonfma_cortex_a75, - xnn_f32_gemm_minmax_ukernel_4x2__asm_aarch64_neonfma_cortex_a75, - xnn_f32_igemm_minmax_ukernel_4x2__asm_aarch64_neonfma_cortex_a75, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/2); - } - static void f32_gemm_4x2__asm_aarch64_neonfma_cortex_a75_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x2__asm_aarch64_neonfma_cortex_a75_prfm, - xnn_f32_igemm_minmax_ukernel_4x2__asm_aarch64_neonfma_cortex_a75_prfm, - xnn_f32_gemm_minmax_ukernel_4x2__asm_aarch64_neonfma_cortex_a75_prfm, - xnn_f32_igemm_minmax_ukernel_4x2__asm_aarch64_neonfma_cortex_a75_prfm, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/2); - } - static void f32_gemm_4x2__asm_aarch64_neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x2__asm_aarch64_neonfma_ld64, - xnn_f32_igemm_minmax_ukernel_4x2__asm_aarch64_neonfma_ld64, - xnn_f32_gemm_minmax_ukernel_4x2__asm_aarch64_neonfma_ld64, - xnn_f32_igemm_minmax_ukernel_4x2__asm_aarch64_neonfma_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - 
xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/2); - } - static void f32_gemm_4x12__asm_aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x12__asm_aarch64_neonfma_cortex_a53, - xnn_f32_igemm_minmax_ukernel_4x12__asm_aarch64_neonfma_cortex_a53, - xnn_f32_gemm_minmax_ukernel_1x12__asm_aarch64_neonfma_cortex_a53, - xnn_f32_igemm_minmax_ukernel_1x12__asm_aarch64_neonfma_cortex_a53, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/12); - } - static void f32_gemm_4x8__asm_aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53, - xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8); - } - static void f32_gemm_4x8__asm_aarch64_neonfma_cortex_a53_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53_prfm, - xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53_prfm, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53_prfm, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53_prfm, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - 
nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8); - } - static void f32_gemm_4x8__asm_aarch64_neonfma_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a55, - xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a55, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8); - } - static void f32_gemm_4x8__asm_aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a75, - xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a75, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8); - } - static void f32_gemm_4x8__asm_aarch64_neonfma_cortex_a75_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a75_prfm, - xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a75_prfm, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75_prfm, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75_prfm, - nullptr /* gemm_relu */, 
nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8); - } - static void f32_gemm_4x8__asm_aarch64_neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_ld64, - xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_ld64, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8); - } - static void f32_gemm_4x8__asm_aarch64_neonfma_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_ld128, - xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_ld128, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8); - } - static void f32_gemm_5x8__asm_aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x8__asm_aarch64_neonfma_cortex_a75, - xnn_f32_igemm_minmax_ukernel_5x8__asm_aarch64_neonfma_cortex_a75, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75, - nullptr /* gemm_relu */, 
nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/8); - } - static void f32_gemm_5x8__asm_aarch64_neonfma_cortex_a75_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x8__asm_aarch64_neonfma_cortex_a75_prfm, - xnn_f32_igemm_minmax_ukernel_5x8__asm_aarch64_neonfma_cortex_a75_prfm, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75_prfm, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75_prfm, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/8); - } - static void f32_gemm_6x8__asm_aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53, - xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8); - } - static void f32_gemm_6x8__asm_aarch64_neonfma_cortex_a53_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53_prfm, - xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53_prfm, - 
xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53_prfm, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53_prfm, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8); - } - static void f32_gemm_6x8__asm_aarch64_neonfma_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a55, - xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a55, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8); - } - static void f32_gemm_6x8__asm_aarch64_neonfma_cortex_a73(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a73, - xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a73, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75_prfm, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75_prfm, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8); - } - static void f32_gemm_6x8__asm_aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - 
xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a75, - xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a75, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8); - } - static void f32_gemm_6x8__asm_aarch64_neonfma_cortex_a75_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a75_prfm, - xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a75_prfm, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75_prfm, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75_prfm, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8); - } - static void f32_gemm_6x8__asm_aarch64_neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_ld64, - xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_ld64, - xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld64, - xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8); - } - static void f32_gemm_6x8__asm_aarch64_neonfma_ld128(benchmark::State& state, models::ExecutionPlanFactory 
model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_ld128, - xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_ld128, - xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld128, - xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld128, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8); - } - - BENCHMARK_FP32_END2END(f32_gemm_4x2__asm_aarch64_neonfma_cortex_a75) - BENCHMARK_FP32_END2END(f32_gemm_4x2__asm_aarch64_neonfma_cortex_a75_prfm) - BENCHMARK_FP32_END2END(f32_gemm_4x2__asm_aarch64_neonfma_ld64) - BENCHMARK_FP32_END2END(f32_gemm_4x8__asm_aarch64_neonfma_ld64) - BENCHMARK_FP32_END2END(f32_gemm_4x8__asm_aarch64_neonfma_ld128); - BENCHMARK_FP32_END2END(f32_gemm_6x8__asm_aarch64_neonfma_ld64); - BENCHMARK_FP32_END2END(f32_gemm_6x8__asm_aarch64_neonfma_ld128); - BENCHMARK_FP32_END2END(f32_gemm_4x8__asm_aarch64_neonfma_cortex_a53) - BENCHMARK_FP32_END2END(f32_gemm_4x8__asm_aarch64_neonfma_cortex_a53_prfm) - BENCHMARK_FP32_END2END(f32_gemm_4x8__asm_aarch64_neonfma_cortex_a55) - BENCHMARK_FP32_END2END(f32_gemm_4x8__asm_aarch64_neonfma_cortex_a75) - BENCHMARK_FP32_END2END(f32_gemm_4x8__asm_aarch64_neonfma_cortex_a75_prfm) - BENCHMARK_FP32_END2END(f32_gemm_5x8__asm_aarch64_neonfma_cortex_a75); - BENCHMARK_FP32_END2END(f32_gemm_5x8__asm_aarch64_neonfma_cortex_a75_prfm); - BENCHMARK_FP32_END2END(f32_gemm_6x8__asm_aarch64_neonfma_cortex_a53); - BENCHMARK_FP32_END2END(f32_gemm_6x8__asm_aarch64_neonfma_cortex_a53_prfm); - BENCHMARK_FP32_END2END(f32_gemm_6x8__asm_aarch64_neonfma_cortex_a55); - BENCHMARK_FP32_END2END(f32_gemm_6x8__asm_aarch64_neonfma_cortex_a73); - BENCHMARK_FP32_END2END(f32_gemm_6x8__asm_aarch64_neonfma_cortex_a75); - BENCHMARK_FP32_END2END(f32_gemm_6x8__asm_aarch64_neonfma_cortex_a75_prfm); - 
BENCHMARK_FP32_END2END(f32_gemm_4x12__asm_aarch64_neonfma_cortex_a53); -#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY - -#if XNN_ARCH_ARM64 - static void f32_gemm_2x16__aarch64_neonfma_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_2x16__aarch64_neonfma_lane_ld128, - xnn_f32_igemm_minmax_ukernel_2x16__aarch64_neonfma_lane_ld128, - xnn_f32_gemm_minmax_ukernel_1x16__aarch64_neonfma_lane_ld128, - xnn_f32_igemm_minmax_ukernel_1x16__aarch64_neonfma_lane_ld128, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/2, /*nr=*/16); - } - static void f32_gemm_3x16__aarch64_neonfma_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_3x16__aarch64_neonfma_lane_ld128, - xnn_f32_igemm_minmax_ukernel_3x16__aarch64_neonfma_lane_ld128, - xnn_f32_gemm_minmax_ukernel_1x16__aarch64_neonfma_lane_ld128, - xnn_f32_igemm_minmax_ukernel_1x16__aarch64_neonfma_lane_ld128, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/3, /*nr=*/16); - } - static void f32_gemm_4x16__aarch64_neonfma_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x16__aarch64_neonfma_lane_ld128, - xnn_f32_igemm_minmax_ukernel_4x16__aarch64_neonfma_lane_ld128, - xnn_f32_gemm_minmax_ukernel_1x16__aarch64_neonfma_lane_ld128, - xnn_f32_igemm_minmax_ukernel_1x16__aarch64_neonfma_lane_ld128, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu 
*/, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/16); - } - static void f32_gemm_5x16__aarch64_neonfma_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x16__aarch64_neonfma_lane_ld128, - xnn_f32_igemm_minmax_ukernel_5x16__aarch64_neonfma_lane_ld128, - xnn_f32_gemm_minmax_ukernel_1x16__aarch64_neonfma_lane_ld128, - xnn_f32_igemm_minmax_ukernel_1x16__aarch64_neonfma_lane_ld128, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/16); - } - static void f32_gemm_6x16__aarch64_neonfma_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x16__aarch64_neonfma_lane_ld128, - xnn_f32_igemm_minmax_ukernel_6x16__aarch64_neonfma_lane_ld128, - xnn_f32_gemm_minmax_ukernel_1x16__aarch64_neonfma_lane_ld128, - xnn_f32_igemm_minmax_ukernel_1x16__aarch64_neonfma_lane_ld128, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/16); - } - static void f32_gemm_4x2__aarch64_neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_lane_ld64, - xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_lane_ld64, - xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_lane_ld64, - xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_lane_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* 
igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/2); - } - static void f32_gemm_6x2__aarch64_neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x2__aarch64_neonfma_lane_ld64, - xnn_f32_igemm_minmax_ukernel_6x2__aarch64_neonfma_lane_ld64, - xnn_f32_gemm_minmax_ukernel_6x2__aarch64_neonfma_lane_ld64, - xnn_f32_igemm_minmax_ukernel_6x2__aarch64_neonfma_lane_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/2); - } - static void f32_gemm_4x8__aarch64_neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_lane_ld64, - xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_lane_ld64, - xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld64, - xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8); - } - static void f32_gemm_4x8__aarch64_neonfma_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_lane_ld128, - xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_lane_ld128, - xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld128, - xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld128, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu 
*/, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8); - } - static void f32_gemm_6x8__aarch64_neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld64, - xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld64, - xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld64, - xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8); - } - static void f32_gemm_6x8__aarch64_neonfma_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld128, - xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld128, - xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld128, - xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld128, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8); - } - - BENCHMARK_FP32_END2END(f32_gemm_2x16__aarch64_neonfma_lane_ld128); - BENCHMARK_FP32_END2END(f32_gemm_3x16__aarch64_neonfma_lane_ld128); - BENCHMARK_FP32_END2END(f32_gemm_4x16__aarch64_neonfma_lane_ld128); - BENCHMARK_FP32_END2END(f32_gemm_5x16__aarch64_neonfma_lane_ld128); - BENCHMARK_FP32_END2END(f32_gemm_6x16__aarch64_neonfma_lane_ld128); - - BENCHMARK_FP32_END2END(f32_gemm_4x2__aarch64_neonfma_lane_ld64); - BENCHMARK_FP32_END2END(f32_gemm_6x2__aarch64_neonfma_lane_ld64); - - 
BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_lane_ld64); - BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_lane_ld128); - - BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_lane_ld64); - BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_lane_ld128); -#endif // XNN_ARCH_ARM64 - -#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - static void f32_gemm_4x8__asm_aarch32_neon_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_ld64, - xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_ld64, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void f32_gemm_4x8__asm_aarch32_neon_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a7, - xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a7, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void f32_gemm_4x8__asm_aarch32_neon_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53, 
- xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void f32_gemm_4x8__asm_aarch32_neon_cortex_a53_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53_prfm, - xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53_prfm, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53_prfm, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53_prfm, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void f32_gemm_4x8__asm_aarch32_neon_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a55, - xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a55, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - 
static void f32_gemm_4x8__asm_aarch32_neon_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a75, - xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a75, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void f32_gemm_4x8__asm_aarch32_neon_cortex_a75_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a75_prfm, - xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a75_prfm, - xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53_prfm, - xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53_prfm, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - - BENCHMARK_FP32_END2END(f32_gemm_4x8__asm_aarch32_neon_ld64); - BENCHMARK_FP32_END2END(f32_gemm_4x8__asm_aarch32_neon_cortex_a7); - BENCHMARK_FP32_END2END(f32_gemm_4x8__asm_aarch32_neon_cortex_a53); - BENCHMARK_FP32_END2END(f32_gemm_4x8__asm_aarch32_neon_cortex_a53_prfm); - BENCHMARK_FP32_END2END(f32_gemm_4x8__asm_aarch32_neon_cortex_a55); - BENCHMARK_FP32_END2END(f32_gemm_4x8__asm_aarch32_neon_cortex_a75); - BENCHMARK_FP32_END2END(f32_gemm_4x8__asm_aarch32_neon_cortex_a75_prfm); -#endif // XNN_ARCH_ARM && 
XNN_ENABLE_ASSEMBLY - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - static void f32_gemm_2x16__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_2x16__neon_lane_ld128, - xnn_f32_igemm_minmax_ukernel_2x16__neon_lane_ld128, - xnn_f32_gemm_minmax_ukernel_1x16__neon_lane_ld128, - xnn_f32_igemm_minmax_ukernel_1x16__neon_lane_ld128, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/2, /*nr=*/16); - } - static void f32_gemm_3x16__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_3x16__neon_lane_ld128, - xnn_f32_igemm_minmax_ukernel_3x16__neon_lane_ld128, - xnn_f32_gemm_minmax_ukernel_1x16__neon_lane_ld128, - xnn_f32_igemm_minmax_ukernel_1x16__neon_lane_ld128, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/3, /*nr=*/16); - } - static void f32_gemm_4x16__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x16__neon_lane_ld128, - xnn_f32_igemm_minmax_ukernel_4x16__neon_lane_ld128, - xnn_f32_gemm_minmax_ukernel_1x16__neon_lane_ld128, - xnn_f32_igemm_minmax_ukernel_1x16__neon_lane_ld128, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/16); - } - static void f32_gemm_5x16__neon_lane_ld128(benchmark::State& state, 
models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x16__neon_lane_ld128, - xnn_f32_igemm_minmax_ukernel_5x16__neon_lane_ld128, - xnn_f32_gemm_minmax_ukernel_1x16__neon_lane_ld128, - xnn_f32_igemm_minmax_ukernel_1x16__neon_lane_ld128, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/16); - } - static void f32_gemm_6x16__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x16__neon_lane_ld128, - xnn_f32_igemm_minmax_ukernel_6x16__neon_lane_ld128, - xnn_f32_gemm_minmax_ukernel_1x16__neon_lane_ld128, - xnn_f32_igemm_minmax_ukernel_1x16__neon_lane_ld128, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/16); - } - static void f32_gemm_4x2__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64, - xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64, - xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64, - xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/2, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - - static void f32_gemm_6x2__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - 
xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, - xnn_f32_igemm_minmax_ukernel_6x2__neon_lane_ld64, - xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, - xnn_f32_igemm_minmax_ukernel_6x2__neon_lane_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/2, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - - static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, - xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld64, - xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, - xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - - static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, - xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128, - xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld128, - xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld128, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - - static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, 
model, - xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld64, - xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld64, - xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, - xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - - static void f32_gemm_6x8__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128, - xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128, - xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, - xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - - static void f32_gemm_4x8__neon_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld64, - xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld64, - xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, - xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - - static void f32_gemm_4x8__neon_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - 
GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128, - xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld128, - xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, - xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - - static void f32_gemm_6x8__neon_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, - xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld64, - xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, - xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - - static void f32_gemm_6x8__neon_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld128, - xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld128, - xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, - xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - - static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - 
GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, - xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld64, - xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64, - xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEONFMA); - } - - static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld128, - xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld128, - xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64, - xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEONFMA); - } - - static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld64, - xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld64, - xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64, - xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEONFMA); - } - - static void f32_gemm_6x8__neonfma_dup_ld128(benchmark::State& 
state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld128, - xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld128, - xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64, - xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEONFMA); - } - - static void f32_gemm_4x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8s4__neon, - xnn_f32_igemm_minmax_ukernel_4x8s4__neon, - xnn_f32_gemm_minmax_ukernel_1x8s4__neon, - xnn_f32_igemm_minmax_ukernel_1x8s4__neon, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2, - benchmark::utils::CheckNEON); - } - - static void f32_gemm_4x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma, - xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma, - xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma, - xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2, - benchmark::utils::CheckNEONFMA); - } - - static void f32_gemm_6x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) { - 
GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8s4__neon, - xnn_f32_igemm_minmax_ukernel_6x8s4__neon, - xnn_f32_gemm_minmax_ukernel_1x8s4__neon, - xnn_f32_igemm_minmax_ukernel_1x8s4__neon, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2, - benchmark::utils::CheckNEON); - } - - static void f32_gemm_6x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma, - xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma, - xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma, - xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2, - benchmark::utils::CheckNEONFMA); - } - - static void f32_gemm_8x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_8x8s4__neon, - xnn_f32_igemm_minmax_ukernel_8x8s4__neon, - xnn_f32_gemm_minmax_ukernel_1x8s4__neon, - xnn_f32_igemm_minmax_ukernel_1x8s4__neon, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/8, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2, - benchmark::utils::CheckNEON); - } - - static void f32_gemm_8x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, - 
xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma, - xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma, - xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/8, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2, - benchmark::utils::CheckNEONFMA); - } - BENCHMARK_FP32_END2END(f32_gemm_2x16__neon_lane_ld128); - BENCHMARK_FP32_END2END(f32_gemm_3x16__neon_lane_ld128); - BENCHMARK_FP32_END2END(f32_gemm_4x16__neon_lane_ld128); - BENCHMARK_FP32_END2END(f32_gemm_5x16__neon_lane_ld128); - BENCHMARK_FP32_END2END(f32_gemm_6x16__neon_lane_ld128); - - BENCHMARK_FP32_END2END(f32_gemm_4x2__neon_lane_ld64); - BENCHMARK_FP32_END2END(f32_gemm_6x2__neon_lane_ld64); - - BENCHMARK_FP32_END2END(f32_gemm_4x8__neon_lane_ld64); - BENCHMARK_FP32_END2END(f32_gemm_4x8__neon_lane_ld128); - BENCHMARK_FP32_END2END(f32_gemm_6x8__neon_lane_ld64); - BENCHMARK_FP32_END2END(f32_gemm_6x8__neon_lane_ld128); - - BENCHMARK_FP32_END2END(f32_gemm_4x8__neon_dup_ld64); - BENCHMARK_FP32_END2END(f32_gemm_4x8__neon_dup_ld128); - BENCHMARK_FP32_END2END(f32_gemm_6x8__neon_dup_ld64); - BENCHMARK_FP32_END2END(f32_gemm_6x8__neon_dup_ld128); - - BENCHMARK_FP32_END2END(f32_gemm_4x8__neonfma_dup_ld64); - BENCHMARK_FP32_END2END(f32_gemm_4x8__neonfma_dup_ld128); - BENCHMARK_FP32_END2END(f32_gemm_6x8__neonfma_dup_ld64); - BENCHMARK_FP32_END2END(f32_gemm_6x8__neonfma_dup_ld128); - - BENCHMARK_FP32_END2END(f32_gemm_4x8s4__neon); - BENCHMARK_FP32_END2END(f32_gemm_6x8s4__neon); - BENCHMARK_FP32_END2END(f32_gemm_8x8s4__neon); - - BENCHMARK_FP32_END2END(f32_gemm_4x8s4__neonfma); - BENCHMARK_FP32_END2END(f32_gemm_6x8s4__neonfma); - BENCHMARK_FP32_END2END(f32_gemm_8x8s4__neonfma); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - static void f32_gemm_4x16__avx512f_broadcast(benchmark::State& 
state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x16__avx512f_broadcast, - xnn_f32_igemm_minmax_ukernel_4x16__avx512f_broadcast, - xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, - xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX512F); - } - static void f32_gemm_5x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x16__avx512f_broadcast, - xnn_f32_igemm_minmax_ukernel_5x16__avx512f_broadcast, - xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, - xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX512F); - } - static void f32_gemm_6x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, - xnn_f32_igemm_minmax_ukernel_6x16__avx512f_broadcast, - xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, - xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - 
benchmark::utils::CheckAVX512F); - } - static void f32_gemm_7x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, - xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast, - xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, - xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/7, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX512F); - } - static void f32_gemm_8x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, - xnn_f32_igemm_minmax_ukernel_8x16__avx512f_broadcast, - xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, - xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/8, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX512F); - } - - static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_4x8__fma3_broadcast, - xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - 
/*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckFMA3); - } - static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_5x8__fma3_broadcast, - xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckFMA3); - } - static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_6x8__fma3_broadcast, - xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckFMA3); - } - static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_7x8__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_7x8__fma3_broadcast, - xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - 
/*mr=*/7, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckFMA3); - } - static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_8x8__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_8x8__fma3_broadcast, - xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/8, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckFMA3); - } - static void f32_gemm_3x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_3x16__fma3_broadcast, - xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/3, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckFMA3); - } - static void f32_gemm_4x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_4x16__fma3_broadcast, - xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - 
xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckFMA3); - } - static void f32_gemm_5x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast, - xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckFMA3); - } - static void f32_gemm_3x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast, - xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/3, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/2, - benchmark::utils::CheckFMA3); - } - static void f32_gemm_4x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast, - xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr 
/* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/2, - benchmark::utils::CheckFMA3); - } - static void f32_gemm_5x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast, - xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast, - xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/2, - benchmark::utils::CheckFMA3); - } - - static void f32_gemm_4x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__avx_broadcast, - xnn_f32_igemm_minmax_ukernel_4x8__avx_broadcast, - xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast, - xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast, - xnn_f32_igemm_minmax_ukernel_5x8__avx_broadcast, - xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast, - xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, 
nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, - xnn_f32_igemm_minmax_ukernel_6x8__avx_broadcast, - xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast, - xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_7x8__avx_broadcast, - xnn_f32_igemm_minmax_ukernel_7x8__avx_broadcast, - xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast, - xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/7, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void f32_gemm_3x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast, - xnn_f32_igemm_minmax_ukernel_3x16__avx_broadcast, - xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast, - xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* 
gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/3, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void f32_gemm_4x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast, - xnn_f32_igemm_minmax_ukernel_4x16__avx_broadcast, - xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast, - xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void f32_gemm_5x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast, - xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast, - xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast, - xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - - static void f32_gemm_3x8__sse_load1(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_3x8__sse_load1, - xnn_f32_igemm_minmax_ukernel_3x8__sse_load1, - xnn_f32_gemm_minmax_ukernel_1x8__sse_load1, - xnn_f32_igemm_minmax_ukernel_1x8__sse_load1, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* 
igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/3, /*nr=*/8); - } - static void f32_gemm_4x8__sse_load1(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__sse_load1, - xnn_f32_igemm_minmax_ukernel_4x8__sse_load1, - xnn_f32_gemm_minmax_ukernel_1x8__sse_load1, - xnn_f32_igemm_minmax_ukernel_1x8__sse_load1, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8); - } - static void f32_gemm_5x8__sse_load1(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x8__sse_load1, - xnn_f32_igemm_minmax_ukernel_5x8__sse_load1, - xnn_f32_gemm_minmax_ukernel_1x8__sse_load1, - xnn_f32_igemm_minmax_ukernel_1x8__sse_load1, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/8); - } - static void f32_gemm_3x8__sse_dup(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, - xnn_f32_igemm_minmax_ukernel_3x8__sse_dup, - xnn_f32_gemm_minmax_ukernel_1x8__sse_dup, - xnn_f32_igemm_minmax_ukernel_1x8__sse_dup, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/3, /*nr=*/8); - } - static void f32_gemm_4x8__sse_dup(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__sse_dup, - 
xnn_f32_igemm_minmax_ukernel_4x8__sse_dup, - xnn_f32_gemm_minmax_ukernel_1x8__sse_dup, - xnn_f32_igemm_minmax_ukernel_1x8__sse_dup, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8); - } - static void f32_gemm_5x8__sse_dup(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x8__sse_dup, - xnn_f32_igemm_minmax_ukernel_5x8__sse_dup, - xnn_f32_gemm_minmax_ukernel_1x8__sse_dup, - xnn_f32_igemm_minmax_ukernel_1x8__sse_dup, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/8); - } - static void f32_gemm_3x8s4__sse(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_3x8s4__sse, - xnn_f32_igemm_minmax_ukernel_3x8s4__sse, - xnn_f32_gemm_minmax_ukernel_1x8s4__sse, - xnn_f32_igemm_minmax_ukernel_1x8s4__sse, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/3, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2); - } - static void f32_gemm_4x8s4__sse(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8s4__sse, - xnn_f32_igemm_minmax_ukernel_4x8s4__sse, - xnn_f32_gemm_minmax_ukernel_1x8s4__sse, - xnn_f32_igemm_minmax_ukernel_1x8s4__sse, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm 
*/, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2); - } - static void f32_gemm_5x8s4__sse(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x8s4__sse, - xnn_f32_igemm_minmax_ukernel_5x8s4__sse, - xnn_f32_gemm_minmax_ukernel_1x8s4__sse, - xnn_f32_igemm_minmax_ukernel_1x8s4__sse, - nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */, - nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2); - } - - BENCHMARK_FP32_END2END(f32_gemm_4x16__avx512f_broadcast); - BENCHMARK_FP32_END2END(f32_gemm_5x16__avx512f_broadcast); - BENCHMARK_FP32_END2END(f32_gemm_6x16__avx512f_broadcast); - BENCHMARK_FP32_END2END(f32_gemm_7x16__avx512f_broadcast); - BENCHMARK_FP32_END2END(f32_gemm_8x16__avx512f_broadcast); - - BENCHMARK_FP32_END2END(f32_gemm_4x8__fma3_broadcast); - BENCHMARK_FP32_END2END(f32_gemm_5x8__fma3_broadcast); - BENCHMARK_FP32_END2END(f32_gemm_6x8__fma3_broadcast); - BENCHMARK_FP32_END2END(f32_gemm_7x8__fma3_broadcast); - BENCHMARK_FP32_END2END(f32_gemm_8x8__fma3_broadcast); - BENCHMARK_FP32_END2END(f32_gemm_3x16__fma3_broadcast); - BENCHMARK_FP32_END2END(f32_gemm_4x16__fma3_broadcast); - BENCHMARK_FP32_END2END(f32_gemm_5x16__fma3_broadcast); - - BENCHMARK_FP32_END2END(f32_gemm_3x16s4__fma3_broadcast); - BENCHMARK_FP32_END2END(f32_gemm_4x16s4__fma3_broadcast); - BENCHMARK_FP32_END2END(f32_gemm_5x16s4__fma3_broadcast); - - BENCHMARK_FP32_END2END(f32_gemm_4x8__avx_broadcast); - BENCHMARK_FP32_END2END(f32_gemm_5x8__avx_broadcast); - BENCHMARK_FP32_END2END(f32_gemm_6x8__avx_broadcast); - BENCHMARK_FP32_END2END(f32_gemm_7x8__avx_broadcast); - BENCHMARK_FP32_END2END(f32_gemm_3x16__avx_broadcast); - BENCHMARK_FP32_END2END(f32_gemm_4x16__avx_broadcast); 
- BENCHMARK_FP32_END2END(f32_gemm_5x16__avx_broadcast); - - BENCHMARK_FP32_END2END(f32_gemm_3x8__sse_load1); - BENCHMARK_FP32_END2END(f32_gemm_4x8__sse_load1); - BENCHMARK_FP32_END2END(f32_gemm_5x8__sse_load1); - - BENCHMARK_FP32_END2END(f32_gemm_3x8__sse_dup); - BENCHMARK_FP32_END2END(f32_gemm_4x8__sse_dup); - BENCHMARK_FP32_END2END(f32_gemm_5x8__sse_dup); - - BENCHMARK_FP32_END2END(f32_gemm_3x8s4__sse); - BENCHMARK_FP32_END2END(f32_gemm_4x8s4__sse); - BENCHMARK_FP32_END2END(f32_gemm_5x8s4__sse); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMRELAXEDSIMD - static void f32_gemm_3x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, - xnn_f32_igemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, - xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_3x8__wasmsimd_loadsplat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_3x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_3x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/3, /*nr=*/8); - } - static void f32_gemm_4x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat, - xnn_f32_igemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, - xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_loadsplat, - 
xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_4x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_4x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8); - } - static void f32_gemm_5x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat, - xnn_f32_igemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, - xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_loadsplat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_5x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_5x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/8); - } - static void f32_gemm_6x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat, - xnn_f32_igemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, - xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_6x8__wasmsimd_loadsplat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_6x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_6x8__wasmsimd_loadsplat, - 
xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8); - } - static void f32_gemm_3x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_gemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_gemm_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/3, /*nr=*/8); - } - static void f32_gemm_4x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_gemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_gemm_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat, - 
xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8); - } - static void f32_gemm_5x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_gemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_gemm_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/8); - } - static void f32_gemm_6x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_gemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_gemm_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat, - 
xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8); - } - static void f32_gemm_3x8__wasmrelaxedsimd_splat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat, - xnn_f32_igemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat, - xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_3x8__wasmsimd_splat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_3x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_3x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_splat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/3, /*nr=*/8); - } - static void f32_gemm_4x8__wasmrelaxedsimd_splat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat, - xnn_f32_igemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat, - xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_4x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_4x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_splat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8); - } - static void f32_gemm_5x8__wasmrelaxedsimd_splat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - 
xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_splat, - xnn_f32_igemm_minmax_ukernel_5x8__wasmrelaxedsimd_splat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat, - xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_5x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_5x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_splat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/8); - } - static void f32_gemm_6x8__wasmrelaxedsimd_splat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_splat, - xnn_f32_igemm_minmax_ukernel_6x8__wasmrelaxedsimd_splat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat, - xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_6x8__wasmsimd_splat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_6x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_6x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_splat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8); - } - static void f32_gemm_3x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_splat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_f32_gemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_splat, - 
xnn_f32_igemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_splat, - xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_f32_gemm_ukernel_3x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_ukernel_3x8__wasmrelaxedsimd_fma_splat, - xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/3, /*nr=*/8); - } - static void f32_gemm_4x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_splat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_f32_gemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_splat, - xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_f32_gemm_ukernel_4x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_ukernel_4x8__wasmrelaxedsimd_fma_splat, - xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8); - } - static void f32_gemm_5x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_splat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_f32_gemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_splat, - xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat, - 
xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_f32_gemm_ukernel_5x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_ukernel_5x8__wasmrelaxedsimd_fma_splat, - xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/8); - } - static void f32_gemm_6x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_splat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_f32_gemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_splat, - xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_f32_gemm_ukernel_6x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_ukernel_6x8__wasmrelaxedsimd_fma_splat, - xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_splat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8); - } - static void f32_gemm_3x8s4__wasmrelaxedsimd(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd, - xnn_f32_igemm_minmax_ukernel_3x8s4__wasmrelaxedsimd, - xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd, - xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd, - xnn_f32_gemm_relu_ukernel_3x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_3x8s4__wasmsimd, - xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_gemm_ukernel_3x8s4__wasmsimd, - xnn_f32_igemm_ukernel_3x8s4__wasmsimd, - xnn_f32_gemm_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_ukernel_1x8s4__wasmsimd, - 
xnn_init_f32_minmax_scalar_params, - /*mr=*/3, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2); - } - static void f32_gemm_4x8s4__wasmrelaxedsimd(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd, - xnn_f32_igemm_minmax_ukernel_4x8s4__wasmrelaxedsimd, - xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd, - xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd, - xnn_f32_gemm_relu_ukernel_4x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_4x8s4__wasmsimd, - xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_gemm_ukernel_4x8s4__wasmsimd, - xnn_f32_igemm_ukernel_4x8s4__wasmsimd, - xnn_f32_gemm_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_ukernel_1x8s4__wasmsimd, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2); - } - static void f32_gemm_5x8s4__wasmrelaxedsimd(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd, - xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd, - xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd, - xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd, - xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_5x8s4__wasmsimd, - xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_gemm_ukernel_5x8s4__wasmsimd, - xnn_f32_igemm_ukernel_5x8s4__wasmsimd, - xnn_f32_gemm_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_ukernel_1x8s4__wasmsimd, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2); - } - static void f32_gemm_6x8s4__wasmrelaxedsimd(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8s4__wasmrelaxedsimd, - xnn_f32_igemm_minmax_ukernel_6x8s4__wasmrelaxedsimd, - 
xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd, - xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd, - xnn_f32_gemm_relu_ukernel_6x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_6x8s4__wasmsimd, - xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_gemm_ukernel_6x8s4__wasmsimd, - xnn_f32_igemm_ukernel_6x8s4__wasmsimd, - xnn_f32_gemm_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_ukernel_1x8s4__wasmsimd, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2); - } - static void f32_gemm_3x8s4__wasmrelaxedsimd_fma(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma, - xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_f32_gemm_relu_ukernel_3x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_relu_ukernel_3x8s4__wasmrelaxedsimd_fma, - xnn_f32_gemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_f32_gemm_ukernel_3x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_ukernel_3x8s4__wasmrelaxedsimd_fma, - xnn_f32_gemm_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params, - /*mr=*/3, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2); - } - static void f32_gemm_4x8s4__wasmrelaxedsimd_fma(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma, - xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_f32_gemm_relu_ukernel_4x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_relu_ukernel_4x8s4__wasmrelaxedsimd_fma, - 
xnn_f32_gemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_f32_gemm_ukernel_4x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_ukernel_4x8s4__wasmrelaxedsimd_fma, - xnn_f32_gemm_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2); - } - static void f32_gemm_5x8s4__wasmrelaxedsimd_fma(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma, - xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_f32_gemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma, - xnn_f32_gemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_f32_gemm_ukernel_5x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_ukernel_5x8s4__wasmrelaxedsimd_fma, - xnn_f32_gemm_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2); - } - static void f32_gemm_6x8s4__wasmrelaxedsimd_fma(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_minmax_ukernel_6x8s4__wasmrelaxedsimd_fma, - xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_f32_gemm_relu_ukernel_6x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_relu_ukernel_6x8s4__wasmrelaxedsimd_fma, - xnn_f32_gemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_f32_gemm_ukernel_6x8s4__wasmrelaxedsimd_fma, - 
xnn_f32_igemm_ukernel_6x8s4__wasmrelaxedsimd_fma, - xnn_f32_gemm_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_f32_igemm_ukernel_1x8s4__wasmrelaxedsimd_fma, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2); - } - - BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmrelaxedsimd_loadsplat); - BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmrelaxedsimd_loadsplat); - BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmrelaxedsimd_loadsplat); - BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmrelaxedsimd_loadsplat); - - BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmrelaxedsimd_fma_loadsplat); - BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmrelaxedsimd_fma_loadsplat); - BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmrelaxedsimd_fma_loadsplat); - BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmrelaxedsimd_fma_loadsplat); - - BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmrelaxedsimd_splat); - BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmrelaxedsimd_splat); - BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmrelaxedsimd_splat); - BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmrelaxedsimd_splat); - - BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmrelaxedsimd_fma_splat); - BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmrelaxedsimd_fma_splat); - BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmrelaxedsimd_fma_splat); - BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmrelaxedsimd_fma_splat); - - BENCHMARK_FP32_END2END(f32_gemm_3x8s4__wasmrelaxedsimd); - BENCHMARK_FP32_END2END(f32_gemm_4x8s4__wasmrelaxedsimd); - BENCHMARK_FP32_END2END(f32_gemm_5x8s4__wasmrelaxedsimd); - BENCHMARK_FP32_END2END(f32_gemm_6x8s4__wasmrelaxedsimd); - - BENCHMARK_FP32_END2END(f32_gemm_3x8s4__wasmrelaxedsimd_fma); - BENCHMARK_FP32_END2END(f32_gemm_4x8s4__wasmrelaxedsimd_fma); - BENCHMARK_FP32_END2END(f32_gemm_5x8s4__wasmrelaxedsimd_fma); - BENCHMARK_FP32_END2END(f32_gemm_6x8s4__wasmrelaxedsimd_fma); -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - static void f32_gemm_3x8__wasmsimd_arm_loadsplat(benchmark::State& state, 
models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat, - xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat, - xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_3x8__wasmsimd_loadsplat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_3x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_3x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/3, /*nr=*/8); - } - static void f32_gemm_4x8__wasmsimd_arm_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat, - xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat, - xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_loadsplat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_4x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_4x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8); - } - static void f32_gemm_5x8__wasmsimd_arm_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, - xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat, - 
xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat, - xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_loadsplat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_5x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_5x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/8); - } - static void f32_gemm_6x8__wasmsimd_arm_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat, - xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat, - xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_6x8__wasmsimd_loadsplat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_6x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_6x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8); - } - static void f32_gemm_3x8__wasmsimd_x86_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat, - xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, - xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_3x8__wasmsimd_loadsplat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat, - 
xnn_f32_gemm_ukernel_3x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_3x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/3, /*nr=*/8); - } - static void f32_gemm_4x8__wasmsimd_x86_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, - xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, - xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_loadsplat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_4x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_4x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8); - } - static void f32_gemm_5x8__wasmsimd_x86_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat, - xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, - xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_loadsplat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_5x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_5x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/8); - } - static void 
f32_gemm_6x8__wasmsimd_x86_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat, - xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, - xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_6x8__wasmsimd_loadsplat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_6x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_6x8__wasmsimd_loadsplat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8); - } - static void f32_gemm_3x8__wasmsimd_arm_splat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat, - xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat, - xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_3x8__wasmsimd_splat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_3x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_3x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_splat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/3, /*nr=*/8); - } - static void f32_gemm_4x8__wasmsimd_arm_splat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat, - xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_splat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, - 
xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat, - xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_4x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_4x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_splat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8); - } - static void f32_gemm_5x8__wasmsimd_arm_splat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat, - xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat, - xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_5x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_5x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_splat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/8); - } - static void f32_gemm_6x8__wasmsimd_arm_splat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, - xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_splat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat, - xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_6x8__wasmsimd_splat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_6x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_6x8__wasmsimd_splat, - 
xnn_f32_gemm_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_splat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8); - } - static void f32_gemm_3x8__wasmsimd_x86_splat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat, - xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat, - xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_3x8__wasmsimd_splat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_3x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_3x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_splat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/3, /*nr=*/8); - } - static void f32_gemm_4x8__wasmsimd_x86_splat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat, - xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat, - xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_4x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_4x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_splat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8); - } - static void f32_gemm_5x8__wasmsimd_x86_splat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat, - 
xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat, - xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_5x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_5x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_splat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/8); - } - static void f32_gemm_6x8__wasmsimd_x86_splat(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat, - xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat, - xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat, - xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat, - xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_6x8__wasmsimd_splat, - xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_6x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_6x8__wasmsimd_splat, - xnn_f32_gemm_ukernel_1x8__wasmsimd_splat, - xnn_f32_igemm_ukernel_1x8__wasmsimd_splat, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8); - } - static void f32_gemm_3x8s4__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm, - xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm, - xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm, - xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm, - xnn_f32_gemm_relu_ukernel_3x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_3x8s4__wasmsimd, - xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_gemm_ukernel_3x8s4__wasmsimd, - 
xnn_f32_igemm_ukernel_3x8s4__wasmsimd, - xnn_f32_gemm_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_ukernel_1x8s4__wasmsimd, - xnn_init_f32_minmax_scalar_params, - /*mr=*/3, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2); - } - static void f32_gemm_4x8s4__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm, - xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm, - xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm, - xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm, - xnn_f32_gemm_relu_ukernel_4x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_4x8s4__wasmsimd, - xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_gemm_ukernel_4x8s4__wasmsimd, - xnn_f32_igemm_ukernel_4x8s4__wasmsimd, - xnn_f32_gemm_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_ukernel_1x8s4__wasmsimd, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2); - } - static void f32_gemm_5x8s4__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm, - xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm, - xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm, - xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm, - xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_5x8s4__wasmsimd, - xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_gemm_ukernel_5x8s4__wasmsimd, - xnn_f32_igemm_ukernel_5x8s4__wasmsimd, - xnn_f32_gemm_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_ukernel_1x8s4__wasmsimd, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2); - } - static void f32_gemm_6x8s4__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm, - 
xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm, - xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm, - xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm, - xnn_f32_gemm_relu_ukernel_6x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_6x8s4__wasmsimd, - xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_gemm_ukernel_6x8s4__wasmsimd, - xnn_f32_igemm_ukernel_6x8s4__wasmsimd, - xnn_f32_gemm_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_ukernel_1x8s4__wasmsimd, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2); - } - static void f32_gemm_3x8s4__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, - xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86, - xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86, - xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86, - xnn_f32_gemm_relu_ukernel_3x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_3x8s4__wasmsimd, - xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_gemm_ukernel_3x8s4__wasmsimd, - xnn_f32_igemm_ukernel_3x8s4__wasmsimd, - xnn_f32_gemm_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_ukernel_1x8s4__wasmsimd, - xnn_init_f32_minmax_scalar_params, - /*mr=*/3, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2); - } - static void f32_gemm_4x8s4__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86, - xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86, - xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86, - xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86, - xnn_f32_gemm_relu_ukernel_4x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_4x8s4__wasmsimd, - xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_gemm_ukernel_4x8s4__wasmsimd, - xnn_f32_igemm_ukernel_4x8s4__wasmsimd, - 
xnn_f32_gemm_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_ukernel_1x8s4__wasmsimd, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2); - } - static void f32_gemm_5x8s4__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86, - xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86, - xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86, - xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86, - xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_5x8s4__wasmsimd, - xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_gemm_ukernel_5x8s4__wasmsimd, - xnn_f32_igemm_ukernel_5x8s4__wasmsimd, - xnn_f32_gemm_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_ukernel_1x8s4__wasmsimd, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2); - } - static void f32_gemm_6x8s4__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, - xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86, - xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86, - xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86, - xnn_f32_gemm_relu_ukernel_6x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_6x8s4__wasmsimd, - xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd, - xnn_f32_gemm_ukernel_6x8s4__wasmsimd, - xnn_f32_igemm_ukernel_6x8s4__wasmsimd, - xnn_f32_gemm_ukernel_1x8s4__wasmsimd, - xnn_f32_igemm_ukernel_1x8s4__wasmsimd, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/2); - } - - BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmsimd_arm_loadsplat); - BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmsimd_arm_loadsplat); - BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmsimd_arm_loadsplat); - 
BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmsimd_arm_loadsplat); - - BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmsimd_x86_loadsplat); - BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmsimd_x86_loadsplat); - BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmsimd_x86_loadsplat); - BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmsimd_x86_loadsplat); - - BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmsimd_arm_splat); - BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmsimd_arm_splat); - BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmsimd_arm_splat); - BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmsimd_arm_splat); - - BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmsimd_x86_splat); - BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmsimd_x86_splat); - BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmsimd_x86_splat); - BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmsimd_x86_splat); - - BENCHMARK_FP32_END2END(f32_gemm_3x8s4__wasmsimd_arm); - BENCHMARK_FP32_END2END(f32_gemm_4x8s4__wasmsimd_arm); - BENCHMARK_FP32_END2END(f32_gemm_5x8s4__wasmsimd_arm); - BENCHMARK_FP32_END2END(f32_gemm_6x8s4__wasmsimd_arm); - - BENCHMARK_FP32_END2END(f32_gemm_3x8s4__wasmsimd_x86); - BENCHMARK_FP32_END2END(f32_gemm_4x8s4__wasmsimd_x86); - BENCHMARK_FP32_END2END(f32_gemm_5x8s4__wasmsimd_x86); - BENCHMARK_FP32_END2END(f32_gemm_6x8s4__wasmsimd_x86); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -#if XNN_ARCH_WASM - static void f32_gemm_2x4__wasm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_2x4__wasm, - xnn_f32_igemm_minmax_ukernel_2x4__wasm, - xnn_f32_gemm_minmax_ukernel_1x4__wasm, - xnn_f32_igemm_minmax_ukernel_1x4__wasm, - xnn_f32_gemm_relu_ukernel_2x4__wasm, - xnn_f32_igemm_relu_ukernel_2x4__wasm, - xnn_f32_gemm_relu_ukernel_1x4__wasm, - xnn_f32_igemm_relu_ukernel_1x4__wasm, - xnn_f32_gemm_ukernel_2x4__scalar, - xnn_f32_igemm_ukernel_2x4__scalar, - xnn_f32_gemm_ukernel_1x4__scalar, - xnn_f32_igemm_ukernel_1x4__scalar, - xnn_init_f32_minmax_scalar_params, - /*mr=*/2, /*nr=*/4); - 
} - - static void f32_gemm_4x4__wasm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x4__wasm, - xnn_f32_igemm_minmax_ukernel_4x4__wasm, - xnn_f32_gemm_minmax_ukernel_1x4__wasm, - xnn_f32_igemm_minmax_ukernel_1x4__wasm, - xnn_f32_gemm_relu_ukernel_4x4__wasm, - xnn_f32_igemm_relu_ukernel_4x4__wasm, - xnn_f32_gemm_relu_ukernel_1x4__wasm, - xnn_f32_igemm_relu_ukernel_1x4__wasm, - xnn_f32_gemm_ukernel_4x4__scalar, - xnn_f32_igemm_ukernel_4x4__scalar, - xnn_f32_gemm_ukernel_1x4__scalar, - xnn_f32_igemm_ukernel_1x4__scalar, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/4); - } - - BENCHMARK_FP32_END2END(f32_gemm_2x4__wasm); - BENCHMARK_FP32_END2END(f32_gemm_4x4__wasm); -#endif // XNN_ARCH_WASM - - -static void f32_gemm_2x4__scalar(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_2x4__scalar, - xnn_f32_igemm_minmax_ukernel_2x4__scalar, - xnn_f32_gemm_minmax_ukernel_1x4__scalar, - xnn_f32_igemm_minmax_ukernel_1x4__scalar, - xnn_f32_gemm_relu_ukernel_2x4__scalar, - xnn_f32_igemm_relu_ukernel_2x4__scalar, - xnn_f32_gemm_relu_ukernel_1x4__scalar, - xnn_f32_igemm_relu_ukernel_1x4__scalar, - xnn_f32_gemm_ukernel_2x4__scalar, - xnn_f32_igemm_ukernel_2x4__scalar, - xnn_f32_gemm_ukernel_1x4__scalar, - xnn_f32_igemm_ukernel_1x4__scalar, - xnn_init_f32_minmax_scalar_params, - /*mr=*/2, /*nr=*/4); -} - -static void f32_gemm_4x4__scalar(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_f32_gemm_minmax_ukernel_4x4__scalar, - xnn_f32_igemm_minmax_ukernel_4x4__scalar, - xnn_f32_gemm_minmax_ukernel_1x4__scalar, - xnn_f32_igemm_minmax_ukernel_1x4__scalar, - xnn_f32_gemm_relu_ukernel_4x4__scalar, - xnn_f32_igemm_relu_ukernel_4x4__scalar, - xnn_f32_gemm_relu_ukernel_1x4__scalar, - xnn_f32_igemm_relu_ukernel_1x4__scalar, - xnn_f32_gemm_ukernel_4x4__scalar, - 
xnn_f32_igemm_ukernel_4x4__scalar, - xnn_f32_gemm_ukernel_1x4__scalar, - xnn_f32_igemm_ukernel_1x4__scalar, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/4); -} - -BENCHMARK_FP32_END2END(f32_gemm_2x4__scalar); -BENCHMARK_FP32_END2END(f32_gemm_4x4__scalar); - - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/models/BUILD b/bench/models/BUILD new file mode 100644 index 00000000000..3dd3599a9b3 --- /dev/null +++ b/bench/models/BUILD @@ -0,0 +1,42 @@ +# Copyright 2023 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +load( + "//:build_defs.bzl", + "xnnpack_benchmark", + "xnnpack_cxx_library", + "xnnpack_slow_benchmark_tags", +) + +xnnpack_cxx_library( + name = "models", + srcs = [ + "fp32-mobilenet-v1.cc", + "fp32-mobilenet-v2.cc", + "fp32-mobilenet-v3-large.cc", + "fp32-mobilenet-v3-small.cc", + "qs8-mobilenet-v2.cc", + ], + hdrs = [ + "models.h", + ], + deps = [ + "//:XNNPACK", + ], +) + +xnnpack_benchmark( + name = "benchmark", + srcs = ["benchmark.cc"], + tags = xnnpack_slow_benchmark_tags(), + deps = [ + ":models", + "//:allocator", + "//:subgraph", + "//:xnnpack_h", + "//bench:bench_utils", + "@pthreadpool", + ], +) diff --git a/bench/models/benchmark.cc b/bench/models/benchmark.cc new file mode 100644 index 00000000000..de46af8b8f0 --- /dev/null +++ b/bench/models/benchmark.cc @@ -0,0 +1,179 @@ +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include +#include +#include +#include +#include +#include + +#include "models.h" +#include "bench/utils.h" +#include "xnnpack.h" +#include "xnnpack/allocator.h" +#include "xnnpack/subgraph.h" +#include "pthreadpool.h" + +struct ModelRuntime { + std::unique_ptr model; + pthreadpool_t threadpool = nullptr; + xnn_runtime_t runtime = nullptr; + std::vector external_values; + + explicit ModelRuntime(int num_threads) : model(nullptr, xnn_delete_subgraph) { + xnn_delete_runtime(runtime); + threadpool = pthreadpool_create(num_threads); + } + + ~ModelRuntime() { + if (runtime) { + xnn_delete_runtime(runtime); + } + if (threadpool) { + pthreadpool_destroy(threadpool); + } + for (xnn_external_value& i : external_values) { + xnn_release_simd_memory(i.data); + } + } + + bool CreateModel(std::function model_factory) { + model.reset(model_factory()); + if (!model) { + return false; + } + for (uint32_t i = 0; i < model->num_values; ++i) { + if ((model->values[i].flags & (XNN_VALUE_FLAG_EXTERNAL_INPUT | + XNN_VALUE_FLAG_EXTERNAL_OUTPUT)) == 0) { + continue; + } + // Make a buffer for this external value. 
+ size_t size = xnn_tensor_get_size(&model->values[i]) + XNN_EXTRA_BYTES; + external_values.push_back( + xnn_external_value{i, xnn_allocate_zero_simd_memory(size)}); + } + return model != nullptr; + } + + bool CreateRuntime(uint32_t flags) { + assert(!runtime); + return xnn_status_success == xnn_create_runtime_v4(model.get(), nullptr, + nullptr, threadpool, + flags, &runtime); + } + bool ReshapeRuntime() { + return xnn_status_success == xnn_reshape_runtime(runtime); + } + + bool SetupRuntime() { + return xnn_status_success == xnn_setup_runtime_v2(runtime, + external_values.size(), + external_values.data()); + } + + bool Invoke() { return xnn_status_success == xnn_invoke_runtime(runtime); } +}; + +static void BenchmarkInvoke(benchmark::State& state, + std::function model_factory, + uint32_t flags = 0) { + if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) { + state.SkipWithError("failed to initialize XNNPACK"); + return; + } + + ModelRuntime model_runtime(state.range(0)); + if (!model_runtime.CreateModel(model_factory)) { + state.SkipWithError("failed to create model"); + return; + } + + // TODO(dsharlet): We should have benchmarks of these steps too. 
+ if (!model_runtime.CreateRuntime(flags)) { + state.SkipWithError("failed to create runtime"); + return; + } + + if (!model_runtime.ReshapeRuntime()) { + state.SkipWithError("failed to reshape runtime"); + return; + } + + if (!model_runtime.SetupRuntime()) { + state.SkipWithError("failed to setup runtime"); + return; + } + + for (auto _ : state) { + if (!model_runtime.Invoke()) { + state.SkipWithError("failed to invoke runtime"); + return; + } + } + + const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); + if (cpu_frequency != 0) { + state.counters["cpufreq"] = cpu_frequency; + } +} + +static void FP32MobileNetV1(benchmark::State& state) { + BenchmarkInvoke(state, models::FP32MobileNetV1); +} + +static void FP32MobileNetV2(benchmark::State& state) { + BenchmarkInvoke(state, models::FP32MobileNetV2); +} + +static void FP32MobileNetV3Large(benchmark::State& state) { + BenchmarkInvoke(state, models::FP32MobileNetV3Large); +} + +static void FP32MobileNetV3Small(benchmark::State& state) { + BenchmarkInvoke(state, models::FP32MobileNetV3Small); +} + +static void FP16MobileNetV1(benchmark::State& state) { + BenchmarkInvoke(state, models::FP32MobileNetV1, + XNN_FLAG_FORCE_FP16_INFERENCE); +} + +static void FP16MobileNetV2(benchmark::State& state) { + BenchmarkInvoke(state, models::FP32MobileNetV2, + XNN_FLAG_FORCE_FP16_INFERENCE); +} + +static void FP16MobileNetV3Large(benchmark::State& state) { + BenchmarkInvoke(state, models::FP32MobileNetV3Large, + XNN_FLAG_FORCE_FP16_INFERENCE); +} + +static void FP16MobileNetV3Small(benchmark::State& state) { + BenchmarkInvoke(state, models::FP32MobileNetV3Small, + XNN_FLAG_FORCE_FP16_INFERENCE); +} + +static void QS8MobileNetV2(benchmark::State& state) { + BenchmarkInvoke(state, models::QS8MobileNetV2); +} + +BENCHMARK(FP32MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); 
+BENCHMARK(FP32MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); +BENCHMARK(FP32MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); +BENCHMARK(FP32MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); + +BENCHMARK(FP16MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); +BENCHMARK(FP16MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); +BENCHMARK(FP16MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); +BENCHMARK(FP16MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); + +BENCHMARK(QS8MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); + +#ifndef XNNPACK_BENCHMARK_NO_MAIN +BENCHMARK_MAIN(); +#endif diff --git a/bench/models/fp32-mobilenet-v1.cc b/bench/models/fp32-mobilenet-v1.cc new file mode 100644 index 00000000000..937500f4cac --- /dev/null +++ b/bench/models/fp32-mobilenet-v1.cc @@ -0,0 +1,1747 @@ +// Copyright 2020 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +// +// Auto-generated file. Do not edit! 
+ +#include "xnnpack.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xnnpack.h" + +// align a size up to XNN_EXTRA_BYTES +#define XNN_PAD_EXTRA_BYTES(s, t) (((s) + XNN_EXTRA_BYTES / sizeof(t) - 1) & ~(XNN_EXTRA_BYTES / sizeof(t) - 1)) + +namespace models { + +xnn_subgraph_t FP32MobileNetV1() { + xnn_status status; + xnn_subgraph_t subgraph = nullptr; + status = xnn_create_subgraph(/*num_external_values=*/2, 0, &subgraph); + if (status != xnn_status_success) { + std::cerr << "failed to create subgrpah" << std::endl; + return nullptr; + } + + uint32_t v0 = XNN_INVALID_VALUE_ID; + std::array v0_dims = {{1, 224, 224, 3}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v0_dims.size(), v0_dims.data(), + /*data=*/nullptr, + 1, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v0" << std::endl; + return nullptr; + } + + uint32_t v1 = XNN_INVALID_VALUE_ID; + std::array v1_dims = {{1, 112, 112, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v1_dims.size(), v1_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v1); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v1" << std::endl; + return nullptr; + } + + uint32_t v2 = XNN_INVALID_VALUE_ID; + std::array v2_dims = {{1, 112, 112, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v2_dims.size(), v2_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v2); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v2" << std::endl; + return nullptr; + } + + uint32_t v3 = XNN_INVALID_VALUE_ID; + std::array v3_dims = {{1, 112, 112, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v3_dims.size(), v3_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v3); + if (status != xnn_status_success) { + std::cerr << "failed 
to create tensor v3" << std::endl; + return nullptr; + } + + uint32_t v4 = XNN_INVALID_VALUE_ID; + std::array v4_dims = {{1, 56, 56, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v4_dims.size(), v4_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v4); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v4" << std::endl; + return nullptr; + } + + uint32_t v5 = XNN_INVALID_VALUE_ID; + std::array v5_dims = {{1, 56, 56, 128}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v5_dims.size(), v5_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v5); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v5" << std::endl; + return nullptr; + } + + uint32_t v6 = XNN_INVALID_VALUE_ID; + std::array v6_dims = {{1, 56, 56, 128}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v6_dims.size(), v6_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v6); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v6" << std::endl; + return nullptr; + } + + uint32_t v7 = XNN_INVALID_VALUE_ID; + std::array v7_dims = {{1, 56, 56, 128}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v7_dims.size(), v7_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v7); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v7" << std::endl; + return nullptr; + } + + uint32_t v8 = XNN_INVALID_VALUE_ID; + std::array v8_dims = {{1, 28, 28, 128}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v8_dims.size(), v8_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v8); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v8" << std::endl; + return nullptr; + } + + uint32_t v9 = XNN_INVALID_VALUE_ID; + std::array v9_dims = {{1, 28, 28, 256}}; + status = xnn_define_tensor_value( 
+ subgraph, xnn_datatype_fp32, + v9_dims.size(), v9_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v9); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v9" << std::endl; + return nullptr; + } + + uint32_t v10 = XNN_INVALID_VALUE_ID; + std::array v10_dims = {{1, 28, 28, 256}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v10_dims.size(), v10_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v10); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v10" << std::endl; + return nullptr; + } + + uint32_t v11 = XNN_INVALID_VALUE_ID; + std::array v11_dims = {{1, 28, 28, 256}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v11_dims.size(), v11_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v11); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v11" << std::endl; + return nullptr; + } + + uint32_t v12 = XNN_INVALID_VALUE_ID; + std::array v12_dims = {{1, 14, 14, 256}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v12_dims.size(), v12_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v12); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v12" << std::endl; + return nullptr; + } + + uint32_t v13 = XNN_INVALID_VALUE_ID; + std::array v13_dims = {{1, 14, 14, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v13_dims.size(), v13_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v13); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v13" << std::endl; + return nullptr; + } + + uint32_t v14 = XNN_INVALID_VALUE_ID; + std::array v14_dims = {{1, 14, 14, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v14_dims.size(), v14_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v14); + if (status != 
xnn_status_success) { + std::cerr << "failed to create tensor v14" << std::endl; + return nullptr; + } + + uint32_t v15 = XNN_INVALID_VALUE_ID; + std::array v15_dims = {{1, 14, 14, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v15_dims.size(), v15_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v15); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v15" << std::endl; + return nullptr; + } + + uint32_t v16 = XNN_INVALID_VALUE_ID; + std::array v16_dims = {{1, 14, 14, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v16_dims.size(), v16_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v16); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v16" << std::endl; + return nullptr; + } + + uint32_t v17 = XNN_INVALID_VALUE_ID; + std::array v17_dims = {{1, 14, 14, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v17_dims.size(), v17_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v17); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v17" << std::endl; + return nullptr; + } + + uint32_t v18 = XNN_INVALID_VALUE_ID; + std::array v18_dims = {{1, 14, 14, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v18_dims.size(), v18_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v18); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v18" << std::endl; + return nullptr; + } + + uint32_t v19 = XNN_INVALID_VALUE_ID; + std::array v19_dims = {{1, 14, 14, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v19_dims.size(), v19_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v19); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v19" << std::endl; + return nullptr; + } + + uint32_t v20 = XNN_INVALID_VALUE_ID; 
+ std::array v20_dims = {{1, 14, 14, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v20_dims.size(), v20_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v20); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v20" << std::endl; + return nullptr; + } + + uint32_t v21 = XNN_INVALID_VALUE_ID; + std::array v21_dims = {{1, 14, 14, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v21_dims.size(), v21_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v21); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v21" << std::endl; + return nullptr; + } + + uint32_t v22 = XNN_INVALID_VALUE_ID; + std::array v22_dims = {{1, 14, 14, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v22_dims.size(), v22_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v22); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v22" << std::endl; + return nullptr; + } + + uint32_t v23 = XNN_INVALID_VALUE_ID; + std::array v23_dims = {{1, 14, 14, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v23_dims.size(), v23_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v23); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v23" << std::endl; + return nullptr; + } + + uint32_t v24 = XNN_INVALID_VALUE_ID; + std::array v24_dims = {{1, 7, 7, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v24_dims.size(), v24_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v24); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v24" << std::endl; + return nullptr; + } + + uint32_t v25 = XNN_INVALID_VALUE_ID; + std::array v25_dims = {{1, 7, 7, 1024}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v25_dims.size(), v25_dims.data(), 
+ /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v25); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v25" << std::endl; + return nullptr; + } + + uint32_t v26 = XNN_INVALID_VALUE_ID; + std::array v26_dims = {{1, 7, 7, 1024}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v26_dims.size(), v26_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v26); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v26" << std::endl; + return nullptr; + } + + uint32_t v27 = XNN_INVALID_VALUE_ID; + std::array v27_dims = {{1, 7, 7, 1024}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v27_dims.size(), v27_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v27); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v27" << std::endl; + return nullptr; + } + + uint32_t v28 = XNN_INVALID_VALUE_ID; + std::array v28_dims = {{1, 1, 1, 1024}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v28_dims.size(), v28_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v28); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v28" << std::endl; + return nullptr; + } + + uint32_t v29 = XNN_INVALID_VALUE_ID; + std::array v29_dims = {{1, 1, 1, 1001}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v29_dims.size(), v29_dims.data(), + /*data=*/nullptr, + 0, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v29); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v29" << std::endl; + return nullptr; + } + + alignas(16) static std::array w30_data; + uint32_t w30 = XNN_INVALID_VALUE_ID; + std::array w30_dims = {{32, 3, 3, 3}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w30_dims.size(), w30_dims.data(), + /*data=*/w30_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w30); + if (status != xnn_status_success) { 
+ std::cerr << "failed to create tensor w30" << std::endl; + return nullptr; + } + + alignas(16) static std::array w31_data; + uint32_t w31 = XNN_INVALID_VALUE_ID; + std::array w31_dims = {{32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w31_dims.size(), w31_dims.data(), + /*data=*/w31_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w31); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w31" << std::endl; + return nullptr; + } + + alignas(16) static std::array w32_data; + uint32_t w32 = XNN_INVALID_VALUE_ID; + std::array w32_dims = {{1, 3, 3, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w32_dims.size(), w32_dims.data(), + /*data=*/w32_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w32); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w32" << std::endl; + return nullptr; + } + + alignas(16) static std::array w33_data; + uint32_t w33 = XNN_INVALID_VALUE_ID; + std::array w33_dims = {{32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w33_dims.size(), w33_dims.data(), + /*data=*/w33_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w33); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w33" << std::endl; + return nullptr; + } + + alignas(16) static std::array w34_data; + uint32_t w34 = XNN_INVALID_VALUE_ID; + std::array w34_dims = {{64, 1, 1, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w34_dims.size(), w34_dims.data(), + /*data=*/w34_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w34); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w34" << std::endl; + return nullptr; + } + + alignas(16) static std::array w35_data; + uint32_t w35 = XNN_INVALID_VALUE_ID; + std::array w35_dims = {{64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w35_dims.size(), w35_dims.data(), + /*data=*/w35_data.data(), + 
XNN_INVALID_VALUE_ID, /*flags=*/0, &w35); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w35" << std::endl; + return nullptr; + } + + alignas(16) static std::array w36_data; + uint32_t w36 = XNN_INVALID_VALUE_ID; + std::array w36_dims = {{1, 3, 3, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w36_dims.size(), w36_dims.data(), + /*data=*/w36_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w36); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w36" << std::endl; + return nullptr; + } + + alignas(16) static std::array w37_data; + uint32_t w37 = XNN_INVALID_VALUE_ID; + std::array w37_dims = {{64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w37_dims.size(), w37_dims.data(), + /*data=*/w37_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w37); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w37" << std::endl; + return nullptr; + } + + alignas(16) static std::array w38_data; + uint32_t w38 = XNN_INVALID_VALUE_ID; + std::array w38_dims = {{128, 1, 1, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w38_dims.size(), w38_dims.data(), + /*data=*/w38_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w38); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w38" << std::endl; + return nullptr; + } + + alignas(16) static std::array w39_data; + uint32_t w39 = XNN_INVALID_VALUE_ID; + std::array w39_dims = {{128}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w39_dims.size(), w39_dims.data(), + /*data=*/w39_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w39); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w39" << std::endl; + return nullptr; + } + + alignas(16) static std::array w40_data; + uint32_t w40 = XNN_INVALID_VALUE_ID; + std::array w40_dims = {{1, 3, 3, 128}}; + status = xnn_define_tensor_value( + subgraph, 
xnn_datatype_fp32, + w40_dims.size(), w40_dims.data(), + /*data=*/w40_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w40); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w40" << std::endl; + return nullptr; + } + + alignas(16) static std::array w41_data; + uint32_t w41 = XNN_INVALID_VALUE_ID; + std::array w41_dims = {{128}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w41_dims.size(), w41_dims.data(), + /*data=*/w41_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w41); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w41" << std::endl; + return nullptr; + } + + alignas(16) static std::array w42_data; + uint32_t w42 = XNN_INVALID_VALUE_ID; + std::array w42_dims = {{128, 1, 1, 128}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w42_dims.size(), w42_dims.data(), + /*data=*/w42_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w42); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w42" << std::endl; + return nullptr; + } + + alignas(16) static std::array w43_data; + uint32_t w43 = XNN_INVALID_VALUE_ID; + std::array w43_dims = {{128}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w43_dims.size(), w43_dims.data(), + /*data=*/w43_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w43); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w43" << std::endl; + return nullptr; + } + + alignas(16) static std::array w44_data; + uint32_t w44 = XNN_INVALID_VALUE_ID; + std::array w44_dims = {{1, 3, 3, 128}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w44_dims.size(), w44_dims.data(), + /*data=*/w44_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w44); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w44" << std::endl; + return nullptr; + } + + alignas(16) static std::array w45_data; + uint32_t w45 = XNN_INVALID_VALUE_ID; + 
std::array w45_dims = {{128}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w45_dims.size(), w45_dims.data(), + /*data=*/w45_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w45); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w45" << std::endl; + return nullptr; + } + + alignas(16) static std::array w46_data; + uint32_t w46 = XNN_INVALID_VALUE_ID; + std::array w46_dims = {{256, 1, 1, 128}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w46_dims.size(), w46_dims.data(), + /*data=*/w46_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w46); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w46" << std::endl; + return nullptr; + } + + alignas(16) static std::array w47_data; + uint32_t w47 = XNN_INVALID_VALUE_ID; + std::array w47_dims = {{256}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w47_dims.size(), w47_dims.data(), + /*data=*/w47_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w47); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w47" << std::endl; + return nullptr; + } + + alignas(16) static std::array w48_data; + uint32_t w48 = XNN_INVALID_VALUE_ID; + std::array w48_dims = {{1, 3, 3, 256}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w48_dims.size(), w48_dims.data(), + /*data=*/w48_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w48); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w48" << std::endl; + return nullptr; + } + + alignas(16) static std::array w49_data; + uint32_t w49 = XNN_INVALID_VALUE_ID; + std::array w49_dims = {{256}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w49_dims.size(), w49_dims.data(), + /*data=*/w49_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w49); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w49" << std::endl; + return nullptr; + } + + 
alignas(16) static std::array w50_data; + uint32_t w50 = XNN_INVALID_VALUE_ID; + std::array w50_dims = {{256, 1, 1, 256}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w50_dims.size(), w50_dims.data(), + /*data=*/w50_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w50); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w50" << std::endl; + return nullptr; + } + + alignas(16) static std::array w51_data; + uint32_t w51 = XNN_INVALID_VALUE_ID; + std::array w51_dims = {{256}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w51_dims.size(), w51_dims.data(), + /*data=*/w51_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w51); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w51" << std::endl; + return nullptr; + } + + alignas(16) static std::array w52_data; + uint32_t w52 = XNN_INVALID_VALUE_ID; + std::array w52_dims = {{1, 3, 3, 256}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w52_dims.size(), w52_dims.data(), + /*data=*/w52_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w52); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w52" << std::endl; + return nullptr; + } + + alignas(16) static std::array w53_data; + uint32_t w53 = XNN_INVALID_VALUE_ID; + std::array w53_dims = {{256}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w53_dims.size(), w53_dims.data(), + /*data=*/w53_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w53); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w53" << std::endl; + return nullptr; + } + + alignas(16) static std::array w54_data; + uint32_t w54 = XNN_INVALID_VALUE_ID; + std::array w54_dims = {{512, 1, 1, 256}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w54_dims.size(), w54_dims.data(), + /*data=*/w54_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w54); + if (status != xnn_status_success) { + 
std::cerr << "failed to create tensor w54" << std::endl; + return nullptr; + } + + alignas(16) static std::array w55_data; + uint32_t w55 = XNN_INVALID_VALUE_ID; + std::array w55_dims = {{512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w55_dims.size(), w55_dims.data(), + /*data=*/w55_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w55); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w55" << std::endl; + return nullptr; + } + + alignas(16) static std::array w56_data; + uint32_t w56 = XNN_INVALID_VALUE_ID; + std::array w56_dims = {{1, 3, 3, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w56_dims.size(), w56_dims.data(), + /*data=*/w56_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w56); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w56" << std::endl; + return nullptr; + } + + alignas(16) static std::array w57_data; + uint32_t w57 = XNN_INVALID_VALUE_ID; + std::array w57_dims = {{512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w57_dims.size(), w57_dims.data(), + /*data=*/w57_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w57); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w57" << std::endl; + return nullptr; + } + + alignas(16) static std::array w58_data; + uint32_t w58 = XNN_INVALID_VALUE_ID; + std::array w58_dims = {{512, 1, 1, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w58_dims.size(), w58_dims.data(), + /*data=*/w58_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w58); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w58" << std::endl; + return nullptr; + } + + alignas(16) static std::array w59_data; + uint32_t w59 = XNN_INVALID_VALUE_ID; + std::array w59_dims = {{512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w59_dims.size(), w59_dims.data(), + /*data=*/w59_data.data(), + 
XNN_INVALID_VALUE_ID, /*flags=*/0, &w59); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w59" << std::endl; + return nullptr; + } + + alignas(16) static std::array w60_data; + uint32_t w60 = XNN_INVALID_VALUE_ID; + std::array w60_dims = {{1, 3, 3, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w60_dims.size(), w60_dims.data(), + /*data=*/w60_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w60); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w60" << std::endl; + return nullptr; + } + + alignas(16) static std::array w61_data; + uint32_t w61 = XNN_INVALID_VALUE_ID; + std::array w61_dims = {{512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w61_dims.size(), w61_dims.data(), + /*data=*/w61_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w61); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w61" << std::endl; + return nullptr; + } + + alignas(16) static std::array w62_data; + uint32_t w62 = XNN_INVALID_VALUE_ID; + std::array w62_dims = {{512, 1, 1, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w62_dims.size(), w62_dims.data(), + /*data=*/w62_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w62); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w62" << std::endl; + return nullptr; + } + + alignas(16) static std::array w63_data; + uint32_t w63 = XNN_INVALID_VALUE_ID; + std::array w63_dims = {{512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w63_dims.size(), w63_dims.data(), + /*data=*/w63_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w63); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w63" << std::endl; + return nullptr; + } + + alignas(16) static std::array w64_data; + uint32_t w64 = XNN_INVALID_VALUE_ID; + std::array w64_dims = {{1, 3, 3, 512}}; + status = xnn_define_tensor_value( + subgraph, 
xnn_datatype_fp32, + w64_dims.size(), w64_dims.data(), + /*data=*/w64_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w64); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w64" << std::endl; + return nullptr; + } + + alignas(16) static std::array w65_data; + uint32_t w65 = XNN_INVALID_VALUE_ID; + std::array w65_dims = {{512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w65_dims.size(), w65_dims.data(), + /*data=*/w65_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w65); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w65" << std::endl; + return nullptr; + } + + alignas(16) static std::array w66_data; + uint32_t w66 = XNN_INVALID_VALUE_ID; + std::array w66_dims = {{512, 1, 1, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w66_dims.size(), w66_dims.data(), + /*data=*/w66_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w66); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w66" << std::endl; + return nullptr; + } + + alignas(16) static std::array w67_data; + uint32_t w67 = XNN_INVALID_VALUE_ID; + std::array w67_dims = {{512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w67_dims.size(), w67_dims.data(), + /*data=*/w67_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w67); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w67" << std::endl; + return nullptr; + } + + alignas(16) static std::array w68_data; + uint32_t w68 = XNN_INVALID_VALUE_ID; + std::array w68_dims = {{1, 3, 3, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w68_dims.size(), w68_dims.data(), + /*data=*/w68_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w68); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w68" << std::endl; + return nullptr; + } + + alignas(16) static std::array w69_data; + uint32_t w69 = XNN_INVALID_VALUE_ID; + 
std::array w69_dims = {{512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w69_dims.size(), w69_dims.data(), + /*data=*/w69_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w69); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w69" << std::endl; + return nullptr; + } + + alignas(16) static std::array w70_data; + uint32_t w70 = XNN_INVALID_VALUE_ID; + std::array w70_dims = {{512, 1, 1, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w70_dims.size(), w70_dims.data(), + /*data=*/w70_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w70); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w70" << std::endl; + return nullptr; + } + + alignas(16) static std::array w71_data; + uint32_t w71 = XNN_INVALID_VALUE_ID; + std::array w71_dims = {{512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w71_dims.size(), w71_dims.data(), + /*data=*/w71_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w71); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w71" << std::endl; + return nullptr; + } + + alignas(16) static std::array w72_data; + uint32_t w72 = XNN_INVALID_VALUE_ID; + std::array w72_dims = {{1, 3, 3, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w72_dims.size(), w72_dims.data(), + /*data=*/w72_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w72); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w72" << std::endl; + return nullptr; + } + + alignas(16) static std::array w73_data; + uint32_t w73 = XNN_INVALID_VALUE_ID; + std::array w73_dims = {{512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w73_dims.size(), w73_dims.data(), + /*data=*/w73_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w73); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w73" << std::endl; + return nullptr; + } + + 
alignas(16) static std::array w74_data; + uint32_t w74 = XNN_INVALID_VALUE_ID; + std::array w74_dims = {{512, 1, 1, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w74_dims.size(), w74_dims.data(), + /*data=*/w74_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w74); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w74" << std::endl; + return nullptr; + } + + alignas(16) static std::array w75_data; + uint32_t w75 = XNN_INVALID_VALUE_ID; + std::array w75_dims = {{512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w75_dims.size(), w75_dims.data(), + /*data=*/w75_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w75); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w75" << std::endl; + return nullptr; + } + + alignas(16) static std::array w76_data; + uint32_t w76 = XNN_INVALID_VALUE_ID; + std::array w76_dims = {{1, 3, 3, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w76_dims.size(), w76_dims.data(), + /*data=*/w76_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w76); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w76" << std::endl; + return nullptr; + } + + alignas(16) static std::array w77_data; + uint32_t w77 = XNN_INVALID_VALUE_ID; + std::array w77_dims = {{512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w77_dims.size(), w77_dims.data(), + /*data=*/w77_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w77); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w77" << std::endl; + return nullptr; + } + + alignas(16) static std::array w78_data; + uint32_t w78 = XNN_INVALID_VALUE_ID; + std::array w78_dims = {{1024, 1, 1, 512}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w78_dims.size(), w78_dims.data(), + /*data=*/w78_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w78); + if (status != xnn_status_success) { + 
std::cerr << "failed to create tensor w78" << std::endl; + return nullptr; + } + + alignas(16) static std::array w79_data; + uint32_t w79 = XNN_INVALID_VALUE_ID; + std::array w79_dims = {{1024}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w79_dims.size(), w79_dims.data(), + /*data=*/w79_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w79); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w79" << std::endl; + return nullptr; + } + + alignas(16) static std::array w80_data; + uint32_t w80 = XNN_INVALID_VALUE_ID; + std::array w80_dims = {{1, 3, 3, 1024}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w80_dims.size(), w80_dims.data(), + /*data=*/w80_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w80); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w80" << std::endl; + return nullptr; + } + + alignas(16) static std::array w81_data; + uint32_t w81 = XNN_INVALID_VALUE_ID; + std::array w81_dims = {{1024}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w81_dims.size(), w81_dims.data(), + /*data=*/w81_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w81); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w81" << std::endl; + return nullptr; + } + + alignas(16) static std::array w82_data; + uint32_t w82 = XNN_INVALID_VALUE_ID; + std::array w82_dims = {{1024, 1, 1, 1024}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w82_dims.size(), w82_dims.data(), + /*data=*/w82_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w82); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w82" << std::endl; + return nullptr; + } + + alignas(16) static std::array w83_data; + uint32_t w83 = XNN_INVALID_VALUE_ID; + std::array w83_dims = {{1024}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w83_dims.size(), w83_dims.data(), + /*data=*/w83_data.data(), + 
XNN_INVALID_VALUE_ID, /*flags=*/0, &w83); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w83" << std::endl; + return nullptr; + } + + alignas(16) static std::array w84_data; + uint32_t w84 = XNN_INVALID_VALUE_ID; + std::array w84_dims = {{1001, 1, 1, 1024}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w84_dims.size(), w84_dims.data(), + /*data=*/w84_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w84); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w84" << std::endl; + return nullptr; + } + + alignas(16) static std::array w85_data; + uint32_t w85 = XNN_INVALID_VALUE_ID; + std::array w85_dims = {{1001}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w85_dims.size(), w85_dims.data(), + /*data=*/w85_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w85); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w85" << std::endl; + return nullptr; + } + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); + std::generate(w30_data.begin(), w30_data.end(), std::ref(f32rng)); + std::generate(w31_data.begin(), w31_data.end(), std::ref(f32rng)); + std::generate(w32_data.begin(), w32_data.end(), std::ref(f32rng)); + std::generate(w33_data.begin(), w33_data.end(), std::ref(f32rng)); + std::generate(w34_data.begin(), w34_data.end(), std::ref(f32rng)); + std::generate(w35_data.begin(), w35_data.end(), std::ref(f32rng)); + std::generate(w36_data.begin(), w36_data.end(), std::ref(f32rng)); + std::generate(w37_data.begin(), w37_data.end(), std::ref(f32rng)); + std::generate(w38_data.begin(), w38_data.end(), std::ref(f32rng)); + std::generate(w39_data.begin(), w39_data.end(), std::ref(f32rng)); + std::generate(w40_data.begin(), w40_data.end(), std::ref(f32rng)); + std::generate(w41_data.begin(), w41_data.end(), std::ref(f32rng)); + 
std::generate(w42_data.begin(), w42_data.end(), std::ref(f32rng)); + std::generate(w43_data.begin(), w43_data.end(), std::ref(f32rng)); + std::generate(w44_data.begin(), w44_data.end(), std::ref(f32rng)); + std::generate(w45_data.begin(), w45_data.end(), std::ref(f32rng)); + std::generate(w46_data.begin(), w46_data.end(), std::ref(f32rng)); + std::generate(w47_data.begin(), w47_data.end(), std::ref(f32rng)); + std::generate(w48_data.begin(), w48_data.end(), std::ref(f32rng)); + std::generate(w49_data.begin(), w49_data.end(), std::ref(f32rng)); + std::generate(w50_data.begin(), w50_data.end(), std::ref(f32rng)); + std::generate(w51_data.begin(), w51_data.end(), std::ref(f32rng)); + std::generate(w52_data.begin(), w52_data.end(), std::ref(f32rng)); + std::generate(w53_data.begin(), w53_data.end(), std::ref(f32rng)); + std::generate(w54_data.begin(), w54_data.end(), std::ref(f32rng)); + std::generate(w55_data.begin(), w55_data.end(), std::ref(f32rng)); + std::generate(w56_data.begin(), w56_data.end(), std::ref(f32rng)); + std::generate(w57_data.begin(), w57_data.end(), std::ref(f32rng)); + std::generate(w58_data.begin(), w58_data.end(), std::ref(f32rng)); + std::generate(w59_data.begin(), w59_data.end(), std::ref(f32rng)); + std::generate(w60_data.begin(), w60_data.end(), std::ref(f32rng)); + std::generate(w61_data.begin(), w61_data.end(), std::ref(f32rng)); + std::generate(w62_data.begin(), w62_data.end(), std::ref(f32rng)); + std::generate(w63_data.begin(), w63_data.end(), std::ref(f32rng)); + std::generate(w64_data.begin(), w64_data.end(), std::ref(f32rng)); + std::generate(w65_data.begin(), w65_data.end(), std::ref(f32rng)); + std::generate(w66_data.begin(), w66_data.end(), std::ref(f32rng)); + std::generate(w67_data.begin(), w67_data.end(), std::ref(f32rng)); + std::generate(w68_data.begin(), w68_data.end(), std::ref(f32rng)); + std::generate(w69_data.begin(), w69_data.end(), std::ref(f32rng)); + std::generate(w70_data.begin(), w70_data.end(), std::ref(f32rng)); 
+ std::generate(w71_data.begin(), w71_data.end(), std::ref(f32rng)); + std::generate(w72_data.begin(), w72_data.end(), std::ref(f32rng)); + std::generate(w73_data.begin(), w73_data.end(), std::ref(f32rng)); + std::generate(w74_data.begin(), w74_data.end(), std::ref(f32rng)); + std::generate(w75_data.begin(), w75_data.end(), std::ref(f32rng)); + std::generate(w76_data.begin(), w76_data.end(), std::ref(f32rng)); + std::generate(w77_data.begin(), w77_data.end(), std::ref(f32rng)); + std::generate(w78_data.begin(), w78_data.end(), std::ref(f32rng)); + std::generate(w79_data.begin(), w79_data.end(), std::ref(f32rng)); + std::generate(w80_data.begin(), w80_data.end(), std::ref(f32rng)); + std::generate(w81_data.begin(), w81_data.end(), std::ref(f32rng)); + std::generate(w82_data.begin(), w82_data.end(), std::ref(f32rng)); + std::generate(w83_data.begin(), w83_data.end(), std::ref(f32rng)); + std::generate(w84_data.begin(), w84_data.end(), std::ref(f32rng)); + std::generate(w85_data.begin(), w85_data.end(), std::ref(f32rng)); + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/3, + /*group_output_channels=*/32, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v0, + w30, + w31, + v1, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #0" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/32, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v1, + w32, + w33, + 
v2, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #1" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/32, + /*group_output_channels=*/64, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v2, + w34, + w35, + v3, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #2" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/64, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v3, + w36, + w37, + v4, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #3" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/64, + /*group_output_channels=*/128, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v4, + w38, + w39, + v5, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #4" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + 
/*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/128, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v5, + w40, + w41, + v6, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #5" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/128, + /*group_output_channels=*/128, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v6, + w42, + w43, + v7, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #6" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/128, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v7, + w44, + w45, + v8, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #7" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/128, + /*group_output_channels=*/256, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v8, + w46, + w47, + v9, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #8" << std::endl; + 
return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/256, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v9, + w48, + w49, + v10, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #9" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/256, + /*group_output_channels=*/256, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v10, + w50, + w51, + v11, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #10" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/256, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v11, + w52, + w53, + v12, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #11" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + 
/*group_input_channels=*/256, + /*group_output_channels=*/512, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v12, + w54, + w55, + v13, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #12" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/512, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v13, + w56, + w57, + v14, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #13" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/512, + /*group_output_channels=*/512, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v14, + w58, + w59, + v15, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #14" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/512, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v15, + w60, + w61, + v16, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #15" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, 
/*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/512, + /*group_output_channels=*/512, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v16, + w62, + w63, + v17, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #16" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/512, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v17, + w64, + w65, + v18, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #17" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/512, + /*group_output_channels=*/512, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v18, + w66, + w67, + v19, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #18" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/512, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v19, + 
w68, + w69, + v20, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #19" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/512, + /*group_output_channels=*/512, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v20, + w70, + w71, + v21, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #20" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/512, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v21, + w72, + w73, + v22, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #21" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/512, + /*group_output_channels=*/512, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v22, + w74, + w75, + v23, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #22" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, + /*kernel_height=*/3, 
/*kernel_width=*/3, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/512, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v23, + w76, + w77, + v24, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #23" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/512, + /*group_output_channels=*/1024, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v24, + w78, + w79, + v25, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #24" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/1024, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v25, + w80, + w81, + v26, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #25" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/1024, + /*group_output_channels=*/1024, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v26, + w82, + w83, + v27, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << 
"failed to create node #26" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/7, /*pooling_width=*/7, + /*stride_height=*/2, /*stride_width=*/2, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v27, + v28, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #27" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/1024, + /*group_output_channels=*/1001, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v28, + w84, + w85, + v29, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #28" << std::endl; + return nullptr; + } + + return subgraph; +} + +} // namespace models diff --git a/bench/models/fp32-mobilenet-v2.cc b/bench/models/fp32-mobilenet-v2.cc new file mode 100644 index 00000000000..3bb746d94b0 --- /dev/null +++ b/bench/models/fp32-mobilenet-v2.cc @@ -0,0 +1,3537 @@ +// Copyright 2020 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +// +// Auto-generated file. Do not edit! 
+ +#include "xnnpack.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xnnpack.h" + +// align a size up to XNN_EXTRA_BYTES +#define XNN_PAD_EXTRA_BYTES(s, t) (((s) + XNN_EXTRA_BYTES / sizeof(t) - 1) & ~(XNN_EXTRA_BYTES / sizeof(t) - 1)) + +namespace models { + +xnn_subgraph_t FP32MobileNetV2() { + xnn_status status; + xnn_subgraph_t subgraph = nullptr; + status = xnn_create_subgraph(/*num_external_values=*/2, 0, &subgraph); + if (status != xnn_status_success) { + std::cerr << "failed to create subgrpah" << std::endl; + return nullptr; + } + + uint32_t v0 = XNN_INVALID_VALUE_ID; + std::array v0_dims = {{1, 224, 224, 3}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v0_dims.size(), v0_dims.data(), + /*data=*/nullptr, + 1, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v0" << std::endl; + return nullptr; + } + + uint32_t v1 = XNN_INVALID_VALUE_ID; + std::array v1_dims = {{1, 112, 112, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v1_dims.size(), v1_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v1); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v1" << std::endl; + return nullptr; + } + + uint32_t v2 = XNN_INVALID_VALUE_ID; + std::array v2_dims = {{1, 112, 112, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v2_dims.size(), v2_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v2); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v2" << std::endl; + return nullptr; + } + + uint32_t v3 = XNN_INVALID_VALUE_ID; + std::array v3_dims = {{1, 112, 112, 16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v3_dims.size(), v3_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v3); + if (status != xnn_status_success) { + std::cerr << "failed 
to create tensor v3" << std::endl; + return nullptr; + } + + uint32_t v4 = XNN_INVALID_VALUE_ID; + std::array v4_dims = {{1, 112, 112, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v4_dims.size(), v4_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v4); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v4" << std::endl; + return nullptr; + } + + uint32_t v5 = XNN_INVALID_VALUE_ID; + std::array v5_dims = {{1, 56, 56, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v5_dims.size(), v5_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v5); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v5" << std::endl; + return nullptr; + } + + uint32_t v6 = XNN_INVALID_VALUE_ID; + std::array v6_dims = {{1, 56, 56, 24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v6_dims.size(), v6_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v6); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v6" << std::endl; + return nullptr; + } + + uint32_t v7 = XNN_INVALID_VALUE_ID; + std::array v7_dims = {{1, 56, 56, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v7_dims.size(), v7_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v7); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v7" << std::endl; + return nullptr; + } + + uint32_t v8 = XNN_INVALID_VALUE_ID; + std::array v8_dims = {{1, 56, 56, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v8_dims.size(), v8_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v8); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v8" << std::endl; + return nullptr; + } + + uint32_t v9 = XNN_INVALID_VALUE_ID; + std::array v9_dims = {{1, 56, 56, 24}}; + status = xnn_define_tensor_value( 
+ subgraph, xnn_datatype_fp32, + v9_dims.size(), v9_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v9); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v9" << std::endl; + return nullptr; + } + + uint32_t v10 = XNN_INVALID_VALUE_ID; + std::array v10_dims = {{1, 56, 56, 24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v10_dims.size(), v10_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v10); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v10" << std::endl; + return nullptr; + } + + uint32_t v11 = XNN_INVALID_VALUE_ID; + std::array v11_dims = {{1, 56, 56, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v11_dims.size(), v11_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v11); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v11" << std::endl; + return nullptr; + } + + uint32_t v12 = XNN_INVALID_VALUE_ID; + std::array v12_dims = {{1, 28, 28, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v12_dims.size(), v12_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v12); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v12" << std::endl; + return nullptr; + } + + uint32_t v13 = XNN_INVALID_VALUE_ID; + std::array v13_dims = {{1, 28, 28, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v13_dims.size(), v13_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v13); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v13" << std::endl; + return nullptr; + } + + uint32_t v14 = XNN_INVALID_VALUE_ID; + std::array v14_dims = {{1, 28, 28, 192}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v14_dims.size(), v14_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v14); + if (status != 
xnn_status_success) { + std::cerr << "failed to create tensor v14" << std::endl; + return nullptr; + } + + uint32_t v15 = XNN_INVALID_VALUE_ID; + std::array v15_dims = {{1, 28, 28, 192}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v15_dims.size(), v15_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v15); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v15" << std::endl; + return nullptr; + } + + uint32_t v16 = XNN_INVALID_VALUE_ID; + std::array v16_dims = {{1, 28, 28, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v16_dims.size(), v16_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v16); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v16" << std::endl; + return nullptr; + } + + uint32_t v17 = XNN_INVALID_VALUE_ID; + std::array v17_dims = {{1, 28, 28, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v17_dims.size(), v17_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v17); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v17" << std::endl; + return nullptr; + } + + uint32_t v18 = XNN_INVALID_VALUE_ID; + std::array v18_dims = {{1, 28, 28, 192}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v18_dims.size(), v18_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v18); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v18" << std::endl; + return nullptr; + } + + uint32_t v19 = XNN_INVALID_VALUE_ID; + std::array v19_dims = {{1, 28, 28, 192}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v19_dims.size(), v19_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v19); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v19" << std::endl; + return nullptr; + } + + uint32_t v20 = XNN_INVALID_VALUE_ID; + 
std::array v20_dims = {{1, 28, 28, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v20_dims.size(), v20_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v20); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v20" << std::endl; + return nullptr; + } + + uint32_t v21 = XNN_INVALID_VALUE_ID; + std::array v21_dims = {{1, 28, 28, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v21_dims.size(), v21_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v21); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v21" << std::endl; + return nullptr; + } + + uint32_t v22 = XNN_INVALID_VALUE_ID; + std::array v22_dims = {{1, 28, 28, 192}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v22_dims.size(), v22_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v22); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v22" << std::endl; + return nullptr; + } + + uint32_t v23 = XNN_INVALID_VALUE_ID; + std::array v23_dims = {{1, 14, 14, 192}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v23_dims.size(), v23_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v23); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v23" << std::endl; + return nullptr; + } + + uint32_t v24 = XNN_INVALID_VALUE_ID; + std::array v24_dims = {{1, 14, 14, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v24_dims.size(), v24_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v24); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v24" << std::endl; + return nullptr; + } + + uint32_t v25 = XNN_INVALID_VALUE_ID; + std::array v25_dims = {{1, 14, 14, 384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v25_dims.size(), v25_dims.data(), + 
/*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v25); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v25" << std::endl; + return nullptr; + } + + uint32_t v26 = XNN_INVALID_VALUE_ID; + std::array v26_dims = {{1, 14, 14, 384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v26_dims.size(), v26_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v26); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v26" << std::endl; + return nullptr; + } + + uint32_t v27 = XNN_INVALID_VALUE_ID; + std::array v27_dims = {{1, 14, 14, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v27_dims.size(), v27_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v27); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v27" << std::endl; + return nullptr; + } + + uint32_t v28 = XNN_INVALID_VALUE_ID; + std::array v28_dims = {{1, 14, 14, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v28_dims.size(), v28_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v28); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v28" << std::endl; + return nullptr; + } + + uint32_t v29 = XNN_INVALID_VALUE_ID; + std::array v29_dims = {{1, 14, 14, 384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v29_dims.size(), v29_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v29); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v29" << std::endl; + return nullptr; + } + + uint32_t v30 = XNN_INVALID_VALUE_ID; + std::array v30_dims = {{1, 14, 14, 384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v30_dims.size(), v30_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v30); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v30" 
<< std::endl; + return nullptr; + } + + uint32_t v31 = XNN_INVALID_VALUE_ID; + std::array v31_dims = {{1, 14, 14, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v31_dims.size(), v31_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v31); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v31" << std::endl; + return nullptr; + } + + uint32_t v32 = XNN_INVALID_VALUE_ID; + std::array v32_dims = {{1, 14, 14, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v32_dims.size(), v32_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v32); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v32" << std::endl; + return nullptr; + } + + uint32_t v33 = XNN_INVALID_VALUE_ID; + std::array v33_dims = {{1, 14, 14, 384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v33_dims.size(), v33_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v33); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v33" << std::endl; + return nullptr; + } + + uint32_t v34 = XNN_INVALID_VALUE_ID; + std::array v34_dims = {{1, 14, 14, 384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v34_dims.size(), v34_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v34); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v34" << std::endl; + return nullptr; + } + + uint32_t v35 = XNN_INVALID_VALUE_ID; + std::array v35_dims = {{1, 14, 14, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v35_dims.size(), v35_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v35); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v35" << std::endl; + return nullptr; + } + + uint32_t v36 = XNN_INVALID_VALUE_ID; + std::array v36_dims = {{1, 14, 14, 64}}; + status = 
xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v36_dims.size(), v36_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v36); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v36" << std::endl; + return nullptr; + } + + uint32_t v37 = XNN_INVALID_VALUE_ID; + std::array v37_dims = {{1, 14, 14, 384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v37_dims.size(), v37_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v37); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v37" << std::endl; + return nullptr; + } + + uint32_t v38 = XNN_INVALID_VALUE_ID; + std::array v38_dims = {{1, 14, 14, 384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v38_dims.size(), v38_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v38); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v38" << std::endl; + return nullptr; + } + + uint32_t v39 = XNN_INVALID_VALUE_ID; + std::array v39_dims = {{1, 14, 14, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v39_dims.size(), v39_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v39); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v39" << std::endl; + return nullptr; + } + + uint32_t v40 = XNN_INVALID_VALUE_ID; + std::array v40_dims = {{1, 14, 14, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v40_dims.size(), v40_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v40); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v40" << std::endl; + return nullptr; + } + + uint32_t v41 = XNN_INVALID_VALUE_ID; + std::array v41_dims = {{1, 14, 14, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v41_dims.size(), v41_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, 
/*flags=*/0, &v41); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v41" << std::endl; + return nullptr; + } + + uint32_t v42 = XNN_INVALID_VALUE_ID; + std::array v42_dims = {{1, 14, 14, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v42_dims.size(), v42_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v42); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v42" << std::endl; + return nullptr; + } + + uint32_t v43 = XNN_INVALID_VALUE_ID; + std::array v43_dims = {{1, 14, 14, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v43_dims.size(), v43_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v43); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v43" << std::endl; + return nullptr; + } + + uint32_t v44 = XNN_INVALID_VALUE_ID; + std::array v44_dims = {{1, 14, 14, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v44_dims.size(), v44_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v44); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v44" << std::endl; + return nullptr; + } + + uint32_t v45 = XNN_INVALID_VALUE_ID; + std::array v45_dims = {{1, 14, 14, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v45_dims.size(), v45_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v45); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v45" << std::endl; + return nullptr; + } + + uint32_t v46 = XNN_INVALID_VALUE_ID; + std::array v46_dims = {{1, 14, 14, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v46_dims.size(), v46_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v46); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v46" << std::endl; + return nullptr; + } + + 
uint32_t v47 = XNN_INVALID_VALUE_ID; + std::array v47_dims = {{1, 14, 14, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v47_dims.size(), v47_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v47); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v47" << std::endl; + return nullptr; + } + + uint32_t v48 = XNN_INVALID_VALUE_ID; + std::array v48_dims = {{1, 14, 14, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v48_dims.size(), v48_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v48); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v48" << std::endl; + return nullptr; + } + + uint32_t v49 = XNN_INVALID_VALUE_ID; + std::array v49_dims = {{1, 7, 7, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v49_dims.size(), v49_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v49); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v49" << std::endl; + return nullptr; + } + + uint32_t v50 = XNN_INVALID_VALUE_ID; + std::array v50_dims = {{1, 7, 7, 160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v50_dims.size(), v50_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v50); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v50" << std::endl; + return nullptr; + } + + uint32_t v51 = XNN_INVALID_VALUE_ID; + std::array v51_dims = {{1, 7, 7, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v51_dims.size(), v51_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v51); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v51" << std::endl; + return nullptr; + } + + uint32_t v52 = XNN_INVALID_VALUE_ID; + std::array v52_dims = {{1, 7, 7, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + 
v52_dims.size(), v52_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v52); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v52" << std::endl; + return nullptr; + } + + uint32_t v53 = XNN_INVALID_VALUE_ID; + std::array v53_dims = {{1, 7, 7, 160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v53_dims.size(), v53_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v53); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v53" << std::endl; + return nullptr; + } + + uint32_t v54 = XNN_INVALID_VALUE_ID; + std::array v54_dims = {{1, 7, 7, 160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v54_dims.size(), v54_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v54); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v54" << std::endl; + return nullptr; + } + + uint32_t v55 = XNN_INVALID_VALUE_ID; + std::array v55_dims = {{1, 7, 7, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v55_dims.size(), v55_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v55); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v55" << std::endl; + return nullptr; + } + + uint32_t v56 = XNN_INVALID_VALUE_ID; + std::array v56_dims = {{1, 7, 7, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v56_dims.size(), v56_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v56); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v56" << std::endl; + return nullptr; + } + + uint32_t v57 = XNN_INVALID_VALUE_ID; + std::array v57_dims = {{1, 7, 7, 160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v57_dims.size(), v57_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v57); + if (status != xnn_status_success) { + std::cerr << 
"failed to create tensor v57" << std::endl; + return nullptr; + } + + uint32_t v58 = XNN_INVALID_VALUE_ID; + std::array v58_dims = {{1, 7, 7, 160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v58_dims.size(), v58_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v58); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v58" << std::endl; + return nullptr; + } + + uint32_t v59 = XNN_INVALID_VALUE_ID; + std::array v59_dims = {{1, 7, 7, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v59_dims.size(), v59_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v59); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v59" << std::endl; + return nullptr; + } + + uint32_t v60 = XNN_INVALID_VALUE_ID; + std::array v60_dims = {{1, 7, 7, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v60_dims.size(), v60_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v60); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v60" << std::endl; + return nullptr; + } + + uint32_t v61 = XNN_INVALID_VALUE_ID; + std::array v61_dims = {{1, 7, 7, 320}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v61_dims.size(), v61_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v61); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v61" << std::endl; + return nullptr; + } + + uint32_t v62 = XNN_INVALID_VALUE_ID; + std::array v62_dims = {{1, 7, 7, 1280}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v62_dims.size(), v62_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v62); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v62" << std::endl; + return nullptr; + } + + uint32_t v63 = XNN_INVALID_VALUE_ID; + std::array v63_dims = {{1, 1, 1, 1280}}; + 
status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v63_dims.size(), v63_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v63); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v63" << std::endl; + return nullptr; + } + + uint32_t v64 = XNN_INVALID_VALUE_ID; + std::array v64_dims = {{1, 1, 1, 1001}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v64_dims.size(), v64_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v64); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v64" << std::endl; + return nullptr; + } + + uint32_t v65 = XNN_INVALID_VALUE_ID; + std::array v65_dims = {{1, 1001}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v65_dims.size(), v65_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v65); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v65" << std::endl; + return nullptr; + } + + uint32_t v66 = XNN_INVALID_VALUE_ID; + std::array v66_dims = {{1, 1001}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v66_dims.size(), v66_dims.data(), + /*data=*/nullptr, + 0, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v66); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v66" << std::endl; + return nullptr; + } + + alignas(16) static std::array w67_data; + uint32_t w67 = XNN_INVALID_VALUE_ID; + std::array w67_dims = {{32, 3, 3, 3}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w67_dims.size(), w67_dims.data(), + /*data=*/w67_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w67); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w67" << std::endl; + return nullptr; + } + + alignas(16) static std::array w68_data; + uint32_t w68 = XNN_INVALID_VALUE_ID; + std::array w68_dims = {{32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + 
w68_dims.size(), w68_dims.data(), + /*data=*/w68_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w68); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w68" << std::endl; + return nullptr; + } + + alignas(16) static std::array w69_data; + uint32_t w69 = XNN_INVALID_VALUE_ID; + std::array w69_dims = {{1, 3, 3, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w69_dims.size(), w69_dims.data(), + /*data=*/w69_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w69); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w69" << std::endl; + return nullptr; + } + + alignas(16) static std::array w70_data; + uint32_t w70 = XNN_INVALID_VALUE_ID; + std::array w70_dims = {{32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w70_dims.size(), w70_dims.data(), + /*data=*/w70_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w70); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w70" << std::endl; + return nullptr; + } + + alignas(16) static std::array w71_data; + uint32_t w71 = XNN_INVALID_VALUE_ID; + std::array w71_dims = {{16, 1, 1, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w71_dims.size(), w71_dims.data(), + /*data=*/w71_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w71); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w71" << std::endl; + return nullptr; + } + + alignas(16) static std::array w72_data; + uint32_t w72 = XNN_INVALID_VALUE_ID; + std::array w72_dims = {{16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w72_dims.size(), w72_dims.data(), + /*data=*/w72_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w72); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w72" << std::endl; + return nullptr; + } + + alignas(16) static std::array w73_data; + uint32_t w73 = XNN_INVALID_VALUE_ID; + std::array w73_dims = {{96, 1, 1, 
16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w73_dims.size(), w73_dims.data(), + /*data=*/w73_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w73); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w73" << std::endl; + return nullptr; + } + + alignas(16) static std::array w74_data; + uint32_t w74 = XNN_INVALID_VALUE_ID; + std::array w74_dims = {{96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w74_dims.size(), w74_dims.data(), + /*data=*/w74_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w74); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w74" << std::endl; + return nullptr; + } + + alignas(16) static std::array w75_data; + uint32_t w75 = XNN_INVALID_VALUE_ID; + std::array w75_dims = {{1, 3, 3, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w75_dims.size(), w75_dims.data(), + /*data=*/w75_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w75); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w75" << std::endl; + return nullptr; + } + + alignas(16) static std::array w76_data; + uint32_t w76 = XNN_INVALID_VALUE_ID; + std::array w76_dims = {{96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w76_dims.size(), w76_dims.data(), + /*data=*/w76_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w76); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w76" << std::endl; + return nullptr; + } + + alignas(16) static std::array w77_data; + uint32_t w77 = XNN_INVALID_VALUE_ID; + std::array w77_dims = {{24, 1, 1, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w77_dims.size(), w77_dims.data(), + /*data=*/w77_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w77); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w77" << std::endl; + return nullptr; + } + + alignas(16) static std::array w78_data; 
+ uint32_t w78 = XNN_INVALID_VALUE_ID; + std::array w78_dims = {{24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w78_dims.size(), w78_dims.data(), + /*data=*/w78_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w78); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w78" << std::endl; + return nullptr; + } + + alignas(16) static std::array w79_data; + uint32_t w79 = XNN_INVALID_VALUE_ID; + std::array w79_dims = {{144, 1, 1, 24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w79_dims.size(), w79_dims.data(), + /*data=*/w79_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w79); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w79" << std::endl; + return nullptr; + } + + alignas(16) static std::array w80_data; + uint32_t w80 = XNN_INVALID_VALUE_ID; + std::array w80_dims = {{144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w80_dims.size(), w80_dims.data(), + /*data=*/w80_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w80); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w80" << std::endl; + return nullptr; + } + + alignas(16) static std::array w81_data; + uint32_t w81 = XNN_INVALID_VALUE_ID; + std::array w81_dims = {{1, 3, 3, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w81_dims.size(), w81_dims.data(), + /*data=*/w81_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w81); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w81" << std::endl; + return nullptr; + } + + alignas(16) static std::array w82_data; + uint32_t w82 = XNN_INVALID_VALUE_ID; + std::array w82_dims = {{144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w82_dims.size(), w82_dims.data(), + /*data=*/w82_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w82); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w82" << 
std::endl; + return nullptr; + } + + alignas(16) static std::array w83_data; + uint32_t w83 = XNN_INVALID_VALUE_ID; + std::array w83_dims = {{24, 1, 1, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w83_dims.size(), w83_dims.data(), + /*data=*/w83_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w83); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w83" << std::endl; + return nullptr; + } + + alignas(16) static std::array w84_data; + uint32_t w84 = XNN_INVALID_VALUE_ID; + std::array w84_dims = {{24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w84_dims.size(), w84_dims.data(), + /*data=*/w84_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w84); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w84" << std::endl; + return nullptr; + } + + alignas(16) static std::array w85_data; + uint32_t w85 = XNN_INVALID_VALUE_ID; + std::array w85_dims = {{144, 1, 1, 24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w85_dims.size(), w85_dims.data(), + /*data=*/w85_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w85); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w85" << std::endl; + return nullptr; + } + + alignas(16) static std::array w86_data; + uint32_t w86 = XNN_INVALID_VALUE_ID; + std::array w86_dims = {{144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w86_dims.size(), w86_dims.data(), + /*data=*/w86_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w86); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w86" << std::endl; + return nullptr; + } + + alignas(16) static std::array w87_data; + uint32_t w87 = XNN_INVALID_VALUE_ID; + std::array w87_dims = {{1, 3, 3, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w87_dims.size(), w87_dims.data(), + /*data=*/w87_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w87); + if 
(status != xnn_status_success) { + std::cerr << "failed to create tensor w87" << std::endl; + return nullptr; + } + + alignas(16) static std::array w88_data; + uint32_t w88 = XNN_INVALID_VALUE_ID; + std::array w88_dims = {{144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w88_dims.size(), w88_dims.data(), + /*data=*/w88_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w88); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w88" << std::endl; + return nullptr; + } + + alignas(16) static std::array w89_data; + uint32_t w89 = XNN_INVALID_VALUE_ID; + std::array w89_dims = {{32, 1, 1, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w89_dims.size(), w89_dims.data(), + /*data=*/w89_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w89); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w89" << std::endl; + return nullptr; + } + + alignas(16) static std::array w90_data; + uint32_t w90 = XNN_INVALID_VALUE_ID; + std::array w90_dims = {{32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w90_dims.size(), w90_dims.data(), + /*data=*/w90_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w90); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w90" << std::endl; + return nullptr; + } + + alignas(16) static std::array w91_data; + uint32_t w91 = XNN_INVALID_VALUE_ID; + std::array w91_dims = {{192, 1, 1, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w91_dims.size(), w91_dims.data(), + /*data=*/w91_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w91); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w91" << std::endl; + return nullptr; + } + + alignas(16) static std::array w92_data; + uint32_t w92 = XNN_INVALID_VALUE_ID; + std::array w92_dims = {{192}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w92_dims.size(), w92_dims.data(), + 
/*data=*/w92_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w92); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w92" << std::endl; + return nullptr; + } + + alignas(16) static std::array w93_data; + uint32_t w93 = XNN_INVALID_VALUE_ID; + std::array w93_dims = {{1, 3, 3, 192}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w93_dims.size(), w93_dims.data(), + /*data=*/w93_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w93); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w93" << std::endl; + return nullptr; + } + + alignas(16) static std::array w94_data; + uint32_t w94 = XNN_INVALID_VALUE_ID; + std::array w94_dims = {{192}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w94_dims.size(), w94_dims.data(), + /*data=*/w94_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w94); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w94" << std::endl; + return nullptr; + } + + alignas(16) static std::array w95_data; + uint32_t w95 = XNN_INVALID_VALUE_ID; + std::array w95_dims = {{32, 1, 1, 192}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w95_dims.size(), w95_dims.data(), + /*data=*/w95_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w95); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w95" << std::endl; + return nullptr; + } + + alignas(16) static std::array w96_data; + uint32_t w96 = XNN_INVALID_VALUE_ID; + std::array w96_dims = {{32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w96_dims.size(), w96_dims.data(), + /*data=*/w96_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w96); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w96" << std::endl; + return nullptr; + } + + alignas(16) static std::array w97_data; + uint32_t w97 = XNN_INVALID_VALUE_ID; + std::array w97_dims = {{192, 1, 1, 32}}; + status = 
xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w97_dims.size(), w97_dims.data(), + /*data=*/w97_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w97); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w97" << std::endl; + return nullptr; + } + + alignas(16) static std::array w98_data; + uint32_t w98 = XNN_INVALID_VALUE_ID; + std::array w98_dims = {{192}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w98_dims.size(), w98_dims.data(), + /*data=*/w98_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w98); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w98" << std::endl; + return nullptr; + } + + alignas(16) static std::array w99_data; + uint32_t w99 = XNN_INVALID_VALUE_ID; + std::array w99_dims = {{1, 3, 3, 192}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w99_dims.size(), w99_dims.data(), + /*data=*/w99_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w99); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w99" << std::endl; + return nullptr; + } + + alignas(16) static std::array w100_data; + uint32_t w100 = XNN_INVALID_VALUE_ID; + std::array w100_dims = {{192}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w100_dims.size(), w100_dims.data(), + /*data=*/w100_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w100); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w100" << std::endl; + return nullptr; + } + + alignas(16) static std::array w101_data; + uint32_t w101 = XNN_INVALID_VALUE_ID; + std::array w101_dims = {{32, 1, 1, 192}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w101_dims.size(), w101_dims.data(), + /*data=*/w101_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w101); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w101" << std::endl; + return nullptr; + } + + alignas(16) static std::array 
w102_data; + uint32_t w102 = XNN_INVALID_VALUE_ID; + std::array w102_dims = {{32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w102_dims.size(), w102_dims.data(), + /*data=*/w102_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w102); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w102" << std::endl; + return nullptr; + } + + alignas(16) static std::array w103_data; + uint32_t w103 = XNN_INVALID_VALUE_ID; + std::array w103_dims = {{192, 1, 1, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w103_dims.size(), w103_dims.data(), + /*data=*/w103_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w103); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w103" << std::endl; + return nullptr; + } + + alignas(16) static std::array w104_data; + uint32_t w104 = XNN_INVALID_VALUE_ID; + std::array w104_dims = {{192}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w104_dims.size(), w104_dims.data(), + /*data=*/w104_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w104); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w104" << std::endl; + return nullptr; + } + + alignas(16) static std::array w105_data; + uint32_t w105 = XNN_INVALID_VALUE_ID; + std::array w105_dims = {{1, 3, 3, 192}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w105_dims.size(), w105_dims.data(), + /*data=*/w105_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w105); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w105" << std::endl; + return nullptr; + } + + alignas(16) static std::array w106_data; + uint32_t w106 = XNN_INVALID_VALUE_ID; + std::array w106_dims = {{192}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w106_dims.size(), w106_dims.data(), + /*data=*/w106_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w106); + if (status != xnn_status_success) { + 
std::cerr << "failed to create tensor w106" << std::endl; + return nullptr; + } + + alignas(16) static std::array w107_data; + uint32_t w107 = XNN_INVALID_VALUE_ID; + std::array w107_dims = {{64, 1, 1, 192}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w107_dims.size(), w107_dims.data(), + /*data=*/w107_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w107); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w107" << std::endl; + return nullptr; + } + + alignas(16) static std::array w108_data; + uint32_t w108 = XNN_INVALID_VALUE_ID; + std::array w108_dims = {{64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w108_dims.size(), w108_dims.data(), + /*data=*/w108_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w108); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w108" << std::endl; + return nullptr; + } + + alignas(16) static std::array w109_data; + uint32_t w109 = XNN_INVALID_VALUE_ID; + std::array w109_dims = {{384, 1, 1, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w109_dims.size(), w109_dims.data(), + /*data=*/w109_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w109); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w109" << std::endl; + return nullptr; + } + + alignas(16) static std::array w110_data; + uint32_t w110 = XNN_INVALID_VALUE_ID; + std::array w110_dims = {{384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w110_dims.size(), w110_dims.data(), + /*data=*/w110_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w110); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w110" << std::endl; + return nullptr; + } + + alignas(16) static std::array w111_data; + uint32_t w111 = XNN_INVALID_VALUE_ID; + std::array w111_dims = {{1, 3, 3, 384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w111_dims.size(), 
w111_dims.data(), + /*data=*/w111_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w111); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w111" << std::endl; + return nullptr; + } + + alignas(16) static std::array w112_data; + uint32_t w112 = XNN_INVALID_VALUE_ID; + std::array w112_dims = {{384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w112_dims.size(), w112_dims.data(), + /*data=*/w112_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w112); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w112" << std::endl; + return nullptr; + } + + alignas(16) static std::array w113_data; + uint32_t w113 = XNN_INVALID_VALUE_ID; + std::array w113_dims = {{64, 1, 1, 384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w113_dims.size(), w113_dims.data(), + /*data=*/w113_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w113); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w113" << std::endl; + return nullptr; + } + + alignas(16) static std::array w114_data; + uint32_t w114 = XNN_INVALID_VALUE_ID; + std::array w114_dims = {{64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w114_dims.size(), w114_dims.data(), + /*data=*/w114_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w114); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w114" << std::endl; + return nullptr; + } + + alignas(16) static std::array w115_data; + uint32_t w115 = XNN_INVALID_VALUE_ID; + std::array w115_dims = {{384, 1, 1, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w115_dims.size(), w115_dims.data(), + /*data=*/w115_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w115); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w115" << std::endl; + return nullptr; + } + + alignas(16) static std::array w116_data; + uint32_t w116 = XNN_INVALID_VALUE_ID; + 
std::array w116_dims = {{384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w116_dims.size(), w116_dims.data(), + /*data=*/w116_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w116); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w116" << std::endl; + return nullptr; + } + + alignas(16) static std::array w117_data; + uint32_t w117 = XNN_INVALID_VALUE_ID; + std::array w117_dims = {{1, 3, 3, 384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w117_dims.size(), w117_dims.data(), + /*data=*/w117_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w117); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w117" << std::endl; + return nullptr; + } + + alignas(16) static std::array w118_data; + uint32_t w118 = XNN_INVALID_VALUE_ID; + std::array w118_dims = {{384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w118_dims.size(), w118_dims.data(), + /*data=*/w118_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w118); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w118" << std::endl; + return nullptr; + } + + alignas(16) static std::array w119_data; + uint32_t w119 = XNN_INVALID_VALUE_ID; + std::array w119_dims = {{64, 1, 1, 384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w119_dims.size(), w119_dims.data(), + /*data=*/w119_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w119); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w119" << std::endl; + return nullptr; + } + + alignas(16) static std::array w120_data; + uint32_t w120 = XNN_INVALID_VALUE_ID; + std::array w120_dims = {{64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w120_dims.size(), w120_dims.data(), + /*data=*/w120_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w120); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w120" << std::endl; 
+ return nullptr; + } + + alignas(16) static std::array w121_data; + uint32_t w121 = XNN_INVALID_VALUE_ID; + std::array w121_dims = {{384, 1, 1, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w121_dims.size(), w121_dims.data(), + /*data=*/w121_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w121); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w121" << std::endl; + return nullptr; + } + + alignas(16) static std::array w122_data; + uint32_t w122 = XNN_INVALID_VALUE_ID; + std::array w122_dims = {{384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w122_dims.size(), w122_dims.data(), + /*data=*/w122_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w122); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w122" << std::endl; + return nullptr; + } + + alignas(16) static std::array w123_data; + uint32_t w123 = XNN_INVALID_VALUE_ID; + std::array w123_dims = {{1, 3, 3, 384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w123_dims.size(), w123_dims.data(), + /*data=*/w123_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w123); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w123" << std::endl; + return nullptr; + } + + alignas(16) static std::array w124_data; + uint32_t w124 = XNN_INVALID_VALUE_ID; + std::array w124_dims = {{384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w124_dims.size(), w124_dims.data(), + /*data=*/w124_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w124); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w124" << std::endl; + return nullptr; + } + + alignas(16) static std::array w125_data; + uint32_t w125 = XNN_INVALID_VALUE_ID; + std::array w125_dims = {{64, 1, 1, 384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w125_dims.size(), w125_dims.data(), + /*data=*/w125_data.data(), + XNN_INVALID_VALUE_ID, 
/*flags=*/0, &w125); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w125" << std::endl; + return nullptr; + } + + alignas(16) static std::array w126_data; + uint32_t w126 = XNN_INVALID_VALUE_ID; + std::array w126_dims = {{64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w126_dims.size(), w126_dims.data(), + /*data=*/w126_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w126); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w126" << std::endl; + return nullptr; + } + + alignas(16) static std::array w127_data; + uint32_t w127 = XNN_INVALID_VALUE_ID; + std::array w127_dims = {{384, 1, 1, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w127_dims.size(), w127_dims.data(), + /*data=*/w127_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w127); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w127" << std::endl; + return nullptr; + } + + alignas(16) static std::array w128_data; + uint32_t w128 = XNN_INVALID_VALUE_ID; + std::array w128_dims = {{384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w128_dims.size(), w128_dims.data(), + /*data=*/w128_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w128); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w128" << std::endl; + return nullptr; + } + + alignas(16) static std::array w129_data; + uint32_t w129 = XNN_INVALID_VALUE_ID; + std::array w129_dims = {{1, 3, 3, 384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w129_dims.size(), w129_dims.data(), + /*data=*/w129_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w129); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w129" << std::endl; + return nullptr; + } + + alignas(16) static std::array w130_data; + uint32_t w130 = XNN_INVALID_VALUE_ID; + std::array w130_dims = {{384}}; + status = xnn_define_tensor_value( + subgraph, 
xnn_datatype_fp32, + w130_dims.size(), w130_dims.data(), + /*data=*/w130_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w130); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w130" << std::endl; + return nullptr; + } + + alignas(16) static std::array w131_data; + uint32_t w131 = XNN_INVALID_VALUE_ID; + std::array w131_dims = {{96, 1, 1, 384}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w131_dims.size(), w131_dims.data(), + /*data=*/w131_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w131); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w131" << std::endl; + return nullptr; + } + + alignas(16) static std::array w132_data; + uint32_t w132 = XNN_INVALID_VALUE_ID; + std::array w132_dims = {{96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w132_dims.size(), w132_dims.data(), + /*data=*/w132_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w132); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w132" << std::endl; + return nullptr; + } + + alignas(16) static std::array w133_data; + uint32_t w133 = XNN_INVALID_VALUE_ID; + std::array w133_dims = {{576, 1, 1, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w133_dims.size(), w133_dims.data(), + /*data=*/w133_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w133); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w133" << std::endl; + return nullptr; + } + + alignas(16) static std::array w134_data; + uint32_t w134 = XNN_INVALID_VALUE_ID; + std::array w134_dims = {{576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w134_dims.size(), w134_dims.data(), + /*data=*/w134_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w134); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w134" << std::endl; + return nullptr; + } + + alignas(16) static std::array w135_data; + uint32_t 
w135 = XNN_INVALID_VALUE_ID; + std::array w135_dims = {{1, 3, 3, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w135_dims.size(), w135_dims.data(), + /*data=*/w135_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w135); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w135" << std::endl; + return nullptr; + } + + alignas(16) static std::array w136_data; + uint32_t w136 = XNN_INVALID_VALUE_ID; + std::array w136_dims = {{576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w136_dims.size(), w136_dims.data(), + /*data=*/w136_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w136); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w136" << std::endl; + return nullptr; + } + + alignas(16) static std::array w137_data; + uint32_t w137 = XNN_INVALID_VALUE_ID; + std::array w137_dims = {{96, 1, 1, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w137_dims.size(), w137_dims.data(), + /*data=*/w137_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w137); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w137" << std::endl; + return nullptr; + } + + alignas(16) static std::array w138_data; + uint32_t w138 = XNN_INVALID_VALUE_ID; + std::array w138_dims = {{96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w138_dims.size(), w138_dims.data(), + /*data=*/w138_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w138); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w138" << std::endl; + return nullptr; + } + + alignas(16) static std::array w139_data; + uint32_t w139 = XNN_INVALID_VALUE_ID; + std::array w139_dims = {{576, 1, 1, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w139_dims.size(), w139_dims.data(), + /*data=*/w139_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w139); + if (status != xnn_status_success) { + std::cerr << 
"failed to create tensor w139" << std::endl; + return nullptr; + } + + alignas(16) static std::array w140_data; + uint32_t w140 = XNN_INVALID_VALUE_ID; + std::array w140_dims = {{576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w140_dims.size(), w140_dims.data(), + /*data=*/w140_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w140); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w140" << std::endl; + return nullptr; + } + + alignas(16) static std::array w141_data; + uint32_t w141 = XNN_INVALID_VALUE_ID; + std::array w141_dims = {{1, 3, 3, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w141_dims.size(), w141_dims.data(), + /*data=*/w141_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w141); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w141" << std::endl; + return nullptr; + } + + alignas(16) static std::array w142_data; + uint32_t w142 = XNN_INVALID_VALUE_ID; + std::array w142_dims = {{576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w142_dims.size(), w142_dims.data(), + /*data=*/w142_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w142); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w142" << std::endl; + return nullptr; + } + + alignas(16) static std::array w143_data; + uint32_t w143 = XNN_INVALID_VALUE_ID; + std::array w143_dims = {{96, 1, 1, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w143_dims.size(), w143_dims.data(), + /*data=*/w143_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w143); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w143" << std::endl; + return nullptr; + } + + alignas(16) static std::array w144_data; + uint32_t w144 = XNN_INVALID_VALUE_ID; + std::array w144_dims = {{96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w144_dims.size(), w144_dims.data(), + 
/*data=*/w144_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w144); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w144" << std::endl; + return nullptr; + } + + alignas(16) static std::array w145_data; + uint32_t w145 = XNN_INVALID_VALUE_ID; + std::array w145_dims = {{576, 1, 1, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w145_dims.size(), w145_dims.data(), + /*data=*/w145_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w145); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w145" << std::endl; + return nullptr; + } + + alignas(16) static std::array w146_data; + uint32_t w146 = XNN_INVALID_VALUE_ID; + std::array w146_dims = {{576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w146_dims.size(), w146_dims.data(), + /*data=*/w146_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w146); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w146" << std::endl; + return nullptr; + } + + alignas(16) static std::array w147_data; + uint32_t w147 = XNN_INVALID_VALUE_ID; + std::array w147_dims = {{1, 3, 3, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w147_dims.size(), w147_dims.data(), + /*data=*/w147_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w147); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w147" << std::endl; + return nullptr; + } + + alignas(16) static std::array w148_data; + uint32_t w148 = XNN_INVALID_VALUE_ID; + std::array w148_dims = {{576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w148_dims.size(), w148_dims.data(), + /*data=*/w148_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w148); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w148" << std::endl; + return nullptr; + } + + alignas(16) static std::array w149_data; + uint32_t w149 = XNN_INVALID_VALUE_ID; + std::array w149_dims = {{160, 
1, 1, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w149_dims.size(), w149_dims.data(), + /*data=*/w149_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w149); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w149" << std::endl; + return nullptr; + } + + alignas(16) static std::array w150_data; + uint32_t w150 = XNN_INVALID_VALUE_ID; + std::array w150_dims = {{160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w150_dims.size(), w150_dims.data(), + /*data=*/w150_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w150); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w150" << std::endl; + return nullptr; + } + + alignas(16) static std::array w151_data; + uint32_t w151 = XNN_INVALID_VALUE_ID; + std::array w151_dims = {{960, 1, 1, 160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w151_dims.size(), w151_dims.data(), + /*data=*/w151_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w151); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w151" << std::endl; + return nullptr; + } + + alignas(16) static std::array w152_data; + uint32_t w152 = XNN_INVALID_VALUE_ID; + std::array w152_dims = {{960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w152_dims.size(), w152_dims.data(), + /*data=*/w152_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w152); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w152" << std::endl; + return nullptr; + } + + alignas(16) static std::array w153_data; + uint32_t w153 = XNN_INVALID_VALUE_ID; + std::array w153_dims = {{1, 3, 3, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w153_dims.size(), w153_dims.data(), + /*data=*/w153_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w153); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w153" << std::endl; + return nullptr; 
+ } + + alignas(16) static std::array w154_data; + uint32_t w154 = XNN_INVALID_VALUE_ID; + std::array w154_dims = {{960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w154_dims.size(), w154_dims.data(), + /*data=*/w154_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w154); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w154" << std::endl; + return nullptr; + } + + alignas(16) static std::array w155_data; + uint32_t w155 = XNN_INVALID_VALUE_ID; + std::array w155_dims = {{160, 1, 1, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w155_dims.size(), w155_dims.data(), + /*data=*/w155_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w155); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w155" << std::endl; + return nullptr; + } + + alignas(16) static std::array w156_data; + uint32_t w156 = XNN_INVALID_VALUE_ID; + std::array w156_dims = {{160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w156_dims.size(), w156_dims.data(), + /*data=*/w156_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w156); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w156" << std::endl; + return nullptr; + } + + alignas(16) static std::array w157_data; + uint32_t w157 = XNN_INVALID_VALUE_ID; + std::array w157_dims = {{960, 1, 1, 160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w157_dims.size(), w157_dims.data(), + /*data=*/w157_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w157); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w157" << std::endl; + return nullptr; + } + + alignas(16) static std::array w158_data; + uint32_t w158 = XNN_INVALID_VALUE_ID; + std::array w158_dims = {{960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w158_dims.size(), w158_dims.data(), + /*data=*/w158_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w158); + 
if (status != xnn_status_success) { + std::cerr << "failed to create tensor w158" << std::endl; + return nullptr; + } + + alignas(16) static std::array w159_data; + uint32_t w159 = XNN_INVALID_VALUE_ID; + std::array w159_dims = {{1, 3, 3, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w159_dims.size(), w159_dims.data(), + /*data=*/w159_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w159); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w159" << std::endl; + return nullptr; + } + + alignas(16) static std::array w160_data; + uint32_t w160 = XNN_INVALID_VALUE_ID; + std::array w160_dims = {{960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w160_dims.size(), w160_dims.data(), + /*data=*/w160_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w160); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w160" << std::endl; + return nullptr; + } + + alignas(16) static std::array w161_data; + uint32_t w161 = XNN_INVALID_VALUE_ID; + std::array w161_dims = {{160, 1, 1, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w161_dims.size(), w161_dims.data(), + /*data=*/w161_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w161); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w161" << std::endl; + return nullptr; + } + + alignas(16) static std::array w162_data; + uint32_t w162 = XNN_INVALID_VALUE_ID; + std::array w162_dims = {{160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w162_dims.size(), w162_dims.data(), + /*data=*/w162_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w162); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w162" << std::endl; + return nullptr; + } + + alignas(16) static std::array w163_data; + uint32_t w163 = XNN_INVALID_VALUE_ID; + std::array w163_dims = {{960, 1, 1, 160}}; + status = xnn_define_tensor_value( + subgraph, 
xnn_datatype_fp32, + w163_dims.size(), w163_dims.data(), + /*data=*/w163_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w163); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w163" << std::endl; + return nullptr; + } + + alignas(16) static std::array w164_data; + uint32_t w164 = XNN_INVALID_VALUE_ID; + std::array w164_dims = {{960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w164_dims.size(), w164_dims.data(), + /*data=*/w164_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w164); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w164" << std::endl; + return nullptr; + } + + alignas(16) static std::array w165_data; + uint32_t w165 = XNN_INVALID_VALUE_ID; + std::array w165_dims = {{1, 3, 3, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w165_dims.size(), w165_dims.data(), + /*data=*/w165_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w165); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w165" << std::endl; + return nullptr; + } + + alignas(16) static std::array w166_data; + uint32_t w166 = XNN_INVALID_VALUE_ID; + std::array w166_dims = {{960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w166_dims.size(), w166_dims.data(), + /*data=*/w166_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w166); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w166" << std::endl; + return nullptr; + } + + alignas(16) static std::array w167_data; + uint32_t w167 = XNN_INVALID_VALUE_ID; + std::array w167_dims = {{320, 1, 1, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w167_dims.size(), w167_dims.data(), + /*data=*/w167_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w167); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w167" << std::endl; + return nullptr; + } + + alignas(16) static std::array w168_data; + uint32_t 
w168 = XNN_INVALID_VALUE_ID; + std::array w168_dims = {{320}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w168_dims.size(), w168_dims.data(), + /*data=*/w168_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w168); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w168" << std::endl; + return nullptr; + } + + alignas(16) static std::array w169_data; + uint32_t w169 = XNN_INVALID_VALUE_ID; + std::array w169_dims = {{1280, 1, 1, 320}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w169_dims.size(), w169_dims.data(), + /*data=*/w169_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w169); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w169" << std::endl; + return nullptr; + } + + alignas(16) static std::array w170_data; + uint32_t w170 = XNN_INVALID_VALUE_ID; + std::array w170_dims = {{1280}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w170_dims.size(), w170_dims.data(), + /*data=*/w170_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w170); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w170" << std::endl; + return nullptr; + } + + alignas(16) static std::array w171_data; + uint32_t w171 = XNN_INVALID_VALUE_ID; + std::array w171_dims = {{1001, 1, 1, 1280}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w171_dims.size(), w171_dims.data(), + /*data=*/w171_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w171); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w171" << std::endl; + return nullptr; + } + + alignas(16) static std::array w172_data; + uint32_t w172 = XNN_INVALID_VALUE_ID; + std::array w172_dims = {{1001}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w172_dims.size(), w172_dims.data(), + /*data=*/w172_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w172); + if (status != xnn_status_success) { + std::cerr << 
"failed to create tensor w172" << std::endl; + return nullptr; + } + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); + std::generate(w67_data.begin(), w67_data.end(), std::ref(f32rng)); + std::generate(w68_data.begin(), w68_data.end(), std::ref(f32rng)); + std::generate(w69_data.begin(), w69_data.end(), std::ref(f32rng)); + std::generate(w70_data.begin(), w70_data.end(), std::ref(f32rng)); + std::generate(w71_data.begin(), w71_data.end(), std::ref(f32rng)); + std::generate(w72_data.begin(), w72_data.end(), std::ref(f32rng)); + std::generate(w73_data.begin(), w73_data.end(), std::ref(f32rng)); + std::generate(w74_data.begin(), w74_data.end(), std::ref(f32rng)); + std::generate(w75_data.begin(), w75_data.end(), std::ref(f32rng)); + std::generate(w76_data.begin(), w76_data.end(), std::ref(f32rng)); + std::generate(w77_data.begin(), w77_data.end(), std::ref(f32rng)); + std::generate(w78_data.begin(), w78_data.end(), std::ref(f32rng)); + std::generate(w79_data.begin(), w79_data.end(), std::ref(f32rng)); + std::generate(w80_data.begin(), w80_data.end(), std::ref(f32rng)); + std::generate(w81_data.begin(), w81_data.end(), std::ref(f32rng)); + std::generate(w82_data.begin(), w82_data.end(), std::ref(f32rng)); + std::generate(w83_data.begin(), w83_data.end(), std::ref(f32rng)); + std::generate(w84_data.begin(), w84_data.end(), std::ref(f32rng)); + std::generate(w85_data.begin(), w85_data.end(), std::ref(f32rng)); + std::generate(w86_data.begin(), w86_data.end(), std::ref(f32rng)); + std::generate(w87_data.begin(), w87_data.end(), std::ref(f32rng)); + std::generate(w88_data.begin(), w88_data.end(), std::ref(f32rng)); + std::generate(w89_data.begin(), w89_data.end(), std::ref(f32rng)); + std::generate(w90_data.begin(), w90_data.end(), std::ref(f32rng)); + std::generate(w91_data.begin(), w91_data.end(), std::ref(f32rng)); + std::generate(w92_data.begin(), 
w92_data.end(), std::ref(f32rng)); + std::generate(w93_data.begin(), w93_data.end(), std::ref(f32rng)); + std::generate(w94_data.begin(), w94_data.end(), std::ref(f32rng)); + std::generate(w95_data.begin(), w95_data.end(), std::ref(f32rng)); + std::generate(w96_data.begin(), w96_data.end(), std::ref(f32rng)); + std::generate(w97_data.begin(), w97_data.end(), std::ref(f32rng)); + std::generate(w98_data.begin(), w98_data.end(), std::ref(f32rng)); + std::generate(w99_data.begin(), w99_data.end(), std::ref(f32rng)); + std::generate(w100_data.begin(), w100_data.end(), std::ref(f32rng)); + std::generate(w101_data.begin(), w101_data.end(), std::ref(f32rng)); + std::generate(w102_data.begin(), w102_data.end(), std::ref(f32rng)); + std::generate(w103_data.begin(), w103_data.end(), std::ref(f32rng)); + std::generate(w104_data.begin(), w104_data.end(), std::ref(f32rng)); + std::generate(w105_data.begin(), w105_data.end(), std::ref(f32rng)); + std::generate(w106_data.begin(), w106_data.end(), std::ref(f32rng)); + std::generate(w107_data.begin(), w107_data.end(), std::ref(f32rng)); + std::generate(w108_data.begin(), w108_data.end(), std::ref(f32rng)); + std::generate(w109_data.begin(), w109_data.end(), std::ref(f32rng)); + std::generate(w110_data.begin(), w110_data.end(), std::ref(f32rng)); + std::generate(w111_data.begin(), w111_data.end(), std::ref(f32rng)); + std::generate(w112_data.begin(), w112_data.end(), std::ref(f32rng)); + std::generate(w113_data.begin(), w113_data.end(), std::ref(f32rng)); + std::generate(w114_data.begin(), w114_data.end(), std::ref(f32rng)); + std::generate(w115_data.begin(), w115_data.end(), std::ref(f32rng)); + std::generate(w116_data.begin(), w116_data.end(), std::ref(f32rng)); + std::generate(w117_data.begin(), w117_data.end(), std::ref(f32rng)); + std::generate(w118_data.begin(), w118_data.end(), std::ref(f32rng)); + std::generate(w119_data.begin(), w119_data.end(), std::ref(f32rng)); + std::generate(w120_data.begin(), w120_data.end(), 
std::ref(f32rng)); + std::generate(w121_data.begin(), w121_data.end(), std::ref(f32rng)); + std::generate(w122_data.begin(), w122_data.end(), std::ref(f32rng)); + std::generate(w123_data.begin(), w123_data.end(), std::ref(f32rng)); + std::generate(w124_data.begin(), w124_data.end(), std::ref(f32rng)); + std::generate(w125_data.begin(), w125_data.end(), std::ref(f32rng)); + std::generate(w126_data.begin(), w126_data.end(), std::ref(f32rng)); + std::generate(w127_data.begin(), w127_data.end(), std::ref(f32rng)); + std::generate(w128_data.begin(), w128_data.end(), std::ref(f32rng)); + std::generate(w129_data.begin(), w129_data.end(), std::ref(f32rng)); + std::generate(w130_data.begin(), w130_data.end(), std::ref(f32rng)); + std::generate(w131_data.begin(), w131_data.end(), std::ref(f32rng)); + std::generate(w132_data.begin(), w132_data.end(), std::ref(f32rng)); + std::generate(w133_data.begin(), w133_data.end(), std::ref(f32rng)); + std::generate(w134_data.begin(), w134_data.end(), std::ref(f32rng)); + std::generate(w135_data.begin(), w135_data.end(), std::ref(f32rng)); + std::generate(w136_data.begin(), w136_data.end(), std::ref(f32rng)); + std::generate(w137_data.begin(), w137_data.end(), std::ref(f32rng)); + std::generate(w138_data.begin(), w138_data.end(), std::ref(f32rng)); + std::generate(w139_data.begin(), w139_data.end(), std::ref(f32rng)); + std::generate(w140_data.begin(), w140_data.end(), std::ref(f32rng)); + std::generate(w141_data.begin(), w141_data.end(), std::ref(f32rng)); + std::generate(w142_data.begin(), w142_data.end(), std::ref(f32rng)); + std::generate(w143_data.begin(), w143_data.end(), std::ref(f32rng)); + std::generate(w144_data.begin(), w144_data.end(), std::ref(f32rng)); + std::generate(w145_data.begin(), w145_data.end(), std::ref(f32rng)); + std::generate(w146_data.begin(), w146_data.end(), std::ref(f32rng)); + std::generate(w147_data.begin(), w147_data.end(), std::ref(f32rng)); + std::generate(w148_data.begin(), w148_data.end(), 
std::ref(f32rng)); + std::generate(w149_data.begin(), w149_data.end(), std::ref(f32rng)); + std::generate(w150_data.begin(), w150_data.end(), std::ref(f32rng)); + std::generate(w151_data.begin(), w151_data.end(), std::ref(f32rng)); + std::generate(w152_data.begin(), w152_data.end(), std::ref(f32rng)); + std::generate(w153_data.begin(), w153_data.end(), std::ref(f32rng)); + std::generate(w154_data.begin(), w154_data.end(), std::ref(f32rng)); + std::generate(w155_data.begin(), w155_data.end(), std::ref(f32rng)); + std::generate(w156_data.begin(), w156_data.end(), std::ref(f32rng)); + std::generate(w157_data.begin(), w157_data.end(), std::ref(f32rng)); + std::generate(w158_data.begin(), w158_data.end(), std::ref(f32rng)); + std::generate(w159_data.begin(), w159_data.end(), std::ref(f32rng)); + std::generate(w160_data.begin(), w160_data.end(), std::ref(f32rng)); + std::generate(w161_data.begin(), w161_data.end(), std::ref(f32rng)); + std::generate(w162_data.begin(), w162_data.end(), std::ref(f32rng)); + std::generate(w163_data.begin(), w163_data.end(), std::ref(f32rng)); + std::generate(w164_data.begin(), w164_data.end(), std::ref(f32rng)); + std::generate(w165_data.begin(), w165_data.end(), std::ref(f32rng)); + std::generate(w166_data.begin(), w166_data.end(), std::ref(f32rng)); + std::generate(w167_data.begin(), w167_data.end(), std::ref(f32rng)); + std::generate(w168_data.begin(), w168_data.end(), std::ref(f32rng)); + std::generate(w169_data.begin(), w169_data.end(), std::ref(f32rng)); + std::generate(w170_data.begin(), w170_data.end(), std::ref(f32rng)); + std::generate(w171_data.begin(), w171_data.end(), std::ref(f32rng)); + std::generate(w172_data.begin(), w172_data.end(), std::ref(f32rng)); + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, 
/*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/3, + /*group_output_channels=*/32, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v0, + w67, + w68, + v1, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #0" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/32, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v1, + w69, + w70, + v2, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #1" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/32, + /*group_output_channels=*/16, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v2, + w71, + w72, + v3, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #2" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/16, + /*group_output_channels=*/96, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v3, + w73, + w74, + v4, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #3" << std::endl; + 
return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/96, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v4, + w75, + w76, + v5, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #4" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/96, + /*group_output_channels=*/24, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v5, + w77, + w78, + v6, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #5" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/24, + /*group_output_channels=*/144, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v6, + w79, + w80, + v7, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #6" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + 
/*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/144, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v7, + w81, + w82, + v8, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #7" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/144, + /*group_output_channels=*/24, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v8, + w83, + w84, + v9, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #8" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v9, + v6, + v10, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #9" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/24, + /*group_output_channels=*/144, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v10, + w85, + w86, + v11, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #10" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/2, 
/*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/144, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v11, + w87, + w88, + v12, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #11" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/144, + /*group_output_channels=*/32, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v12, + w89, + w90, + v13, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #12" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/32, + /*group_output_channels=*/192, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v13, + w91, + w92, + v14, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #13" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/192, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v14, + w93, + w94, + v15, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << 
"failed to create node #14" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/192, + /*group_output_channels=*/32, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v15, + w95, + w96, + v16, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #15" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v16, + v13, + v17, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #16" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/32, + /*group_output_channels=*/192, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v17, + w97, + w98, + v18, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #17" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/192, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v18, + w99, + w100, + v19, + /*flags=*/0); + if (status != 
xnn_status_success) { + std::cerr << "failed to create node #18" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/192, + /*group_output_channels=*/32, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v19, + w101, + w102, + v20, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #19" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v20, + v17, + v21, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #20" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/32, + /*group_output_channels=*/192, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v21, + w103, + w104, + v22, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #21" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/192, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v22, + w105, + w106, + v23, 
+ /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #22" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/192, + /*group_output_channels=*/64, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v23, + w107, + w108, + v24, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #23" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/64, + /*group_output_channels=*/384, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v24, + w109, + w110, + v25, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #24" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/384, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v25, + w111, + w112, + v26, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #25" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, 
/*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/384, + /*group_output_channels=*/64, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v26, + w113, + w114, + v27, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #26" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v27, + v24, + v28, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #27" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/64, + /*group_output_channels=*/384, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v28, + w115, + w116, + v29, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #28" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/384, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v29, + w117, + w118, + v30, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #29" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, 
/*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/384, + /*group_output_channels=*/64, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v30, + w119, + w120, + v31, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #30" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v31, + v28, + v32, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #31" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/64, + /*group_output_channels=*/384, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v32, + w121, + w122, + v33, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #32" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/384, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v33, + w123, + w124, + v34, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #33" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( 
+ subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/384, + /*group_output_channels=*/64, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v34, + w125, + w126, + v35, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #34" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v35, + v32, + v36, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #35" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/64, + /*group_output_channels=*/384, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v36, + w127, + w128, + v37, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #36" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/384, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v37, + w129, + w130, + v38, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #37" << std::endl; + return nullptr; + } + + 
status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/384, + /*group_output_channels=*/96, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v38, + w131, + w132, + v39, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #38" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/96, + /*group_output_channels=*/576, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v39, + w133, + w134, + v40, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #39" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/576, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v40, + w135, + w136, + v41, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #40" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, 
/*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/576, + /*group_output_channels=*/96, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v41, + w137, + w138, + v42, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #41" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v42, + v39, + v43, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #42" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/96, + /*group_output_channels=*/576, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v43, + w139, + w140, + v44, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #43" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/576, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v44, + w141, + w142, + v45, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #44" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, 
/*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/576, + /*group_output_channels=*/96, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v45, + w143, + w144, + v46, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #45" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v46, + v43, + v47, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #46" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/96, + /*group_output_channels=*/576, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v47, + w145, + w146, + v48, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #47" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/576, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v48, + w147, + w148, + v49, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #48" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, 
/*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/576, + /*group_output_channels=*/160, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v49, + w149, + w150, + v50, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #49" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/160, + /*group_output_channels=*/960, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v50, + w151, + w152, + v51, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #50" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/960, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v51, + w153, + w154, + v52, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #51" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/960, + /*group_output_channels=*/160, + /*output_min=*/-std::numeric_limits::infinity(), 
/*output_max=*/std::numeric_limits::infinity(), + v52, + w155, + w156, + v53, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #52" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v53, + v50, + v54, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #53" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/160, + /*group_output_channels=*/960, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v54, + w157, + w158, + v55, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #54" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/960, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v55, + w159, + w160, + v56, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #55" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/960, + /*group_output_channels=*/160, + 
/*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v56, + w161, + w162, + v57, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #56" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v57, + v54, + v58, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #57" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/160, + /*group_output_channels=*/960, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v58, + w163, + w164, + v59, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #58" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/960, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v59, + w165, + w166, + v60, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #59" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/960, 
+ /*group_output_channels=*/320, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v60, + w167, + w168, + v61, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #60" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/320, + /*group_output_channels=*/1280, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v61, + w169, + w170, + v62, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #61" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/7, /*pooling_width=*/7, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v62, + v63, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #62" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/1280, + /*group_output_channels=*/1001, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v63, + w171, + w172, + v64, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #63" << std::endl; + return nullptr; + } + + status = xnn_define_copy( + 
subgraph, + v64, + v65, + 0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #64" << std::endl; + return nullptr; + } + + status = xnn_define_softmax( + subgraph, + v65, + v66, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #65" << std::endl; + return nullptr; + } + + return subgraph; +} + +} // namespace models diff --git a/bench/models/fp32-mobilenet-v3-large.cc b/bench/models/fp32-mobilenet-v3-large.cc new file mode 100644 index 00000000000..44f672da228 --- /dev/null +++ b/bench/models/fp32-mobilenet-v3-large.cc @@ -0,0 +1,5407 @@ +// Copyright 2020 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +// +// Auto-generated file. Do not edit! + +#include "xnnpack.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xnnpack.h" + +// align a size up to XNN_EXTRA_BYTES +#define XNN_PAD_EXTRA_BYTES(s, t) (((s) + XNN_EXTRA_BYTES / sizeof(t) - 1) & ~(XNN_EXTRA_BYTES / sizeof(t) - 1)) + +namespace models { + +xnn_subgraph_t FP32MobileNetV3Large() { + xnn_status status; + xnn_subgraph_t subgraph = nullptr; + status = xnn_create_subgraph(/*num_external_values=*/2, 0, &subgraph); + if (status != xnn_status_success) { + std::cerr << "failed to create subgrpah" << std::endl; + return nullptr; + } + + uint32_t v0 = XNN_INVALID_VALUE_ID; + std::array v0_dims = {{1, 224, 224, 3}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v0_dims.size(), v0_dims.data(), + /*data=*/nullptr, + 1, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v0" << std::endl; + return nullptr; + } + + uint32_t v1 = XNN_INVALID_VALUE_ID; + std::array v1_dims = {{1, 112, 112, 16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v1_dims.size(), v1_dims.data(), + /*data=*/nullptr, + 
XNN_INVALID_VALUE_ID, /*flags=*/0, &v1); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v1" << std::endl; + return nullptr; + } + + uint32_t v2 = XNN_INVALID_VALUE_ID; + std::array v2_dims = {{1, 112, 112, 16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v2_dims.size(), v2_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v2); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v2" << std::endl; + return nullptr; + } + + uint32_t v3 = XNN_INVALID_VALUE_ID; + std::array v3_dims = {{1, 112, 112, 16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v3_dims.size(), v3_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v3); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v3" << std::endl; + return nullptr; + } + + uint32_t v4 = XNN_INVALID_VALUE_ID; + std::array v4_dims = {{1, 112, 112, 16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v4_dims.size(), v4_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v4); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v4" << std::endl; + return nullptr; + } + + uint32_t v5 = XNN_INVALID_VALUE_ID; + std::array v5_dims = {{1, 112, 112, 16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v5_dims.size(), v5_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v5); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v5" << std::endl; + return nullptr; + } + + uint32_t v6 = XNN_INVALID_VALUE_ID; + std::array v6_dims = {{1, 112, 112, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v6_dims.size(), v6_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v6); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v6" << std::endl; + return nullptr; + } + + 
uint32_t v7 = XNN_INVALID_VALUE_ID; + std::array v7_dims = {{1, 56, 56, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v7_dims.size(), v7_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v7); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v7" << std::endl; + return nullptr; + } + + uint32_t v8 = XNN_INVALID_VALUE_ID; + std::array v8_dims = {{1, 56, 56, 24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v8_dims.size(), v8_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v8); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v8" << std::endl; + return nullptr; + } + + uint32_t v9 = XNN_INVALID_VALUE_ID; + std::array v9_dims = {{1, 56, 56, 72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v9_dims.size(), v9_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v9); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v9" << std::endl; + return nullptr; + } + + uint32_t v10 = XNN_INVALID_VALUE_ID; + std::array v10_dims = {{1, 56, 56, 72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v10_dims.size(), v10_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v10); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v10" << std::endl; + return nullptr; + } + + uint32_t v11 = XNN_INVALID_VALUE_ID; + std::array v11_dims = {{1, 56, 56, 24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v11_dims.size(), v11_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v11); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v11" << std::endl; + return nullptr; + } + + uint32_t v12 = XNN_INVALID_VALUE_ID; + std::array v12_dims = {{1, 56, 56, 24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v12_dims.size(), 
v12_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v12); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v12" << std::endl; + return nullptr; + } + + uint32_t v13 = XNN_INVALID_VALUE_ID; + std::array v13_dims = {{1, 56, 56, 72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v13_dims.size(), v13_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v13); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v13" << std::endl; + return nullptr; + } + + uint32_t v14 = XNN_INVALID_VALUE_ID; + std::array v14_dims = {{1, 28, 28, 72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v14_dims.size(), v14_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v14); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v14" << std::endl; + return nullptr; + } + + uint32_t v15 = XNN_INVALID_VALUE_ID; + std::array v15_dims = {{1, 1, 1, 72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v15_dims.size(), v15_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v15); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v15" << std::endl; + return nullptr; + } + + uint32_t v16 = XNN_INVALID_VALUE_ID; + std::array v16_dims = {{1, 1, 1, 24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v16_dims.size(), v16_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v16); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v16" << std::endl; + return nullptr; + } + + uint32_t v17 = XNN_INVALID_VALUE_ID; + std::array v17_dims = {{1, 1, 1, 72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v17_dims.size(), v17_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v17); + if (status != xnn_status_success) { + std::cerr << "failed to create 
tensor v17" << std::endl; + return nullptr; + } + + uint32_t v18 = XNN_INVALID_VALUE_ID; + std::array v18_dims = {{1, 1, 1, 72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v18_dims.size(), v18_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v18); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v18" << std::endl; + return nullptr; + } + + uint32_t v19 = XNN_INVALID_VALUE_ID; + std::array v19_dims = {{1, 28, 28, 72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v19_dims.size(), v19_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v19); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v19" << std::endl; + return nullptr; + } + + uint32_t v20 = XNN_INVALID_VALUE_ID; + std::array v20_dims = {{1, 28, 28, 40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v20_dims.size(), v20_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v20); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v20" << std::endl; + return nullptr; + } + + uint32_t v21 = XNN_INVALID_VALUE_ID; + std::array v21_dims = {{1, 28, 28, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v21_dims.size(), v21_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v21); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v21" << std::endl; + return nullptr; + } + + uint32_t v22 = XNN_INVALID_VALUE_ID; + std::array v22_dims = {{1, 28, 28, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v22_dims.size(), v22_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v22); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v22" << std::endl; + return nullptr; + } + + uint32_t v23 = XNN_INVALID_VALUE_ID; + std::array v23_dims = {{1, 1, 1, 120}}; + status = 
xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v23_dims.size(), v23_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v23); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v23" << std::endl; + return nullptr; + } + + uint32_t v24 = XNN_INVALID_VALUE_ID; + std::array v24_dims = {{1, 1, 1, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v24_dims.size(), v24_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v24); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v24" << std::endl; + return nullptr; + } + + uint32_t v25 = XNN_INVALID_VALUE_ID; + std::array v25_dims = {{1, 1, 1, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v25_dims.size(), v25_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v25); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v25" << std::endl; + return nullptr; + } + + uint32_t v26 = XNN_INVALID_VALUE_ID; + std::array v26_dims = {{1, 1, 1, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v26_dims.size(), v26_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v26); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v26" << std::endl; + return nullptr; + } + + uint32_t v27 = XNN_INVALID_VALUE_ID; + std::array v27_dims = {{1, 28, 28, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v27_dims.size(), v27_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v27); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v27" << std::endl; + return nullptr; + } + + uint32_t v28 = XNN_INVALID_VALUE_ID; + std::array v28_dims = {{1, 28, 28, 40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v28_dims.size(), v28_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, 
&v28); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v28" << std::endl; + return nullptr; + } + + uint32_t v29 = XNN_INVALID_VALUE_ID; + std::array v29_dims = {{1, 28, 28, 40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v29_dims.size(), v29_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v29); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v29" << std::endl; + return nullptr; + } + + uint32_t v30 = XNN_INVALID_VALUE_ID; + std::array v30_dims = {{1, 28, 28, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v30_dims.size(), v30_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v30); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v30" << std::endl; + return nullptr; + } + + uint32_t v31 = XNN_INVALID_VALUE_ID; + std::array v31_dims = {{1, 28, 28, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v31_dims.size(), v31_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v31); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v31" << std::endl; + return nullptr; + } + + uint32_t v32 = XNN_INVALID_VALUE_ID; + std::array v32_dims = {{1, 1, 1, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v32_dims.size(), v32_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v32); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v32" << std::endl; + return nullptr; + } + + uint32_t v33 = XNN_INVALID_VALUE_ID; + std::array v33_dims = {{1, 1, 1, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v33_dims.size(), v33_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v33); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v33" << std::endl; + return nullptr; + } + + uint32_t v34 = 
XNN_INVALID_VALUE_ID; + std::array v34_dims = {{1, 1, 1, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v34_dims.size(), v34_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v34); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v34" << std::endl; + return nullptr; + } + + uint32_t v35 = XNN_INVALID_VALUE_ID; + std::array v35_dims = {{1, 1, 1, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v35_dims.size(), v35_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v35); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v35" << std::endl; + return nullptr; + } + + uint32_t v36 = XNN_INVALID_VALUE_ID; + std::array v36_dims = {{1, 28, 28, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v36_dims.size(), v36_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v36); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v36" << std::endl; + return nullptr; + } + + uint32_t v37 = XNN_INVALID_VALUE_ID; + std::array v37_dims = {{1, 28, 28, 40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v37_dims.size(), v37_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v37); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v37" << std::endl; + return nullptr; + } + + uint32_t v38 = XNN_INVALID_VALUE_ID; + std::array v38_dims = {{1, 28, 28, 40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v38_dims.size(), v38_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v38); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v38" << std::endl; + return nullptr; + } + + uint32_t v39 = XNN_INVALID_VALUE_ID; + std::array v39_dims = {{1, 28, 28, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + 
v39_dims.size(), v39_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v39); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v39" << std::endl; + return nullptr; + } + + uint32_t v40 = XNN_INVALID_VALUE_ID; + std::array v40_dims = {{1, 28, 28, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v40_dims.size(), v40_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v40); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v40" << std::endl; + return nullptr; + } + + uint32_t v41 = XNN_INVALID_VALUE_ID; + std::array v41_dims = {{1, 14, 14, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v41_dims.size(), v41_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v41); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v41" << std::endl; + return nullptr; + } + + uint32_t v42 = XNN_INVALID_VALUE_ID; + std::array v42_dims = {{1, 14, 14, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v42_dims.size(), v42_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v42); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v42" << std::endl; + return nullptr; + } + + uint32_t v43 = XNN_INVALID_VALUE_ID; + std::array v43_dims = {{1, 14, 14, 80}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v43_dims.size(), v43_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v43); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v43" << std::endl; + return nullptr; + } + + uint32_t v44 = XNN_INVALID_VALUE_ID; + std::array v44_dims = {{1, 14, 14, 200}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v44_dims.size(), v44_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v44); + if (status != xnn_status_success) { + 
std::cerr << "failed to create tensor v44" << std::endl; + return nullptr; + } + + uint32_t v45 = XNN_INVALID_VALUE_ID; + std::array v45_dims = {{1, 14, 14, 200}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v45_dims.size(), v45_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v45); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v45" << std::endl; + return nullptr; + } + + uint32_t v46 = XNN_INVALID_VALUE_ID; + std::array v46_dims = {{1, 14, 14, 200}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v46_dims.size(), v46_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v46); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v46" << std::endl; + return nullptr; + } + + uint32_t v47 = XNN_INVALID_VALUE_ID; + std::array v47_dims = {{1, 14, 14, 200}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v47_dims.size(), v47_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v47); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v47" << std::endl; + return nullptr; + } + + uint32_t v48 = XNN_INVALID_VALUE_ID; + std::array v48_dims = {{1, 14, 14, 80}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v48_dims.size(), v48_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v48); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v48" << std::endl; + return nullptr; + } + + uint32_t v49 = XNN_INVALID_VALUE_ID; + std::array v49_dims = {{1, 14, 14, 80}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v49_dims.size(), v49_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v49); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v49" << std::endl; + return nullptr; + } + + uint32_t v50 = XNN_INVALID_VALUE_ID; + std::array v50_dims = 
{{1, 14, 14, 184}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v50_dims.size(), v50_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v50); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v50" << std::endl; + return nullptr; + } + + uint32_t v51 = XNN_INVALID_VALUE_ID; + std::array v51_dims = {{1, 14, 14, 184}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v51_dims.size(), v51_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v51); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v51" << std::endl; + return nullptr; + } + + uint32_t v52 = XNN_INVALID_VALUE_ID; + std::array v52_dims = {{1, 14, 14, 184}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v52_dims.size(), v52_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v52); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v52" << std::endl; + return nullptr; + } + + uint32_t v53 = XNN_INVALID_VALUE_ID; + std::array v53_dims = {{1, 14, 14, 184}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v53_dims.size(), v53_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v53); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v53" << std::endl; + return nullptr; + } + + uint32_t v54 = XNN_INVALID_VALUE_ID; + std::array v54_dims = {{1, 14, 14, 80}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v54_dims.size(), v54_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v54); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v54" << std::endl; + return nullptr; + } + + uint32_t v55 = XNN_INVALID_VALUE_ID; + std::array v55_dims = {{1, 14, 14, 80}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v55_dims.size(), v55_dims.data(), + /*data=*/nullptr, + 
XNN_INVALID_VALUE_ID, /*flags=*/0, &v55); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v55" << std::endl; + return nullptr; + } + + uint32_t v56 = XNN_INVALID_VALUE_ID; + std::array v56_dims = {{1, 14, 14, 184}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v56_dims.size(), v56_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v56); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v56" << std::endl; + return nullptr; + } + + uint32_t v57 = XNN_INVALID_VALUE_ID; + std::array v57_dims = {{1, 14, 14, 184}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v57_dims.size(), v57_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v57); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v57" << std::endl; + return nullptr; + } + + uint32_t v58 = XNN_INVALID_VALUE_ID; + std::array v58_dims = {{1, 14, 14, 184}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v58_dims.size(), v58_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v58); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v58" << std::endl; + return nullptr; + } + + uint32_t v59 = XNN_INVALID_VALUE_ID; + std::array v59_dims = {{1, 14, 14, 184}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v59_dims.size(), v59_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v59); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v59" << std::endl; + return nullptr; + } + + uint32_t v60 = XNN_INVALID_VALUE_ID; + std::array v60_dims = {{1, 14, 14, 80}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v60_dims.size(), v60_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v60); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v60" << std::endl; + 
return nullptr; + } + + uint32_t v61 = XNN_INVALID_VALUE_ID; + std::array v61_dims = {{1, 14, 14, 80}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v61_dims.size(), v61_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v61); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v61" << std::endl; + return nullptr; + } + + uint32_t v62 = XNN_INVALID_VALUE_ID; + std::array v62_dims = {{1, 14, 14, 480}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v62_dims.size(), v62_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v62); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v62" << std::endl; + return nullptr; + } + + uint32_t v63 = XNN_INVALID_VALUE_ID; + std::array v63_dims = {{1, 14, 14, 480}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v63_dims.size(), v63_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v63); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v63" << std::endl; + return nullptr; + } + + uint32_t v64 = XNN_INVALID_VALUE_ID; + std::array v64_dims = {{1, 14, 14, 480}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v64_dims.size(), v64_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v64); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v64" << std::endl; + return nullptr; + } + + uint32_t v65 = XNN_INVALID_VALUE_ID; + std::array v65_dims = {{1, 14, 14, 480}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v65_dims.size(), v65_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v65); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v65" << std::endl; + return nullptr; + } + + uint32_t v66 = XNN_INVALID_VALUE_ID; + std::array v66_dims = {{1, 1, 1, 480}}; + status = xnn_define_tensor_value( + 
subgraph, xnn_datatype_fp32, + v66_dims.size(), v66_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v66); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v66" << std::endl; + return nullptr; + } + + uint32_t v67 = XNN_INVALID_VALUE_ID; + std::array v67_dims = {{1, 1, 1, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v67_dims.size(), v67_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v67); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v67" << std::endl; + return nullptr; + } + + uint32_t v68 = XNN_INVALID_VALUE_ID; + std::array v68_dims = {{1, 1, 1, 480}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v68_dims.size(), v68_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v68); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v68" << std::endl; + return nullptr; + } + + uint32_t v69 = XNN_INVALID_VALUE_ID; + std::array v69_dims = {{1, 1, 1, 480}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v69_dims.size(), v69_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v69); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v69" << std::endl; + return nullptr; + } + + uint32_t v70 = XNN_INVALID_VALUE_ID; + std::array v70_dims = {{1, 14, 14, 480}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v70_dims.size(), v70_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v70); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v70" << std::endl; + return nullptr; + } + + uint32_t v71 = XNN_INVALID_VALUE_ID; + std::array v71_dims = {{1, 14, 14, 112}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v71_dims.size(), v71_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v71); + if (status != 
xnn_status_success) { + std::cerr << "failed to create tensor v71" << std::endl; + return nullptr; + } + + uint32_t v72 = XNN_INVALID_VALUE_ID; + std::array v72_dims = {{1, 14, 14, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v72_dims.size(), v72_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v72); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v72" << std::endl; + return nullptr; + } + + uint32_t v73 = XNN_INVALID_VALUE_ID; + std::array v73_dims = {{1, 14, 14, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v73_dims.size(), v73_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v73); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v73" << std::endl; + return nullptr; + } + + uint32_t v74 = XNN_INVALID_VALUE_ID; + std::array v74_dims = {{1, 14, 14, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v74_dims.size(), v74_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v74); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v74" << std::endl; + return nullptr; + } + + uint32_t v75 = XNN_INVALID_VALUE_ID; + std::array v75_dims = {{1, 14, 14, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v75_dims.size(), v75_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v75); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v75" << std::endl; + return nullptr; + } + + uint32_t v76 = XNN_INVALID_VALUE_ID; + std::array v76_dims = {{1, 1, 1, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v76_dims.size(), v76_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v76); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v76" << std::endl; + return nullptr; + } + + uint32_t v77 = XNN_INVALID_VALUE_ID; + 
std::array v77_dims = {{1, 1, 1, 168}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v77_dims.size(), v77_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v77); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v77" << std::endl; + return nullptr; + } + + uint32_t v78 = XNN_INVALID_VALUE_ID; + std::array v78_dims = {{1, 1, 1, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v78_dims.size(), v78_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v78); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v78" << std::endl; + return nullptr; + } + + uint32_t v79 = XNN_INVALID_VALUE_ID; + std::array v79_dims = {{1, 1, 1, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v79_dims.size(), v79_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v79); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v79" << std::endl; + return nullptr; + } + + uint32_t v80 = XNN_INVALID_VALUE_ID; + std::array v80_dims = {{1, 14, 14, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v80_dims.size(), v80_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v80); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v80" << std::endl; + return nullptr; + } + + uint32_t v81 = XNN_INVALID_VALUE_ID; + std::array v81_dims = {{1, 14, 14, 112}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v81_dims.size(), v81_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v81); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v81" << std::endl; + return nullptr; + } + + uint32_t v82 = XNN_INVALID_VALUE_ID; + std::array v82_dims = {{1, 14, 14, 112}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v82_dims.size(), v82_dims.data(), + 
/*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v82); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v82" << std::endl; + return nullptr; + } + + uint32_t v83 = XNN_INVALID_VALUE_ID; + std::array v83_dims = {{1, 14, 14, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v83_dims.size(), v83_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v83); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v83" << std::endl; + return nullptr; + } + + uint32_t v84 = XNN_INVALID_VALUE_ID; + std::array v84_dims = {{1, 14, 14, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v84_dims.size(), v84_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v84); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v84" << std::endl; + return nullptr; + } + + uint32_t v85 = XNN_INVALID_VALUE_ID; + std::array v85_dims = {{1, 7, 7, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v85_dims.size(), v85_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v85); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v85" << std::endl; + return nullptr; + } + + uint32_t v86 = XNN_INVALID_VALUE_ID; + std::array v86_dims = {{1, 7, 7, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v86_dims.size(), v86_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v86); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v86" << std::endl; + return nullptr; + } + + uint32_t v87 = XNN_INVALID_VALUE_ID; + std::array v87_dims = {{1, 1, 1, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v87_dims.size(), v87_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v87); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v87" << 
std::endl; + return nullptr; + } + + uint32_t v88 = XNN_INVALID_VALUE_ID; + std::array v88_dims = {{1, 1, 1, 168}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v88_dims.size(), v88_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v88); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v88" << std::endl; + return nullptr; + } + + uint32_t v89 = XNN_INVALID_VALUE_ID; + std::array v89_dims = {{1, 1, 1, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v89_dims.size(), v89_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v89); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v89" << std::endl; + return nullptr; + } + + uint32_t v90 = XNN_INVALID_VALUE_ID; + std::array v90_dims = {{1, 1, 1, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v90_dims.size(), v90_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v90); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v90" << std::endl; + return nullptr; + } + + uint32_t v91 = XNN_INVALID_VALUE_ID; + std::array v91_dims = {{1, 7, 7, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v91_dims.size(), v91_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v91); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v91" << std::endl; + return nullptr; + } + + uint32_t v92 = XNN_INVALID_VALUE_ID; + std::array v92_dims = {{1, 7, 7, 160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v92_dims.size(), v92_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v92); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v92" << std::endl; + return nullptr; + } + + uint32_t v93 = XNN_INVALID_VALUE_ID; + std::array v93_dims = {{1, 7, 7, 960}}; + status = xnn_define_tensor_value( + 
subgraph, xnn_datatype_fp32, + v93_dims.size(), v93_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v93); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v93" << std::endl; + return nullptr; + } + + uint32_t v94 = XNN_INVALID_VALUE_ID; + std::array v94_dims = {{1, 7, 7, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v94_dims.size(), v94_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v94); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v94" << std::endl; + return nullptr; + } + + uint32_t v95 = XNN_INVALID_VALUE_ID; + std::array v95_dims = {{1, 7, 7, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v95_dims.size(), v95_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v95); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v95" << std::endl; + return nullptr; + } + + uint32_t v96 = XNN_INVALID_VALUE_ID; + std::array v96_dims = {{1, 7, 7, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v96_dims.size(), v96_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v96); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v96" << std::endl; + return nullptr; + } + + uint32_t v97 = XNN_INVALID_VALUE_ID; + std::array v97_dims = {{1, 1, 1, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v97_dims.size(), v97_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v97); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v97" << std::endl; + return nullptr; + } + + uint32_t v98 = XNN_INVALID_VALUE_ID; + std::array v98_dims = {{1, 1, 1, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v98_dims.size(), v98_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v98); + if (status != 
xnn_status_success) { + std::cerr << "failed to create tensor v98" << std::endl; + return nullptr; + } + + uint32_t v99 = XNN_INVALID_VALUE_ID; + std::array v99_dims = {{1, 1, 1, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v99_dims.size(), v99_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v99); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v99" << std::endl; + return nullptr; + } + + uint32_t v100 = XNN_INVALID_VALUE_ID; + std::array v100_dims = {{1, 1, 1, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v100_dims.size(), v100_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v100); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v100" << std::endl; + return nullptr; + } + + uint32_t v101 = XNN_INVALID_VALUE_ID; + std::array v101_dims = {{1, 7, 7, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v101_dims.size(), v101_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v101); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v101" << std::endl; + return nullptr; + } + + uint32_t v102 = XNN_INVALID_VALUE_ID; + std::array v102_dims = {{1, 7, 7, 160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v102_dims.size(), v102_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v102); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v102" << std::endl; + return nullptr; + } + + uint32_t v103 = XNN_INVALID_VALUE_ID; + std::array v103_dims = {{1, 7, 7, 160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v103_dims.size(), v103_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v103); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v103" << std::endl; + return nullptr; + } + + uint32_t v104 = 
XNN_INVALID_VALUE_ID; + std::array v104_dims = {{1, 7, 7, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v104_dims.size(), v104_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v104); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v104" << std::endl; + return nullptr; + } + + uint32_t v105 = XNN_INVALID_VALUE_ID; + std::array v105_dims = {{1, 7, 7, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v105_dims.size(), v105_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v105); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v105" << std::endl; + return nullptr; + } + + uint32_t v106 = XNN_INVALID_VALUE_ID; + std::array v106_dims = {{1, 7, 7, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v106_dims.size(), v106_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v106); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v106" << std::endl; + return nullptr; + } + + uint32_t v107 = XNN_INVALID_VALUE_ID; + std::array v107_dims = {{1, 7, 7, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v107_dims.size(), v107_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v107); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v107" << std::endl; + return nullptr; + } + + uint32_t v108 = XNN_INVALID_VALUE_ID; + std::array v108_dims = {{1, 1, 1, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v108_dims.size(), v108_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v108); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v108" << std::endl; + return nullptr; + } + + uint32_t v109 = XNN_INVALID_VALUE_ID; + std::array v109_dims = {{1, 1, 1, 240}}; + status = xnn_define_tensor_value( + subgraph, 
xnn_datatype_fp32, + v109_dims.size(), v109_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v109); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v109" << std::endl; + return nullptr; + } + + uint32_t v110 = XNN_INVALID_VALUE_ID; + std::array v110_dims = {{1, 1, 1, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v110_dims.size(), v110_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v110); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v110" << std::endl; + return nullptr; + } + + uint32_t v111 = XNN_INVALID_VALUE_ID; + std::array v111_dims = {{1, 1, 1, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v111_dims.size(), v111_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v111); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v111" << std::endl; + return nullptr; + } + + uint32_t v112 = XNN_INVALID_VALUE_ID; + std::array v112_dims = {{1, 7, 7, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v112_dims.size(), v112_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v112); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v112" << std::endl; + return nullptr; + } + + uint32_t v113 = XNN_INVALID_VALUE_ID; + std::array v113_dims = {{1, 7, 7, 160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v113_dims.size(), v113_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v113); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v113" << std::endl; + return nullptr; + } + + uint32_t v114 = XNN_INVALID_VALUE_ID; + std::array v114_dims = {{1, 7, 7, 160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v114_dims.size(), v114_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v114); + 
if (status != xnn_status_success) { + std::cerr << "failed to create tensor v114" << std::endl; + return nullptr; + } + + uint32_t v115 = XNN_INVALID_VALUE_ID; + std::array v115_dims = {{1, 7, 7, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v115_dims.size(), v115_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v115); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v115" << std::endl; + return nullptr; + } + + uint32_t v116 = XNN_INVALID_VALUE_ID; + std::array v116_dims = {{1, 7, 7, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v116_dims.size(), v116_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v116); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v116" << std::endl; + return nullptr; + } + + uint32_t v117 = XNN_INVALID_VALUE_ID; + std::array v117_dims = {{1, 1, 1, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v117_dims.size(), v117_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v117); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v117" << std::endl; + return nullptr; + } + + uint32_t v118 = XNN_INVALID_VALUE_ID; + std::array v118_dims = {{1, 1, 1, 1280}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v118_dims.size(), v118_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v118); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v118" << std::endl; + return nullptr; + } + + uint32_t v119 = XNN_INVALID_VALUE_ID; + std::array v119_dims = {{1, 1, 1, 1280}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v119_dims.size(), v119_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v119); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v119" << std::endl; + return nullptr; + } + + 
uint32_t v120 = XNN_INVALID_VALUE_ID; + std::array v120_dims = {{1, 1, 1, 1280}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v120_dims.size(), v120_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v120); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v120" << std::endl; + return nullptr; + } + + uint32_t v121 = XNN_INVALID_VALUE_ID; + std::array v121_dims = {{1, 1, 1, 1001}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v121_dims.size(), v121_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v121); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v121" << std::endl; + return nullptr; + } + + uint32_t v122 = XNN_INVALID_VALUE_ID; + std::array v122_dims = {{1, 1001}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v122_dims.size(), v122_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v122); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v122" << std::endl; + return nullptr; + } + + uint32_t v123 = XNN_INVALID_VALUE_ID; + std::array v123_dims = {{1, 1001}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v123_dims.size(), v123_dims.data(), + /*data=*/nullptr, + 0, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v123); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v123" << std::endl; + return nullptr; + } + + alignas(16) static std::array w124_data; + uint32_t w124 = XNN_INVALID_VALUE_ID; + std::array w124_dims = {{16, 3, 3, 3}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w124_dims.size(), w124_dims.data(), + /*data=*/w124_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w124); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w124" << std::endl; + return nullptr; + } + + alignas(16) static std::array w125_data; + uint32_t w125 = XNN_INVALID_VALUE_ID; 
+ std::array w125_dims = {{16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w125_dims.size(), w125_dims.data(), + /*data=*/w125_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w125); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w125" << std::endl; + return nullptr; + } + + alignas(16) static std::array w126_data; + uint32_t w126 = XNN_INVALID_VALUE_ID; + std::array w126_dims = {{1, 3, 3, 16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w126_dims.size(), w126_dims.data(), + /*data=*/w126_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w126); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w126" << std::endl; + return nullptr; + } + + alignas(16) static std::array w127_data; + uint32_t w127 = XNN_INVALID_VALUE_ID; + std::array w127_dims = {{16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w127_dims.size(), w127_dims.data(), + /*data=*/w127_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w127); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w127" << std::endl; + return nullptr; + } + + alignas(16) static std::array w128_data; + uint32_t w128 = XNN_INVALID_VALUE_ID; + std::array w128_dims = {{16, 1, 1, 16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w128_dims.size(), w128_dims.data(), + /*data=*/w128_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w128); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w128" << std::endl; + return nullptr; + } + + alignas(16) static std::array w129_data; + uint32_t w129 = XNN_INVALID_VALUE_ID; + std::array w129_dims = {{16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w129_dims.size(), w129_dims.data(), + /*data=*/w129_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w129); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w129" << std::endl; + 
return nullptr; + } + + alignas(16) static std::array w130_data; + uint32_t w130 = XNN_INVALID_VALUE_ID; + std::array w130_dims = {{64, 1, 1, 16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w130_dims.size(), w130_dims.data(), + /*data=*/w130_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w130); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w130" << std::endl; + return nullptr; + } + + alignas(16) static std::array w131_data; + uint32_t w131 = XNN_INVALID_VALUE_ID; + std::array w131_dims = {{64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w131_dims.size(), w131_dims.data(), + /*data=*/w131_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w131); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w131" << std::endl; + return nullptr; + } + + alignas(16) static std::array w132_data; + uint32_t w132 = XNN_INVALID_VALUE_ID; + std::array w132_dims = {{1, 3, 3, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w132_dims.size(), w132_dims.data(), + /*data=*/w132_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w132); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w132" << std::endl; + return nullptr; + } + + alignas(16) static std::array w133_data; + uint32_t w133 = XNN_INVALID_VALUE_ID; + std::array w133_dims = {{64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w133_dims.size(), w133_dims.data(), + /*data=*/w133_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w133); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w133" << std::endl; + return nullptr; + } + + alignas(16) static std::array w134_data; + uint32_t w134 = XNN_INVALID_VALUE_ID; + std::array w134_dims = {{24, 1, 1, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w134_dims.size(), w134_dims.data(), + /*data=*/w134_data.data(), + XNN_INVALID_VALUE_ID, 
/*flags=*/0, &w134); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w134" << std::endl; + return nullptr; + } + + alignas(16) static std::array w135_data; + uint32_t w135 = XNN_INVALID_VALUE_ID; + std::array w135_dims = {{24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w135_dims.size(), w135_dims.data(), + /*data=*/w135_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w135); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w135" << std::endl; + return nullptr; + } + + alignas(16) static std::array w136_data; + uint32_t w136 = XNN_INVALID_VALUE_ID; + std::array w136_dims = {{72, 1, 1, 24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w136_dims.size(), w136_dims.data(), + /*data=*/w136_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w136); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w136" << std::endl; + return nullptr; + } + + alignas(16) static std::array w137_data; + uint32_t w137 = XNN_INVALID_VALUE_ID; + std::array w137_dims = {{72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w137_dims.size(), w137_dims.data(), + /*data=*/w137_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w137); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w137" << std::endl; + return nullptr; + } + + alignas(16) static std::array w138_data; + uint32_t w138 = XNN_INVALID_VALUE_ID; + std::array w138_dims = {{1, 3, 3, 72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w138_dims.size(), w138_dims.data(), + /*data=*/w138_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w138); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w138" << std::endl; + return nullptr; + } + + alignas(16) static std::array w139_data; + uint32_t w139 = XNN_INVALID_VALUE_ID; + std::array w139_dims = {{72}}; + status = xnn_define_tensor_value( + subgraph, 
xnn_datatype_fp32, + w139_dims.size(), w139_dims.data(), + /*data=*/w139_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w139); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w139" << std::endl; + return nullptr; + } + + alignas(16) static std::array w140_data; + uint32_t w140 = XNN_INVALID_VALUE_ID; + std::array w140_dims = {{24, 1, 1, 72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w140_dims.size(), w140_dims.data(), + /*data=*/w140_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w140); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w140" << std::endl; + return nullptr; + } + + alignas(16) static std::array w141_data; + uint32_t w141 = XNN_INVALID_VALUE_ID; + std::array w141_dims = {{24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w141_dims.size(), w141_dims.data(), + /*data=*/w141_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w141); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w141" << std::endl; + return nullptr; + } + + alignas(16) static std::array w142_data; + uint32_t w142 = XNN_INVALID_VALUE_ID; + std::array w142_dims = {{72, 1, 1, 24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w142_dims.size(), w142_dims.data(), + /*data=*/w142_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w142); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w142" << std::endl; + return nullptr; + } + + alignas(16) static std::array w143_data; + uint32_t w143 = XNN_INVALID_VALUE_ID; + std::array w143_dims = {{72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w143_dims.size(), w143_dims.data(), + /*data=*/w143_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w143); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w143" << std::endl; + return nullptr; + } + + alignas(16) static std::array w144_data; + uint32_t w144 
= XNN_INVALID_VALUE_ID; + std::array w144_dims = {{1, 5, 5, 72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w144_dims.size(), w144_dims.data(), + /*data=*/w144_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w144); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w144" << std::endl; + return nullptr; + } + + alignas(16) static std::array w145_data; + uint32_t w145 = XNN_INVALID_VALUE_ID; + std::array w145_dims = {{72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w145_dims.size(), w145_dims.data(), + /*data=*/w145_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w145); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w145" << std::endl; + return nullptr; + } + + alignas(16) static std::array w146_data; + uint32_t w146 = XNN_INVALID_VALUE_ID; + std::array w146_dims = {{24, 1, 1, 72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w146_dims.size(), w146_dims.data(), + /*data=*/w146_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w146); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w146" << std::endl; + return nullptr; + } + + alignas(16) static std::array w147_data; + uint32_t w147 = XNN_INVALID_VALUE_ID; + std::array w147_dims = {{24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w147_dims.size(), w147_dims.data(), + /*data=*/w147_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w147); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w147" << std::endl; + return nullptr; + } + + alignas(16) static std::array w148_data; + uint32_t w148 = XNN_INVALID_VALUE_ID; + std::array w148_dims = {{72, 1, 1, 24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w148_dims.size(), w148_dims.data(), + /*data=*/w148_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w148); + if (status != xnn_status_success) { + std::cerr << "failed to 
create tensor w148" << std::endl; + return nullptr; + } + + alignas(16) static std::array w149_data; + uint32_t w149 = XNN_INVALID_VALUE_ID; + std::array w149_dims = {{72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w149_dims.size(), w149_dims.data(), + /*data=*/w149_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w149); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w149" << std::endl; + return nullptr; + } + + alignas(16) static std::array w150_data; + uint32_t w150 = XNN_INVALID_VALUE_ID; + std::array w150_dims = {{1}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w150_dims.size(), w150_dims.data(), + /*data=*/w150_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w150); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w150" << std::endl; + return nullptr; + } + + alignas(16) static std::array w151_data; + uint32_t w151 = XNN_INVALID_VALUE_ID; + std::array w151_dims = {{40, 1, 1, 72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w151_dims.size(), w151_dims.data(), + /*data=*/w151_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w151); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w151" << std::endl; + return nullptr; + } + + alignas(16) static std::array w152_data; + uint32_t w152 = XNN_INVALID_VALUE_ID; + std::array w152_dims = {{40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w152_dims.size(), w152_dims.data(), + /*data=*/w152_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w152); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w152" << std::endl; + return nullptr; + } + + alignas(16) static std::array w153_data; + uint32_t w153 = XNN_INVALID_VALUE_ID; + std::array w153_dims = {{120, 1, 1, 40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w153_dims.size(), w153_dims.data(), + /*data=*/w153_data.data(), + 
XNN_INVALID_VALUE_ID, /*flags=*/0, &w153); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w153" << std::endl; + return nullptr; + } + + alignas(16) static std::array w154_data; + uint32_t w154 = XNN_INVALID_VALUE_ID; + std::array w154_dims = {{120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w154_dims.size(), w154_dims.data(), + /*data=*/w154_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w154); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w154" << std::endl; + return nullptr; + } + + alignas(16) static std::array w155_data; + uint32_t w155 = XNN_INVALID_VALUE_ID; + std::array w155_dims = {{1, 5, 5, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w155_dims.size(), w155_dims.data(), + /*data=*/w155_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w155); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w155" << std::endl; + return nullptr; + } + + alignas(16) static std::array w156_data; + uint32_t w156 = XNN_INVALID_VALUE_ID; + std::array w156_dims = {{120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w156_dims.size(), w156_dims.data(), + /*data=*/w156_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w156); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w156" << std::endl; + return nullptr; + } + + alignas(16) static std::array w157_data; + uint32_t w157 = XNN_INVALID_VALUE_ID; + std::array w157_dims = {{32, 1, 1, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w157_dims.size(), w157_dims.data(), + /*data=*/w157_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w157); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w157" << std::endl; + return nullptr; + } + + alignas(16) static std::array w158_data; + uint32_t w158 = XNN_INVALID_VALUE_ID; + std::array w158_dims = {{32}}; + status = 
xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w158_dims.size(), w158_dims.data(), + /*data=*/w158_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w158); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w158" << std::endl; + return nullptr; + } + + alignas(16) static std::array w159_data; + uint32_t w159 = XNN_INVALID_VALUE_ID; + std::array w159_dims = {{120, 1, 1, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w159_dims.size(), w159_dims.data(), + /*data=*/w159_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w159); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w159" << std::endl; + return nullptr; + } + + alignas(16) static std::array w160_data; + uint32_t w160 = XNN_INVALID_VALUE_ID; + std::array w160_dims = {{120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w160_dims.size(), w160_dims.data(), + /*data=*/w160_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w160); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w160" << std::endl; + return nullptr; + } + + alignas(16) static std::array w161_data; + uint32_t w161 = XNN_INVALID_VALUE_ID; + std::array w161_dims = {{1}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w161_dims.size(), w161_dims.data(), + /*data=*/w161_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w161); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w161" << std::endl; + return nullptr; + } + + alignas(16) static std::array w162_data; + uint32_t w162 = XNN_INVALID_VALUE_ID; + std::array w162_dims = {{40, 1, 1, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w162_dims.size(), w162_dims.data(), + /*data=*/w162_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w162); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w162" << std::endl; + return nullptr; + } + + alignas(16) 
static std::array w163_data; + uint32_t w163 = XNN_INVALID_VALUE_ID; + std::array w163_dims = {{40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w163_dims.size(), w163_dims.data(), + /*data=*/w163_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w163); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w163" << std::endl; + return nullptr; + } + + alignas(16) static std::array w164_data; + uint32_t w164 = XNN_INVALID_VALUE_ID; + std::array w164_dims = {{120, 1, 1, 40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w164_dims.size(), w164_dims.data(), + /*data=*/w164_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w164); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w164" << std::endl; + return nullptr; + } + + alignas(16) static std::array w165_data; + uint32_t w165 = XNN_INVALID_VALUE_ID; + std::array w165_dims = {{120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w165_dims.size(), w165_dims.data(), + /*data=*/w165_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w165); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w165" << std::endl; + return nullptr; + } + + alignas(16) static std::array w166_data; + uint32_t w166 = XNN_INVALID_VALUE_ID; + std::array w166_dims = {{1, 5, 5, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w166_dims.size(), w166_dims.data(), + /*data=*/w166_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w166); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w166" << std::endl; + return nullptr; + } + + alignas(16) static std::array w167_data; + uint32_t w167 = XNN_INVALID_VALUE_ID; + std::array w167_dims = {{120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w167_dims.size(), w167_dims.data(), + /*data=*/w167_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w167); + if (status != 
xnn_status_success) { + std::cerr << "failed to create tensor w167" << std::endl; + return nullptr; + } + + alignas(16) static std::array w168_data; + uint32_t w168 = XNN_INVALID_VALUE_ID; + std::array w168_dims = {{32, 1, 1, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w168_dims.size(), w168_dims.data(), + /*data=*/w168_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w168); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w168" << std::endl; + return nullptr; + } + + alignas(16) static std::array w169_data; + uint32_t w169 = XNN_INVALID_VALUE_ID; + std::array w169_dims = {{32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w169_dims.size(), w169_dims.data(), + /*data=*/w169_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w169); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w169" << std::endl; + return nullptr; + } + + alignas(16) static std::array w170_data; + uint32_t w170 = XNN_INVALID_VALUE_ID; + std::array w170_dims = {{120, 1, 1, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w170_dims.size(), w170_dims.data(), + /*data=*/w170_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w170); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w170" << std::endl; + return nullptr; + } + + alignas(16) static std::array w171_data; + uint32_t w171 = XNN_INVALID_VALUE_ID; + std::array w171_dims = {{120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w171_dims.size(), w171_dims.data(), + /*data=*/w171_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w171); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w171" << std::endl; + return nullptr; + } + + alignas(16) static std::array w172_data; + uint32_t w172 = XNN_INVALID_VALUE_ID; + std::array w172_dims = {{1}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w172_dims.size(), 
w172_dims.data(), + /*data=*/w172_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w172); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w172" << std::endl; + return nullptr; + } + + alignas(16) static std::array w173_data; + uint32_t w173 = XNN_INVALID_VALUE_ID; + std::array w173_dims = {{40, 1, 1, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w173_dims.size(), w173_dims.data(), + /*data=*/w173_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w173); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w173" << std::endl; + return nullptr; + } + + alignas(16) static std::array w174_data; + uint32_t w174 = XNN_INVALID_VALUE_ID; + std::array w174_dims = {{40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w174_dims.size(), w174_dims.data(), + /*data=*/w174_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w174); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w174" << std::endl; + return nullptr; + } + + alignas(16) static std::array w175_data; + uint32_t w175 = XNN_INVALID_VALUE_ID; + std::array w175_dims = {{240, 1, 1, 40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w175_dims.size(), w175_dims.data(), + /*data=*/w175_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w175); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w175" << std::endl; + return nullptr; + } + + alignas(16) static std::array w176_data; + uint32_t w176 = XNN_INVALID_VALUE_ID; + std::array w176_dims = {{240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w176_dims.size(), w176_dims.data(), + /*data=*/w176_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w176); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w176" << std::endl; + return nullptr; + } + + alignas(16) static std::array w177_data; + uint32_t w177 = XNN_INVALID_VALUE_ID; + 
std::array w177_dims = {{1, 3, 3, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w177_dims.size(), w177_dims.data(), + /*data=*/w177_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w177); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w177" << std::endl; + return nullptr; + } + + alignas(16) static std::array w178_data; + uint32_t w178 = XNN_INVALID_VALUE_ID; + std::array w178_dims = {{240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w178_dims.size(), w178_dims.data(), + /*data=*/w178_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w178); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w178" << std::endl; + return nullptr; + } + + alignas(16) static std::array w179_data; + uint32_t w179 = XNN_INVALID_VALUE_ID; + std::array w179_dims = {{80, 1, 1, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w179_dims.size(), w179_dims.data(), + /*data=*/w179_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w179); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w179" << std::endl; + return nullptr; + } + + alignas(16) static std::array w180_data; + uint32_t w180 = XNN_INVALID_VALUE_ID; + std::array w180_dims = {{80}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w180_dims.size(), w180_dims.data(), + /*data=*/w180_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w180); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w180" << std::endl; + return nullptr; + } + + alignas(16) static std::array w181_data; + uint32_t w181 = XNN_INVALID_VALUE_ID; + std::array w181_dims = {{200, 1, 1, 80}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w181_dims.size(), w181_dims.data(), + /*data=*/w181_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w181); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w181" << 
std::endl; + return nullptr; + } + + alignas(16) static std::array w182_data; + uint32_t w182 = XNN_INVALID_VALUE_ID; + std::array w182_dims = {{200}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w182_dims.size(), w182_dims.data(), + /*data=*/w182_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w182); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w182" << std::endl; + return nullptr; + } + + alignas(16) static std::array w183_data; + uint32_t w183 = XNN_INVALID_VALUE_ID; + std::array w183_dims = {{1, 3, 3, 200}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w183_dims.size(), w183_dims.data(), + /*data=*/w183_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w183); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w183" << std::endl; + return nullptr; + } + + alignas(16) static std::array w184_data; + uint32_t w184 = XNN_INVALID_VALUE_ID; + std::array w184_dims = {{200}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w184_dims.size(), w184_dims.data(), + /*data=*/w184_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w184); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w184" << std::endl; + return nullptr; + } + + alignas(16) static std::array w185_data; + uint32_t w185 = XNN_INVALID_VALUE_ID; + std::array w185_dims = {{80, 1, 1, 200}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w185_dims.size(), w185_dims.data(), + /*data=*/w185_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w185); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w185" << std::endl; + return nullptr; + } + + alignas(16) static std::array w186_data; + uint32_t w186 = XNN_INVALID_VALUE_ID; + std::array w186_dims = {{80}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w186_dims.size(), w186_dims.data(), + /*data=*/w186_data.data(), + XNN_INVALID_VALUE_ID, 
/*flags=*/0, &w186); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w186" << std::endl; + return nullptr; + } + + alignas(16) static std::array w187_data; + uint32_t w187 = XNN_INVALID_VALUE_ID; + std::array w187_dims = {{184, 1, 1, 80}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w187_dims.size(), w187_dims.data(), + /*data=*/w187_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w187); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w187" << std::endl; + return nullptr; + } + + alignas(16) static std::array w188_data; + uint32_t w188 = XNN_INVALID_VALUE_ID; + std::array w188_dims = {{184}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w188_dims.size(), w188_dims.data(), + /*data=*/w188_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w188); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w188" << std::endl; + return nullptr; + } + + alignas(16) static std::array w189_data; + uint32_t w189 = XNN_INVALID_VALUE_ID; + std::array w189_dims = {{1, 3, 3, 184}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w189_dims.size(), w189_dims.data(), + /*data=*/w189_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w189); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w189" << std::endl; + return nullptr; + } + + alignas(16) static std::array w190_data; + uint32_t w190 = XNN_INVALID_VALUE_ID; + std::array w190_dims = {{184}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w190_dims.size(), w190_dims.data(), + /*data=*/w190_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w190); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w190" << std::endl; + return nullptr; + } + + alignas(16) static std::array w191_data; + uint32_t w191 = XNN_INVALID_VALUE_ID; + std::array w191_dims = {{80, 1, 1, 184}}; + status = xnn_define_tensor_value( + 
subgraph, xnn_datatype_fp32, + w191_dims.size(), w191_dims.data(), + /*data=*/w191_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w191); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w191" << std::endl; + return nullptr; + } + + alignas(16) static std::array w192_data; + uint32_t w192 = XNN_INVALID_VALUE_ID; + std::array w192_dims = {{80}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w192_dims.size(), w192_dims.data(), + /*data=*/w192_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w192); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w192" << std::endl; + return nullptr; + } + + alignas(16) static std::array w193_data; + uint32_t w193 = XNN_INVALID_VALUE_ID; + std::array w193_dims = {{184, 1, 1, 80}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w193_dims.size(), w193_dims.data(), + /*data=*/w193_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w193); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w193" << std::endl; + return nullptr; + } + + alignas(16) static std::array w194_data; + uint32_t w194 = XNN_INVALID_VALUE_ID; + std::array w194_dims = {{184}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w194_dims.size(), w194_dims.data(), + /*data=*/w194_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w194); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w194" << std::endl; + return nullptr; + } + + alignas(16) static std::array w195_data; + uint32_t w195 = XNN_INVALID_VALUE_ID; + std::array w195_dims = {{1, 3, 3, 184}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w195_dims.size(), w195_dims.data(), + /*data=*/w195_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w195); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w195" << std::endl; + return nullptr; + } + + alignas(16) static std::array w196_data; + 
uint32_t w196 = XNN_INVALID_VALUE_ID; + std::array w196_dims = {{184}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w196_dims.size(), w196_dims.data(), + /*data=*/w196_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w196); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w196" << std::endl; + return nullptr; + } + + alignas(16) static std::array w197_data; + uint32_t w197 = XNN_INVALID_VALUE_ID; + std::array w197_dims = {{80, 1, 1, 184}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w197_dims.size(), w197_dims.data(), + /*data=*/w197_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w197); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w197" << std::endl; + return nullptr; + } + + alignas(16) static std::array w198_data; + uint32_t w198 = XNN_INVALID_VALUE_ID; + std::array w198_dims = {{80}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w198_dims.size(), w198_dims.data(), + /*data=*/w198_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w198); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w198" << std::endl; + return nullptr; + } + + alignas(16) static std::array w199_data; + uint32_t w199 = XNN_INVALID_VALUE_ID; + std::array w199_dims = {{480, 1, 1, 80}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w199_dims.size(), w199_dims.data(), + /*data=*/w199_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w199); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w199" << std::endl; + return nullptr; + } + + alignas(16) static std::array w200_data; + uint32_t w200 = XNN_INVALID_VALUE_ID; + std::array w200_dims = {{480}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w200_dims.size(), w200_dims.data(), + /*data=*/w200_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w200); + if (status != xnn_status_success) { + std::cerr << 
"failed to create tensor w200" << std::endl; + return nullptr; + } + + alignas(16) static std::array w201_data; + uint32_t w201 = XNN_INVALID_VALUE_ID; + std::array w201_dims = {{1, 3, 3, 480}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w201_dims.size(), w201_dims.data(), + /*data=*/w201_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w201); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w201" << std::endl; + return nullptr; + } + + alignas(16) static std::array w202_data; + uint32_t w202 = XNN_INVALID_VALUE_ID; + std::array w202_dims = {{480}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w202_dims.size(), w202_dims.data(), + /*data=*/w202_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w202); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w202" << std::endl; + return nullptr; + } + + alignas(16) static std::array w203_data; + uint32_t w203 = XNN_INVALID_VALUE_ID; + std::array w203_dims = {{120, 1, 1, 480}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w203_dims.size(), w203_dims.data(), + /*data=*/w203_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w203); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w203" << std::endl; + return nullptr; + } + + alignas(16) static std::array w204_data; + uint32_t w204 = XNN_INVALID_VALUE_ID; + std::array w204_dims = {{120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w204_dims.size(), w204_dims.data(), + /*data=*/w204_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w204); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w204" << std::endl; + return nullptr; + } + + alignas(16) static std::array w205_data; + uint32_t w205 = XNN_INVALID_VALUE_ID; + std::array w205_dims = {{480, 1, 1, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w205_dims.size(), w205_dims.data(), + 
/*data=*/w205_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w205); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w205" << std::endl; + return nullptr; + } + + alignas(16) static std::array w206_data; + uint32_t w206 = XNN_INVALID_VALUE_ID; + std::array w206_dims = {{480}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w206_dims.size(), w206_dims.data(), + /*data=*/w206_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w206); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w206" << std::endl; + return nullptr; + } + + alignas(16) static std::array w207_data; + uint32_t w207 = XNN_INVALID_VALUE_ID; + std::array w207_dims = {{1}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w207_dims.size(), w207_dims.data(), + /*data=*/w207_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w207); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w207" << std::endl; + return nullptr; + } + + alignas(16) static std::array w208_data; + uint32_t w208 = XNN_INVALID_VALUE_ID; + std::array w208_dims = {{112, 1, 1, 480}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w208_dims.size(), w208_dims.data(), + /*data=*/w208_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w208); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w208" << std::endl; + return nullptr; + } + + alignas(16) static std::array w209_data; + uint32_t w209 = XNN_INVALID_VALUE_ID; + std::array w209_dims = {{112}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w209_dims.size(), w209_dims.data(), + /*data=*/w209_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w209); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w209" << std::endl; + return nullptr; + } + + alignas(16) static std::array w210_data; + uint32_t w210 = XNN_INVALID_VALUE_ID; + std::array w210_dims = {{672, 1, 1, 
112}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w210_dims.size(), w210_dims.data(), + /*data=*/w210_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w210); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w210" << std::endl; + return nullptr; + } + + alignas(16) static std::array w211_data; + uint32_t w211 = XNN_INVALID_VALUE_ID; + std::array w211_dims = {{672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w211_dims.size(), w211_dims.data(), + /*data=*/w211_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w211); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w211" << std::endl; + return nullptr; + } + + alignas(16) static std::array w212_data; + uint32_t w212 = XNN_INVALID_VALUE_ID; + std::array w212_dims = {{1, 3, 3, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w212_dims.size(), w212_dims.data(), + /*data=*/w212_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w212); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w212" << std::endl; + return nullptr; + } + + alignas(16) static std::array w213_data; + uint32_t w213 = XNN_INVALID_VALUE_ID; + std::array w213_dims = {{672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w213_dims.size(), w213_dims.data(), + /*data=*/w213_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w213); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w213" << std::endl; + return nullptr; + } + + alignas(16) static std::array w214_data; + uint32_t w214 = XNN_INVALID_VALUE_ID; + std::array w214_dims = {{168, 1, 1, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w214_dims.size(), w214_dims.data(), + /*data=*/w214_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w214); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w214" << std::endl; + return nullptr; + } + 
+ alignas(16) static std::array w215_data; + uint32_t w215 = XNN_INVALID_VALUE_ID; + std::array w215_dims = {{168}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w215_dims.size(), w215_dims.data(), + /*data=*/w215_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w215); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w215" << std::endl; + return nullptr; + } + + alignas(16) static std::array w216_data; + uint32_t w216 = XNN_INVALID_VALUE_ID; + std::array w216_dims = {{672, 1, 1, 168}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w216_dims.size(), w216_dims.data(), + /*data=*/w216_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w216); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w216" << std::endl; + return nullptr; + } + + alignas(16) static std::array w217_data; + uint32_t w217 = XNN_INVALID_VALUE_ID; + std::array w217_dims = {{672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w217_dims.size(), w217_dims.data(), + /*data=*/w217_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w217); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w217" << std::endl; + return nullptr; + } + + alignas(16) static std::array w218_data; + uint32_t w218 = XNN_INVALID_VALUE_ID; + std::array w218_dims = {{1}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w218_dims.size(), w218_dims.data(), + /*data=*/w218_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w218); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w218" << std::endl; + return nullptr; + } + + alignas(16) static std::array w219_data; + uint32_t w219 = XNN_INVALID_VALUE_ID; + std::array w219_dims = {{112, 1, 1, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w219_dims.size(), w219_dims.data(), + /*data=*/w219_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w219); + if 
(status != xnn_status_success) { + std::cerr << "failed to create tensor w219" << std::endl; + return nullptr; + } + + alignas(16) static std::array w220_data; + uint32_t w220 = XNN_INVALID_VALUE_ID; + std::array w220_dims = {{112}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w220_dims.size(), w220_dims.data(), + /*data=*/w220_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w220); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w220" << std::endl; + return nullptr; + } + + alignas(16) static std::array w221_data; + uint32_t w221 = XNN_INVALID_VALUE_ID; + std::array w221_dims = {{672, 1, 1, 112}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w221_dims.size(), w221_dims.data(), + /*data=*/w221_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w221); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w221" << std::endl; + return nullptr; + } + + alignas(16) static std::array w222_data; + uint32_t w222 = XNN_INVALID_VALUE_ID; + std::array w222_dims = {{672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w222_dims.size(), w222_dims.data(), + /*data=*/w222_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w222); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w222" << std::endl; + return nullptr; + } + + alignas(16) static std::array w223_data; + uint32_t w223 = XNN_INVALID_VALUE_ID; + std::array w223_dims = {{1, 5, 5, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w223_dims.size(), w223_dims.data(), + /*data=*/w223_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w223); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w223" << std::endl; + return nullptr; + } + + alignas(16) static std::array w224_data; + uint32_t w224 = XNN_INVALID_VALUE_ID; + std::array w224_dims = {{672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + 
w224_dims.size(), w224_dims.data(), + /*data=*/w224_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w224); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w224" << std::endl; + return nullptr; + } + + alignas(16) static std::array w225_data; + uint32_t w225 = XNN_INVALID_VALUE_ID; + std::array w225_dims = {{168, 1, 1, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w225_dims.size(), w225_dims.data(), + /*data=*/w225_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w225); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w225" << std::endl; + return nullptr; + } + + alignas(16) static std::array w226_data; + uint32_t w226 = XNN_INVALID_VALUE_ID; + std::array w226_dims = {{168}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w226_dims.size(), w226_dims.data(), + /*data=*/w226_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w226); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w226" << std::endl; + return nullptr; + } + + alignas(16) static std::array w227_data; + uint32_t w227 = XNN_INVALID_VALUE_ID; + std::array w227_dims = {{672, 1, 1, 168}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w227_dims.size(), w227_dims.data(), + /*data=*/w227_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w227); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w227" << std::endl; + return nullptr; + } + + alignas(16) static std::array w228_data; + uint32_t w228 = XNN_INVALID_VALUE_ID; + std::array w228_dims = {{672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w228_dims.size(), w228_dims.data(), + /*data=*/w228_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w228); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w228" << std::endl; + return nullptr; + } + + alignas(16) static std::array w229_data; + uint32_t w229 = 
XNN_INVALID_VALUE_ID; + std::array w229_dims = {{1}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w229_dims.size(), w229_dims.data(), + /*data=*/w229_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w229); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w229" << std::endl; + return nullptr; + } + + alignas(16) static std::array w230_data; + uint32_t w230 = XNN_INVALID_VALUE_ID; + std::array w230_dims = {{160, 1, 1, 672}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w230_dims.size(), w230_dims.data(), + /*data=*/w230_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w230); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w230" << std::endl; + return nullptr; + } + + alignas(16) static std::array w231_data; + uint32_t w231 = XNN_INVALID_VALUE_ID; + std::array w231_dims = {{160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w231_dims.size(), w231_dims.data(), + /*data=*/w231_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w231); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w231" << std::endl; + return nullptr; + } + + alignas(16) static std::array w232_data; + uint32_t w232 = XNN_INVALID_VALUE_ID; + std::array w232_dims = {{960, 1, 1, 160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w232_dims.size(), w232_dims.data(), + /*data=*/w232_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w232); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w232" << std::endl; + return nullptr; + } + + alignas(16) static std::array w233_data; + uint32_t w233 = XNN_INVALID_VALUE_ID; + std::array w233_dims = {{960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w233_dims.size(), w233_dims.data(), + /*data=*/w233_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w233); + if (status != xnn_status_success) { + std::cerr << "failed to create 
tensor w233" << std::endl; + return nullptr; + } + + alignas(16) static std::array w234_data; + uint32_t w234 = XNN_INVALID_VALUE_ID; + std::array w234_dims = {{1, 5, 5, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w234_dims.size(), w234_dims.data(), + /*data=*/w234_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w234); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w234" << std::endl; + return nullptr; + } + + alignas(16) static std::array w235_data; + uint32_t w235 = XNN_INVALID_VALUE_ID; + std::array w235_dims = {{960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w235_dims.size(), w235_dims.data(), + /*data=*/w235_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w235); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w235" << std::endl; + return nullptr; + } + + alignas(16) static std::array w236_data; + uint32_t w236 = XNN_INVALID_VALUE_ID; + std::array w236_dims = {{240, 1, 1, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w236_dims.size(), w236_dims.data(), + /*data=*/w236_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w236); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w236" << std::endl; + return nullptr; + } + + alignas(16) static std::array w237_data; + uint32_t w237 = XNN_INVALID_VALUE_ID; + std::array w237_dims = {{240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w237_dims.size(), w237_dims.data(), + /*data=*/w237_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w237); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w237" << std::endl; + return nullptr; + } + + alignas(16) static std::array w238_data; + uint32_t w238 = XNN_INVALID_VALUE_ID; + std::array w238_dims = {{960, 1, 1, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w238_dims.size(), w238_dims.data(), + 
/*data=*/w238_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w238); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w238" << std::endl; + return nullptr; + } + + alignas(16) static std::array w239_data; + uint32_t w239 = XNN_INVALID_VALUE_ID; + std::array w239_dims = {{960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w239_dims.size(), w239_dims.data(), + /*data=*/w239_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w239); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w239" << std::endl; + return nullptr; + } + + alignas(16) static std::array w240_data; + uint32_t w240 = XNN_INVALID_VALUE_ID; + std::array w240_dims = {{1}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w240_dims.size(), w240_dims.data(), + /*data=*/w240_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w240); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w240" << std::endl; + return nullptr; + } + + alignas(16) static std::array w241_data; + uint32_t w241 = XNN_INVALID_VALUE_ID; + std::array w241_dims = {{160, 1, 1, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w241_dims.size(), w241_dims.data(), + /*data=*/w241_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w241); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w241" << std::endl; + return nullptr; + } + + alignas(16) static std::array w242_data; + uint32_t w242 = XNN_INVALID_VALUE_ID; + std::array w242_dims = {{160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w242_dims.size(), w242_dims.data(), + /*data=*/w242_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w242); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w242" << std::endl; + return nullptr; + } + + alignas(16) static std::array w243_data; + uint32_t w243 = XNN_INVALID_VALUE_ID; + std::array w243_dims = {{960, 1, 1, 
160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w243_dims.size(), w243_dims.data(), + /*data=*/w243_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w243); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w243" << std::endl; + return nullptr; + } + + alignas(16) static std::array w244_data; + uint32_t w244 = XNN_INVALID_VALUE_ID; + std::array w244_dims = {{960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w244_dims.size(), w244_dims.data(), + /*data=*/w244_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w244); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w244" << std::endl; + return nullptr; + } + + alignas(16) static std::array w245_data; + uint32_t w245 = XNN_INVALID_VALUE_ID; + std::array w245_dims = {{1, 5, 5, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w245_dims.size(), w245_dims.data(), + /*data=*/w245_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w245); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w245" << std::endl; + return nullptr; + } + + alignas(16) static std::array w246_data; + uint32_t w246 = XNN_INVALID_VALUE_ID; + std::array w246_dims = {{960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w246_dims.size(), w246_dims.data(), + /*data=*/w246_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w246); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w246" << std::endl; + return nullptr; + } + + alignas(16) static std::array w247_data; + uint32_t w247 = XNN_INVALID_VALUE_ID; + std::array w247_dims = {{240, 1, 1, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w247_dims.size(), w247_dims.data(), + /*data=*/w247_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w247); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w247" << std::endl; + return nullptr; + } + 
+ alignas(16) static std::array w248_data; + uint32_t w248 = XNN_INVALID_VALUE_ID; + std::array w248_dims = {{240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w248_dims.size(), w248_dims.data(), + /*data=*/w248_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w248); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w248" << std::endl; + return nullptr; + } + + alignas(16) static std::array w249_data; + uint32_t w249 = XNN_INVALID_VALUE_ID; + std::array w249_dims = {{960, 1, 1, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w249_dims.size(), w249_dims.data(), + /*data=*/w249_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w249); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w249" << std::endl; + return nullptr; + } + + alignas(16) static std::array w250_data; + uint32_t w250 = XNN_INVALID_VALUE_ID; + std::array w250_dims = {{960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w250_dims.size(), w250_dims.data(), + /*data=*/w250_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w250); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w250" << std::endl; + return nullptr; + } + + alignas(16) static std::array w251_data; + uint32_t w251 = XNN_INVALID_VALUE_ID; + std::array w251_dims = {{1}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w251_dims.size(), w251_dims.data(), + /*data=*/w251_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w251); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w251" << std::endl; + return nullptr; + } + + alignas(16) static std::array w252_data; + uint32_t w252 = XNN_INVALID_VALUE_ID; + std::array w252_dims = {{160, 1, 1, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w252_dims.size(), w252_dims.data(), + /*data=*/w252_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w252); + if 
(status != xnn_status_success) { + std::cerr << "failed to create tensor w252" << std::endl; + return nullptr; + } + + alignas(16) static std::array w253_data; + uint32_t w253 = XNN_INVALID_VALUE_ID; + std::array w253_dims = {{160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w253_dims.size(), w253_dims.data(), + /*data=*/w253_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w253); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w253" << std::endl; + return nullptr; + } + + alignas(16) static std::array w254_data; + uint32_t w254 = XNN_INVALID_VALUE_ID; + std::array w254_dims = {{960, 1, 1, 160}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w254_dims.size(), w254_dims.data(), + /*data=*/w254_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w254); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w254" << std::endl; + return nullptr; + } + + alignas(16) static std::array w255_data; + uint32_t w255 = XNN_INVALID_VALUE_ID; + std::array w255_dims = {{960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w255_dims.size(), w255_dims.data(), + /*data=*/w255_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w255); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w255" << std::endl; + return nullptr; + } + + alignas(16) static std::array w256_data; + uint32_t w256 = XNN_INVALID_VALUE_ID; + std::array w256_dims = {{1280, 1, 1, 960}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w256_dims.size(), w256_dims.data(), + /*data=*/w256_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w256); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w256" << std::endl; + return nullptr; + } + + alignas(16) static std::array w257_data; + uint32_t w257 = XNN_INVALID_VALUE_ID; + std::array w257_dims = {{1280}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + 
w257_dims.size(), w257_dims.data(), + /*data=*/w257_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w257); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w257" << std::endl; + return nullptr; + } + + alignas(16) static std::array w258_data; + uint32_t w258 = XNN_INVALID_VALUE_ID; + std::array w258_dims = {{1001, 1, 1, 1280}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w258_dims.size(), w258_dims.data(), + /*data=*/w258_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w258); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w258" << std::endl; + return nullptr; + } + + alignas(16) static std::array w259_data; + uint32_t w259 = XNN_INVALID_VALUE_ID; + std::array w259_dims = {{1001}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w259_dims.size(), w259_dims.data(), + /*data=*/w259_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w259); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w259" << std::endl; + return nullptr; + } + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); + std::generate(w124_data.begin(), w124_data.end(), std::ref(f32rng)); + std::generate(w125_data.begin(), w125_data.end(), std::ref(f32rng)); + std::generate(w126_data.begin(), w126_data.end(), std::ref(f32rng)); + std::generate(w127_data.begin(), w127_data.end(), std::ref(f32rng)); + std::generate(w128_data.begin(), w128_data.end(), std::ref(f32rng)); + std::generate(w129_data.begin(), w129_data.end(), std::ref(f32rng)); + std::generate(w130_data.begin(), w130_data.end(), std::ref(f32rng)); + std::generate(w131_data.begin(), w131_data.end(), std::ref(f32rng)); + std::generate(w132_data.begin(), w132_data.end(), std::ref(f32rng)); + std::generate(w133_data.begin(), w133_data.end(), std::ref(f32rng)); + std::generate(w134_data.begin(), 
w134_data.end(), std::ref(f32rng)); + std::generate(w135_data.begin(), w135_data.end(), std::ref(f32rng)); + std::generate(w136_data.begin(), w136_data.end(), std::ref(f32rng)); + std::generate(w137_data.begin(), w137_data.end(), std::ref(f32rng)); + std::generate(w138_data.begin(), w138_data.end(), std::ref(f32rng)); + std::generate(w139_data.begin(), w139_data.end(), std::ref(f32rng)); + std::generate(w140_data.begin(), w140_data.end(), std::ref(f32rng)); + std::generate(w141_data.begin(), w141_data.end(), std::ref(f32rng)); + std::generate(w142_data.begin(), w142_data.end(), std::ref(f32rng)); + std::generate(w143_data.begin(), w143_data.end(), std::ref(f32rng)); + std::generate(w144_data.begin(), w144_data.end(), std::ref(f32rng)); + std::generate(w145_data.begin(), w145_data.end(), std::ref(f32rng)); + std::generate(w146_data.begin(), w146_data.end(), std::ref(f32rng)); + std::generate(w147_data.begin(), w147_data.end(), std::ref(f32rng)); + std::generate(w148_data.begin(), w148_data.end(), std::ref(f32rng)); + std::generate(w149_data.begin(), w149_data.end(), std::ref(f32rng)); + std::generate(w150_data.begin(), w150_data.end(), std::ref(f32rng)); + std::generate(w151_data.begin(), w151_data.end(), std::ref(f32rng)); + std::generate(w152_data.begin(), w152_data.end(), std::ref(f32rng)); + std::generate(w153_data.begin(), w153_data.end(), std::ref(f32rng)); + std::generate(w154_data.begin(), w154_data.end(), std::ref(f32rng)); + std::generate(w155_data.begin(), w155_data.end(), std::ref(f32rng)); + std::generate(w156_data.begin(), w156_data.end(), std::ref(f32rng)); + std::generate(w157_data.begin(), w157_data.end(), std::ref(f32rng)); + std::generate(w158_data.begin(), w158_data.end(), std::ref(f32rng)); + std::generate(w159_data.begin(), w159_data.end(), std::ref(f32rng)); + std::generate(w160_data.begin(), w160_data.end(), std::ref(f32rng)); + std::generate(w161_data.begin(), w161_data.end(), std::ref(f32rng)); + std::generate(w162_data.begin(), 
w162_data.end(), std::ref(f32rng)); + std::generate(w163_data.begin(), w163_data.end(), std::ref(f32rng)); + std::generate(w164_data.begin(), w164_data.end(), std::ref(f32rng)); + std::generate(w165_data.begin(), w165_data.end(), std::ref(f32rng)); + std::generate(w166_data.begin(), w166_data.end(), std::ref(f32rng)); + std::generate(w167_data.begin(), w167_data.end(), std::ref(f32rng)); + std::generate(w168_data.begin(), w168_data.end(), std::ref(f32rng)); + std::generate(w169_data.begin(), w169_data.end(), std::ref(f32rng)); + std::generate(w170_data.begin(), w170_data.end(), std::ref(f32rng)); + std::generate(w171_data.begin(), w171_data.end(), std::ref(f32rng)); + std::generate(w172_data.begin(), w172_data.end(), std::ref(f32rng)); + std::generate(w173_data.begin(), w173_data.end(), std::ref(f32rng)); + std::generate(w174_data.begin(), w174_data.end(), std::ref(f32rng)); + std::generate(w175_data.begin(), w175_data.end(), std::ref(f32rng)); + std::generate(w176_data.begin(), w176_data.end(), std::ref(f32rng)); + std::generate(w177_data.begin(), w177_data.end(), std::ref(f32rng)); + std::generate(w178_data.begin(), w178_data.end(), std::ref(f32rng)); + std::generate(w179_data.begin(), w179_data.end(), std::ref(f32rng)); + std::generate(w180_data.begin(), w180_data.end(), std::ref(f32rng)); + std::generate(w181_data.begin(), w181_data.end(), std::ref(f32rng)); + std::generate(w182_data.begin(), w182_data.end(), std::ref(f32rng)); + std::generate(w183_data.begin(), w183_data.end(), std::ref(f32rng)); + std::generate(w184_data.begin(), w184_data.end(), std::ref(f32rng)); + std::generate(w185_data.begin(), w185_data.end(), std::ref(f32rng)); + std::generate(w186_data.begin(), w186_data.end(), std::ref(f32rng)); + std::generate(w187_data.begin(), w187_data.end(), std::ref(f32rng)); + std::generate(w188_data.begin(), w188_data.end(), std::ref(f32rng)); + std::generate(w189_data.begin(), w189_data.end(), std::ref(f32rng)); + std::generate(w190_data.begin(), 
w190_data.end(), std::ref(f32rng)); + std::generate(w191_data.begin(), w191_data.end(), std::ref(f32rng)); + std::generate(w192_data.begin(), w192_data.end(), std::ref(f32rng)); + std::generate(w193_data.begin(), w193_data.end(), std::ref(f32rng)); + std::generate(w194_data.begin(), w194_data.end(), std::ref(f32rng)); + std::generate(w195_data.begin(), w195_data.end(), std::ref(f32rng)); + std::generate(w196_data.begin(), w196_data.end(), std::ref(f32rng)); + std::generate(w197_data.begin(), w197_data.end(), std::ref(f32rng)); + std::generate(w198_data.begin(), w198_data.end(), std::ref(f32rng)); + std::generate(w199_data.begin(), w199_data.end(), std::ref(f32rng)); + std::generate(w200_data.begin(), w200_data.end(), std::ref(f32rng)); + std::generate(w201_data.begin(), w201_data.end(), std::ref(f32rng)); + std::generate(w202_data.begin(), w202_data.end(), std::ref(f32rng)); + std::generate(w203_data.begin(), w203_data.end(), std::ref(f32rng)); + std::generate(w204_data.begin(), w204_data.end(), std::ref(f32rng)); + std::generate(w205_data.begin(), w205_data.end(), std::ref(f32rng)); + std::generate(w206_data.begin(), w206_data.end(), std::ref(f32rng)); + std::generate(w207_data.begin(), w207_data.end(), std::ref(f32rng)); + std::generate(w208_data.begin(), w208_data.end(), std::ref(f32rng)); + std::generate(w209_data.begin(), w209_data.end(), std::ref(f32rng)); + std::generate(w210_data.begin(), w210_data.end(), std::ref(f32rng)); + std::generate(w211_data.begin(), w211_data.end(), std::ref(f32rng)); + std::generate(w212_data.begin(), w212_data.end(), std::ref(f32rng)); + std::generate(w213_data.begin(), w213_data.end(), std::ref(f32rng)); + std::generate(w214_data.begin(), w214_data.end(), std::ref(f32rng)); + std::generate(w215_data.begin(), w215_data.end(), std::ref(f32rng)); + std::generate(w216_data.begin(), w216_data.end(), std::ref(f32rng)); + std::generate(w217_data.begin(), w217_data.end(), std::ref(f32rng)); + std::generate(w218_data.begin(), 
w218_data.end(), std::ref(f32rng)); + std::generate(w219_data.begin(), w219_data.end(), std::ref(f32rng)); + std::generate(w220_data.begin(), w220_data.end(), std::ref(f32rng)); + std::generate(w221_data.begin(), w221_data.end(), std::ref(f32rng)); + std::generate(w222_data.begin(), w222_data.end(), std::ref(f32rng)); + std::generate(w223_data.begin(), w223_data.end(), std::ref(f32rng)); + std::generate(w224_data.begin(), w224_data.end(), std::ref(f32rng)); + std::generate(w225_data.begin(), w225_data.end(), std::ref(f32rng)); + std::generate(w226_data.begin(), w226_data.end(), std::ref(f32rng)); + std::generate(w227_data.begin(), w227_data.end(), std::ref(f32rng)); + std::generate(w228_data.begin(), w228_data.end(), std::ref(f32rng)); + std::generate(w229_data.begin(), w229_data.end(), std::ref(f32rng)); + std::generate(w230_data.begin(), w230_data.end(), std::ref(f32rng)); + std::generate(w231_data.begin(), w231_data.end(), std::ref(f32rng)); + std::generate(w232_data.begin(), w232_data.end(), std::ref(f32rng)); + std::generate(w233_data.begin(), w233_data.end(), std::ref(f32rng)); + std::generate(w234_data.begin(), w234_data.end(), std::ref(f32rng)); + std::generate(w235_data.begin(), w235_data.end(), std::ref(f32rng)); + std::generate(w236_data.begin(), w236_data.end(), std::ref(f32rng)); + std::generate(w237_data.begin(), w237_data.end(), std::ref(f32rng)); + std::generate(w238_data.begin(), w238_data.end(), std::ref(f32rng)); + std::generate(w239_data.begin(), w239_data.end(), std::ref(f32rng)); + std::generate(w240_data.begin(), w240_data.end(), std::ref(f32rng)); + std::generate(w241_data.begin(), w241_data.end(), std::ref(f32rng)); + std::generate(w242_data.begin(), w242_data.end(), std::ref(f32rng)); + std::generate(w243_data.begin(), w243_data.end(), std::ref(f32rng)); + std::generate(w244_data.begin(), w244_data.end(), std::ref(f32rng)); + std::generate(w245_data.begin(), w245_data.end(), std::ref(f32rng)); + std::generate(w246_data.begin(), 
w246_data.end(), std::ref(f32rng)); + std::generate(w247_data.begin(), w247_data.end(), std::ref(f32rng)); + std::generate(w248_data.begin(), w248_data.end(), std::ref(f32rng)); + std::generate(w249_data.begin(), w249_data.end(), std::ref(f32rng)); + std::generate(w250_data.begin(), w250_data.end(), std::ref(f32rng)); + std::generate(w251_data.begin(), w251_data.end(), std::ref(f32rng)); + std::generate(w252_data.begin(), w252_data.end(), std::ref(f32rng)); + std::generate(w253_data.begin(), w253_data.end(), std::ref(f32rng)); + std::generate(w254_data.begin(), w254_data.end(), std::ref(f32rng)); + std::generate(w255_data.begin(), w255_data.end(), std::ref(f32rng)); + std::generate(w256_data.begin(), w256_data.end(), std::ref(f32rng)); + std::generate(w257_data.begin(), w257_data.end(), std::ref(f32rng)); + std::generate(w258_data.begin(), w258_data.end(), std::ref(f32rng)); + std::generate(w259_data.begin(), w259_data.end(), std::ref(f32rng)); + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/3, + /*group_output_channels=*/16, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v0, + w124, + w125, + v1, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #0" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v1, + v2, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #1" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, 
/*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/16, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v2, + w126, + w127, + v3, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #2" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/16, + /*group_output_channels=*/16, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v3, + w128, + w129, + v4, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #3" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v4, + v2, + v5, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #4" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/16, + /*group_output_channels=*/64, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v5, + w130, + w131, + v6, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #5" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, 
+ /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/64, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v6, + w132, + w133, + v7, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #6" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/64, + /*group_output_channels=*/24, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v7, + w134, + w135, + v8, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #7" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/24, + /*group_output_channels=*/72, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v8, + w136, + w137, + v9, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #8" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/72, + /*output_min=*/0.0f, 
/*output_max=*/std::numeric_limits::infinity(), + v9, + w138, + w139, + v10, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #9" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/72, + /*group_output_channels=*/24, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v10, + w140, + w141, + v11, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #10" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v11, + v8, + v12, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #11" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/24, + /*group_output_channels=*/72, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v12, + w142, + w143, + v13, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #12" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/2, /*padding_bottom=*/2, /*padding_left=*/1, + /*kernel_height=*/5, /*kernel_width=*/5, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, 
/*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/72, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v13, + w144, + w145, + v14, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #13" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/28, /*pooling_width=*/28, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v14, + v15, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #14" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/72, + /*group_output_channels=*/24, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v15, + w146, + w147, + v16, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #15" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/24, + /*group_output_channels=*/72, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v16, + w148, + w149, + v17, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #16" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + 
/*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v17, + w150, + v18, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #17" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v14, + v18, + v19, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #18" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/72, + /*group_output_channels=*/40, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v19, + w151, + w152, + v20, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #19" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/40, + /*group_output_channels=*/120, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v20, + w153, + w154, + v21, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #20" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/2, /*padding_right=*/2, /*padding_bottom=*/2, /*padding_left=*/2, + /*kernel_height=*/5, /*kernel_width=*/5, + /*subsampling_height=*/1, 
/*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/120, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v21, + w155, + w156, + v22, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #21" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/28, /*pooling_width=*/28, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v22, + v23, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #22" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/120, + /*group_output_channels=*/32, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v23, + w157, + w158, + v24, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #23" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/32, + /*group_output_channels=*/120, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v24, + w159, + w160, + v25, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #24" << std::endl; + return nullptr; + } + + 
status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v25, + w161, + v26, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #25" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v22, + v26, + v27, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #26" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/120, + /*group_output_channels=*/40, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v27, + w162, + w163, + v28, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #27" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v28, + v20, + v29, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #28" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/40, + /*group_output_channels=*/120, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v29, + w164, + w165, + v30, + /*flags=*/0); + if 
(status != xnn_status_success) { + std::cerr << "failed to create node #29" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/2, /*padding_right=*/2, /*padding_bottom=*/2, /*padding_left=*/2, + /*kernel_height=*/5, /*kernel_width=*/5, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/120, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v30, + w166, + w167, + v31, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #30" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/28, /*pooling_width=*/28, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v31, + v32, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #31" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/120, + /*group_output_channels=*/32, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v32, + w168, + w169, + v33, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #32" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + 
/*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/32, + /*group_output_channels=*/120, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v33, + w170, + w171, + v34, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #33" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v34, + w172, + v35, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #34" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v31, + v35, + v36, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #35" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/120, + /*group_output_channels=*/40, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v36, + w173, + w174, + v37, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #36" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v37, + v29, + v38, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #37" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + 
/*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/40, + /*group_output_channels=*/240, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v38, + w175, + w176, + v39, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #38" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v39, + v40, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #39" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/240, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v40, + w177, + w178, + v41, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #40" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v41, + v42, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #41" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/240, + /*group_output_channels=*/80, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v42, + w179, + w180, + v43, + /*flags=*/0); + if (status != 
xnn_status_success) { + std::cerr << "failed to create node #42" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/80, + /*group_output_channels=*/200, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v43, + w181, + w182, + v44, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #43" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v44, + v45, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #44" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/200, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v45, + w183, + w184, + v46, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #45" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v46, + v47, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #46" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + 
/*groups=*/1, + /*group_input_channels=*/200, + /*group_output_channels=*/80, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v47, + w185, + w186, + v48, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #47" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v48, + v43, + v49, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #48" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/80, + /*group_output_channels=*/184, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v49, + w187, + w188, + v50, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #49" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v50, + v51, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #50" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/184, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v51, + w189, + w190, + v52, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed 
to create node #51" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v52, + v53, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #52" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/184, + /*group_output_channels=*/80, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v53, + w191, + w192, + v54, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #53" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v54, + v49, + v55, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #54" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/80, + /*group_output_channels=*/184, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v55, + w193, + w194, + v56, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #55" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v56, + v57, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #56" << std::endl; + return nullptr; + } + + status = 
xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/184, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v57, + w195, + w196, + v58, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #57" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v58, + v59, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #58" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/184, + /*group_output_channels=*/80, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v59, + w197, + w198, + v60, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #59" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v60, + v55, + v61, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #60" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + 
/*group_input_channels=*/80, + /*group_output_channels=*/480, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v61, + w199, + w200, + v62, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #61" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v62, + v63, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #62" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/480, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v63, + w201, + w202, + v64, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #63" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v64, + v65, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #64" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/14, /*pooling_width=*/14, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v65, + v66, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #65" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, 
/*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/480, + /*group_output_channels=*/120, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v66, + w203, + w204, + v67, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #66" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/120, + /*group_output_channels=*/480, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v67, + w205, + w206, + v68, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #67" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v68, + w207, + v69, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #68" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v65, + v69, + v70, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #69" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/480, + /*group_output_channels=*/112, + /*output_min=*/-std::numeric_limits::infinity(), 
/*output_max=*/std::numeric_limits::infinity(), + v70, + w208, + w209, + v71, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #70" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/112, + /*group_output_channels=*/672, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v71, + w210, + w211, + v72, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #71" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v72, + v73, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #72" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/672, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v73, + w212, + w213, + v74, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #73" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v74, + v75, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #74" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/14, 
/*pooling_width=*/14, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v75, + v76, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #75" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/672, + /*group_output_channels=*/168, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v76, + w214, + w215, + v77, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #76" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/168, + /*group_output_channels=*/672, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v77, + w216, + w217, + v78, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #77" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v78, + w218, + v79, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #78" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v75, + v79, + v80, + /*flags=*/0); + if (status != xnn_status_success) { + 
std::cerr << "failed to create node #79" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/672, + /*group_output_channels=*/112, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v80, + w219, + w220, + v81, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #80" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v81, + v71, + v82, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #81" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/112, + /*group_output_channels=*/672, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v82, + w221, + w222, + v83, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #82" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v83, + v84, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #83" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/2, /*padding_bottom=*/2, /*padding_left=*/1, + /*kernel_height=*/5, 
/*kernel_width=*/5, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/672, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v84, + w223, + w224, + v85, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #84" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v85, + v86, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #85" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/7, /*pooling_width=*/7, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v86, + v87, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #86" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/672, + /*group_output_channels=*/168, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v87, + w225, + w226, + v88, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #87" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + 
/*group_input_channels=*/168, + /*group_output_channels=*/672, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v88, + w227, + w228, + v89, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #88" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v89, + w229, + v90, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #89" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v86, + v90, + v91, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #90" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/672, + /*group_output_channels=*/160, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v91, + w230, + w231, + v92, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #91" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/160, + /*group_output_channels=*/960, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v92, + w232, + w233, + v93, + 
/*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #92" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v93, + v94, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #93" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/2, /*padding_right=*/2, /*padding_bottom=*/2, /*padding_left=*/2, + /*kernel_height=*/5, /*kernel_width=*/5, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/960, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v94, + w234, + w235, + v95, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #94" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v95, + v96, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #95" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/7, /*pooling_width=*/7, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v96, + v97, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #96" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/960, + /*group_output_channels=*/240, + /*output_min=*/0.0f, 
/*output_max=*/std::numeric_limits::infinity(), + v97, + w236, + w237, + v98, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #97" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/240, + /*group_output_channels=*/960, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v98, + w238, + w239, + v99, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #98" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v99, + w240, + v100, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #99" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v96, + v100, + v101, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #100" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/960, + /*group_output_channels=*/160, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v101, + w241, + w242, + v102, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #101" << std::endl; + return 
nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v102, + v92, + v103, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #102" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/160, + /*group_output_channels=*/960, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v103, + w243, + w244, + v104, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #103" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v104, + v105, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #104" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/2, /*padding_right=*/2, /*padding_bottom=*/2, /*padding_left=*/2, + /*kernel_height=*/5, /*kernel_width=*/5, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/960, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v105, + w245, + w246, + v106, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #105" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v106, + v107, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #106" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + 
/*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/7, /*pooling_width=*/7, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v107, + v108, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #107" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/960, + /*group_output_channels=*/240, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v108, + w247, + w248, + v109, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #108" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/240, + /*group_output_channels=*/960, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v109, + w249, + w250, + v110, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #109" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v110, + w251, + v111, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #110" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + 
/*output_max=*/std::numeric_limits::infinity(), + v107, + v111, + v112, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #111" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/960, + /*group_output_channels=*/160, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v112, + w252, + w253, + v113, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #112" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v113, + v103, + v114, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #113" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/160, + /*group_output_channels=*/960, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v114, + w254, + w255, + v115, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #114" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v115, + v116, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #115" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + 
subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/7, /*pooling_width=*/7, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v116, + v117, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #116" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/960, + /*group_output_channels=*/1280, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v117, + w256, + w257, + v118, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #117" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v118, + v119, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #118" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/1, /*pooling_width=*/1, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v119, + v120, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #119" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, 
/*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/1280, + /*group_output_channels=*/1001, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v120, + w258, + w259, + v121, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #120" << std::endl; + return nullptr; + } + + status = xnn_define_copy( + subgraph, + v121, + v122, + 0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #121" << std::endl; + return nullptr; + } + + status = xnn_define_softmax( + subgraph, + v122, + v123, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #122" << std::endl; + return nullptr; + } + + return subgraph; +} + +} // namespace models diff --git a/bench/models/fp32-mobilenet-v3-small.cc b/bench/models/fp32-mobilenet-v3-small.cc new file mode 100644 index 00000000000..a22280c3053 --- /dev/null +++ b/bench/models/fp32-mobilenet-v3-small.cc @@ -0,0 +1,4757 @@ +// Copyright 2020 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +// +// Auto-generated file. Do not edit! 
+ +#include "xnnpack.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xnnpack.h" + +// align a size up to XNN_EXTRA_BYTES +#define XNN_PAD_EXTRA_BYTES(s, t) (((s) + XNN_EXTRA_BYTES / sizeof(t) - 1) & ~(XNN_EXTRA_BYTES / sizeof(t) - 1)) + +namespace models { + +xnn_subgraph_t FP32MobileNetV3Small() { + xnn_status status; + xnn_subgraph_t subgraph = nullptr; + status = xnn_create_subgraph(/*num_external_values=*/2, 0, &subgraph); + if (status != xnn_status_success) { + std::cerr << "failed to create subgrpah" << std::endl; + return nullptr; + } + + uint32_t v0 = XNN_INVALID_VALUE_ID; + std::array v0_dims = {{1, 224, 224, 3}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v0_dims.size(), v0_dims.data(), + /*data=*/nullptr, + 1, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v0" << std::endl; + return nullptr; + } + + uint32_t v1 = XNN_INVALID_VALUE_ID; + std::array v1_dims = {{1, 112, 112, 16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v1_dims.size(), v1_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v1); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v1" << std::endl; + return nullptr; + } + + uint32_t v2 = XNN_INVALID_VALUE_ID; + std::array v2_dims = {{1, 112, 112, 16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v2_dims.size(), v2_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v2); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v2" << std::endl; + return nullptr; + } + + uint32_t v3 = XNN_INVALID_VALUE_ID; + std::array v3_dims = {{1, 56, 56, 16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v3_dims.size(), v3_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v3); + if (status != xnn_status_success) { + std::cerr << 
"failed to create tensor v3" << std::endl; + return nullptr; + } + + uint32_t v4 = XNN_INVALID_VALUE_ID; + std::array v4_dims = {{1, 1, 1, 16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v4_dims.size(), v4_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v4); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v4" << std::endl; + return nullptr; + } + + uint32_t v5 = XNN_INVALID_VALUE_ID; + std::array v5_dims = {{1, 1, 1, 8}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v5_dims.size(), v5_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v5); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v5" << std::endl; + return nullptr; + } + + uint32_t v6 = XNN_INVALID_VALUE_ID; + std::array v6_dims = {{1, 1, 1, 16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v6_dims.size(), v6_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v6); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v6" << std::endl; + return nullptr; + } + + uint32_t v7 = XNN_INVALID_VALUE_ID; + std::array v7_dims = {{1, 1, 1, 16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v7_dims.size(), v7_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v7); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v7" << std::endl; + return nullptr; + } + + uint32_t v8 = XNN_INVALID_VALUE_ID; + std::array v8_dims = {{1, 56, 56, 16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v8_dims.size(), v8_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v8); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v8" << std::endl; + return nullptr; + } + + uint32_t v9 = XNN_INVALID_VALUE_ID; + std::array v9_dims = {{1, 56, 56, 16}}; + status = xnn_define_tensor_value( + 
subgraph, xnn_datatype_fp32, + v9_dims.size(), v9_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v9); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v9" << std::endl; + return nullptr; + } + + uint32_t v10 = XNN_INVALID_VALUE_ID; + std::array v10_dims = {{1, 56, 56, 72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v10_dims.size(), v10_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v10); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v10" << std::endl; + return nullptr; + } + + uint32_t v11 = XNN_INVALID_VALUE_ID; + std::array v11_dims = {{1, 28, 28, 72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v11_dims.size(), v11_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v11); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v11" << std::endl; + return nullptr; + } + + uint32_t v12 = XNN_INVALID_VALUE_ID; + std::array v12_dims = {{1, 28, 28, 24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v12_dims.size(), v12_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v12); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v12" << std::endl; + return nullptr; + } + + uint32_t v13 = XNN_INVALID_VALUE_ID; + std::array v13_dims = {{1, 28, 28, 88}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v13_dims.size(), v13_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v13); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v13" << std::endl; + return nullptr; + } + + uint32_t v14 = XNN_INVALID_VALUE_ID; + std::array v14_dims = {{1, 28, 28, 88}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v14_dims.size(), v14_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v14); + if (status != 
xnn_status_success) { + std::cerr << "failed to create tensor v14" << std::endl; + return nullptr; + } + + uint32_t v15 = XNN_INVALID_VALUE_ID; + std::array v15_dims = {{1, 28, 28, 24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v15_dims.size(), v15_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v15); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v15" << std::endl; + return nullptr; + } + + uint32_t v16 = XNN_INVALID_VALUE_ID; + std::array v16_dims = {{1, 28, 28, 24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v16_dims.size(), v16_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v16); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v16" << std::endl; + return nullptr; + } + + uint32_t v17 = XNN_INVALID_VALUE_ID; + std::array v17_dims = {{1, 28, 28, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v17_dims.size(), v17_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v17); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v17" << std::endl; + return nullptr; + } + + uint32_t v18 = XNN_INVALID_VALUE_ID; + std::array v18_dims = {{1, 28, 28, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v18_dims.size(), v18_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v18); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v18" << std::endl; + return nullptr; + } + + uint32_t v19 = XNN_INVALID_VALUE_ID; + std::array v19_dims = {{1, 14, 14, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v19_dims.size(), v19_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v19); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v19" << std::endl; + return nullptr; + } + + uint32_t v20 = XNN_INVALID_VALUE_ID; + 
std::array v20_dims = {{1, 14, 14, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v20_dims.size(), v20_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v20); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v20" << std::endl; + return nullptr; + } + + uint32_t v21 = XNN_INVALID_VALUE_ID; + std::array v21_dims = {{1, 1, 1, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v21_dims.size(), v21_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v21); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v21" << std::endl; + return nullptr; + } + + uint32_t v22 = XNN_INVALID_VALUE_ID; + std::array v22_dims = {{1, 1, 1, 24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v22_dims.size(), v22_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v22); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v22" << std::endl; + return nullptr; + } + + uint32_t v23 = XNN_INVALID_VALUE_ID; + std::array v23_dims = {{1, 1, 1, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v23_dims.size(), v23_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v23); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v23" << std::endl; + return nullptr; + } + + uint32_t v24 = XNN_INVALID_VALUE_ID; + std::array v24_dims = {{1, 1, 1, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v24_dims.size(), v24_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v24); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v24" << std::endl; + return nullptr; + } + + uint32_t v25 = XNN_INVALID_VALUE_ID; + std::array v25_dims = {{1, 14, 14, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v25_dims.size(), v25_dims.data(), + 
/*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v25); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v25" << std::endl; + return nullptr; + } + + uint32_t v26 = XNN_INVALID_VALUE_ID; + std::array v26_dims = {{1, 14, 14, 40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v26_dims.size(), v26_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v26); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v26" << std::endl; + return nullptr; + } + + uint32_t v27 = XNN_INVALID_VALUE_ID; + std::array v27_dims = {{1, 14, 14, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v27_dims.size(), v27_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v27); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v27" << std::endl; + return nullptr; + } + + uint32_t v28 = XNN_INVALID_VALUE_ID; + std::array v28_dims = {{1, 14, 14, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v28_dims.size(), v28_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v28); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v28" << std::endl; + return nullptr; + } + + uint32_t v29 = XNN_INVALID_VALUE_ID; + std::array v29_dims = {{1, 14, 14, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v29_dims.size(), v29_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v29); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v29" << std::endl; + return nullptr; + } + + uint32_t v30 = XNN_INVALID_VALUE_ID; + std::array v30_dims = {{1, 14, 14, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v30_dims.size(), v30_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v30); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v30" 
<< std::endl; + return nullptr; + } + + uint32_t v31 = XNN_INVALID_VALUE_ID; + std::array v31_dims = {{1, 1, 1, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v31_dims.size(), v31_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v31); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v31" << std::endl; + return nullptr; + } + + uint32_t v32 = XNN_INVALID_VALUE_ID; + std::array v32_dims = {{1, 1, 1, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v32_dims.size(), v32_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v32); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v32" << std::endl; + return nullptr; + } + + uint32_t v33 = XNN_INVALID_VALUE_ID; + std::array v33_dims = {{1, 1, 1, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v33_dims.size(), v33_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v33); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v33" << std::endl; + return nullptr; + } + + uint32_t v34 = XNN_INVALID_VALUE_ID; + std::array v34_dims = {{1, 1, 1, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v34_dims.size(), v34_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v34); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v34" << std::endl; + return nullptr; + } + + uint32_t v35 = XNN_INVALID_VALUE_ID; + std::array v35_dims = {{1, 14, 14, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v35_dims.size(), v35_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v35); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v35" << std::endl; + return nullptr; + } + + uint32_t v36 = XNN_INVALID_VALUE_ID; + std::array v36_dims = {{1, 14, 14, 40}}; + status = 
xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v36_dims.size(), v36_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v36); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v36" << std::endl; + return nullptr; + } + + uint32_t v37 = XNN_INVALID_VALUE_ID; + std::array v37_dims = {{1, 14, 14, 40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v37_dims.size(), v37_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v37); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v37" << std::endl; + return nullptr; + } + + uint32_t v38 = XNN_INVALID_VALUE_ID; + std::array v38_dims = {{1, 14, 14, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v38_dims.size(), v38_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v38); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v38" << std::endl; + return nullptr; + } + + uint32_t v39 = XNN_INVALID_VALUE_ID; + std::array v39_dims = {{1, 14, 14, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v39_dims.size(), v39_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v39); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v39" << std::endl; + return nullptr; + } + + uint32_t v40 = XNN_INVALID_VALUE_ID; + std::array v40_dims = {{1, 14, 14, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v40_dims.size(), v40_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v40); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v40" << std::endl; + return nullptr; + } + + uint32_t v41 = XNN_INVALID_VALUE_ID; + std::array v41_dims = {{1, 14, 14, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v41_dims.size(), v41_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, 
/*flags=*/0, &v41); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v41" << std::endl; + return nullptr; + } + + uint32_t v42 = XNN_INVALID_VALUE_ID; + std::array v42_dims = {{1, 1, 1, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v42_dims.size(), v42_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v42); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v42" << std::endl; + return nullptr; + } + + uint32_t v43 = XNN_INVALID_VALUE_ID; + std::array v43_dims = {{1, 1, 1, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v43_dims.size(), v43_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v43); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v43" << std::endl; + return nullptr; + } + + uint32_t v44 = XNN_INVALID_VALUE_ID; + std::array v44_dims = {{1, 1, 1, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v44_dims.size(), v44_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v44); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v44" << std::endl; + return nullptr; + } + + uint32_t v45 = XNN_INVALID_VALUE_ID; + std::array v45_dims = {{1, 1, 1, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v45_dims.size(), v45_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v45); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v45" << std::endl; + return nullptr; + } + + uint32_t v46 = XNN_INVALID_VALUE_ID; + std::array v46_dims = {{1, 14, 14, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v46_dims.size(), v46_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v46); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v46" << std::endl; + return nullptr; + } + + uint32_t 
v47 = XNN_INVALID_VALUE_ID; + std::array v47_dims = {{1, 14, 14, 40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v47_dims.size(), v47_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v47); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v47" << std::endl; + return nullptr; + } + + uint32_t v48 = XNN_INVALID_VALUE_ID; + std::array v48_dims = {{1, 14, 14, 40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v48_dims.size(), v48_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v48); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v48" << std::endl; + return nullptr; + } + + uint32_t v49 = XNN_INVALID_VALUE_ID; + std::array v49_dims = {{1, 14, 14, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v49_dims.size(), v49_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v49); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v49" << std::endl; + return nullptr; + } + + uint32_t v50 = XNN_INVALID_VALUE_ID; + std::array v50_dims = {{1, 14, 14, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v50_dims.size(), v50_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v50); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v50" << std::endl; + return nullptr; + } + + uint32_t v51 = XNN_INVALID_VALUE_ID; + std::array v51_dims = {{1, 14, 14, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v51_dims.size(), v51_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v51); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v51" << std::endl; + return nullptr; + } + + uint32_t v52 = XNN_INVALID_VALUE_ID; + std::array v52_dims = {{1, 14, 14, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + 
v52_dims.size(), v52_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v52); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v52" << std::endl; + return nullptr; + } + + uint32_t v53 = XNN_INVALID_VALUE_ID; + std::array v53_dims = {{1, 1, 1, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v53_dims.size(), v53_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v53); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v53" << std::endl; + return nullptr; + } + + uint32_t v54 = XNN_INVALID_VALUE_ID; + std::array v54_dims = {{1, 1, 1, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v54_dims.size(), v54_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v54); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v54" << std::endl; + return nullptr; + } + + uint32_t v55 = XNN_INVALID_VALUE_ID; + std::array v55_dims = {{1, 1, 1, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v55_dims.size(), v55_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v55); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v55" << std::endl; + return nullptr; + } + + uint32_t v56 = XNN_INVALID_VALUE_ID; + std::array v56_dims = {{1, 1, 1, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v56_dims.size(), v56_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v56); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v56" << std::endl; + return nullptr; + } + + uint32_t v57 = XNN_INVALID_VALUE_ID; + std::array v57_dims = {{1, 14, 14, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v57_dims.size(), v57_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v57); + if (status != xnn_status_success) { + std::cerr << 
"failed to create tensor v57" << std::endl; + return nullptr; + } + + uint32_t v58 = XNN_INVALID_VALUE_ID; + std::array v58_dims = {{1, 14, 14, 48}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v58_dims.size(), v58_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v58); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v58" << std::endl; + return nullptr; + } + + uint32_t v59 = XNN_INVALID_VALUE_ID; + std::array v59_dims = {{1, 14, 14, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v59_dims.size(), v59_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v59); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v59" << std::endl; + return nullptr; + } + + uint32_t v60 = XNN_INVALID_VALUE_ID; + std::array v60_dims = {{1, 14, 14, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v60_dims.size(), v60_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v60); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v60" << std::endl; + return nullptr; + } + + uint32_t v61 = XNN_INVALID_VALUE_ID; + std::array v61_dims = {{1, 14, 14, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v61_dims.size(), v61_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v61); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v61" << std::endl; + return nullptr; + } + + uint32_t v62 = XNN_INVALID_VALUE_ID; + std::array v62_dims = {{1, 14, 14, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v62_dims.size(), v62_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v62); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v62" << std::endl; + return nullptr; + } + + uint32_t v63 = XNN_INVALID_VALUE_ID; + std::array v63_dims = {{1, 1, 1, 
144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v63_dims.size(), v63_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v63); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v63" << std::endl; + return nullptr; + } + + uint32_t v64 = XNN_INVALID_VALUE_ID; + std::array v64_dims = {{1, 1, 1, 40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v64_dims.size(), v64_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v64); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v64" << std::endl; + return nullptr; + } + + uint32_t v65 = XNN_INVALID_VALUE_ID; + std::array v65_dims = {{1, 1, 1, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v65_dims.size(), v65_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v65); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v65" << std::endl; + return nullptr; + } + + uint32_t v66 = XNN_INVALID_VALUE_ID; + std::array v66_dims = {{1, 1, 1, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v66_dims.size(), v66_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v66); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v66" << std::endl; + return nullptr; + } + + uint32_t v67 = XNN_INVALID_VALUE_ID; + std::array v67_dims = {{1, 14, 14, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v67_dims.size(), v67_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v67); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v67" << std::endl; + return nullptr; + } + + uint32_t v68 = XNN_INVALID_VALUE_ID; + std::array v68_dims = {{1, 14, 14, 48}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v68_dims.size(), v68_dims.data(), + /*data=*/nullptr, + 
XNN_INVALID_VALUE_ID, /*flags=*/0, &v68); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v68" << std::endl; + return nullptr; + } + + uint32_t v69 = XNN_INVALID_VALUE_ID; + std::array v69_dims = {{1, 14, 14, 48}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v69_dims.size(), v69_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v69); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v69" << std::endl; + return nullptr; + } + + uint32_t v70 = XNN_INVALID_VALUE_ID; + std::array v70_dims = {{1, 14, 14, 288}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v70_dims.size(), v70_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v70); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v70" << std::endl; + return nullptr; + } + + uint32_t v71 = XNN_INVALID_VALUE_ID; + std::array v71_dims = {{1, 14, 14, 288}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v71_dims.size(), v71_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v71); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v71" << std::endl; + return nullptr; + } + + uint32_t v72 = XNN_INVALID_VALUE_ID; + std::array v72_dims = {{1, 7, 7, 288}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v72_dims.size(), v72_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v72); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v72" << std::endl; + return nullptr; + } + + uint32_t v73 = XNN_INVALID_VALUE_ID; + std::array v73_dims = {{1, 7, 7, 288}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v73_dims.size(), v73_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v73); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v73" << std::endl; + return 
nullptr; + } + + uint32_t v74 = XNN_INVALID_VALUE_ID; + std::array v74_dims = {{1, 1, 1, 288}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v74_dims.size(), v74_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v74); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v74" << std::endl; + return nullptr; + } + + uint32_t v75 = XNN_INVALID_VALUE_ID; + std::array v75_dims = {{1, 1, 1, 72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v75_dims.size(), v75_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v75); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v75" << std::endl; + return nullptr; + } + + uint32_t v76 = XNN_INVALID_VALUE_ID; + std::array v76_dims = {{1, 1, 1, 288}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v76_dims.size(), v76_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v76); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v76" << std::endl; + return nullptr; + } + + uint32_t v77 = XNN_INVALID_VALUE_ID; + std::array v77_dims = {{1, 1, 1, 288}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v77_dims.size(), v77_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v77); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v77" << std::endl; + return nullptr; + } + + uint32_t v78 = XNN_INVALID_VALUE_ID; + std::array v78_dims = {{1, 7, 7, 288}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v78_dims.size(), v78_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v78); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v78" << std::endl; + return nullptr; + } + + uint32_t v79 = XNN_INVALID_VALUE_ID; + std::array v79_dims = {{1, 7, 7, 96}}; + status = xnn_define_tensor_value( + subgraph, 
xnn_datatype_fp32, + v79_dims.size(), v79_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v79); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v79" << std::endl; + return nullptr; + } + + uint32_t v80 = XNN_INVALID_VALUE_ID; + std::array v80_dims = {{1, 7, 7, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v80_dims.size(), v80_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v80); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v80" << std::endl; + return nullptr; + } + + uint32_t v81 = XNN_INVALID_VALUE_ID; + std::array v81_dims = {{1, 7, 7, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v81_dims.size(), v81_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v81); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v81" << std::endl; + return nullptr; + } + + uint32_t v82 = XNN_INVALID_VALUE_ID; + std::array v82_dims = {{1, 7, 7, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v82_dims.size(), v82_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v82); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v82" << std::endl; + return nullptr; + } + + uint32_t v83 = XNN_INVALID_VALUE_ID; + std::array v83_dims = {{1, 7, 7, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v83_dims.size(), v83_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v83); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v83" << std::endl; + return nullptr; + } + + uint32_t v84 = XNN_INVALID_VALUE_ID; + std::array v84_dims = {{1, 1, 1, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v84_dims.size(), v84_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v84); + if (status != xnn_status_success) 
{ + std::cerr << "failed to create tensor v84" << std::endl; + return nullptr; + } + + uint32_t v85 = XNN_INVALID_VALUE_ID; + std::array v85_dims = {{1, 1, 1, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v85_dims.size(), v85_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v85); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v85" << std::endl; + return nullptr; + } + + uint32_t v86 = XNN_INVALID_VALUE_ID; + std::array v86_dims = {{1, 1, 1, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v86_dims.size(), v86_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v86); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v86" << std::endl; + return nullptr; + } + + uint32_t v87 = XNN_INVALID_VALUE_ID; + std::array v87_dims = {{1, 1, 1, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v87_dims.size(), v87_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v87); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v87" << std::endl; + return nullptr; + } + + uint32_t v88 = XNN_INVALID_VALUE_ID; + std::array v88_dims = {{1, 7, 7, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v88_dims.size(), v88_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v88); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v88" << std::endl; + return nullptr; + } + + uint32_t v89 = XNN_INVALID_VALUE_ID; + std::array v89_dims = {{1, 7, 7, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v89_dims.size(), v89_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v89); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v89" << std::endl; + return nullptr; + } + + uint32_t v90 = XNN_INVALID_VALUE_ID; + std::array v90_dims = {{1, 7, 
7, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v90_dims.size(), v90_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v90); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v90" << std::endl; + return nullptr; + } + + uint32_t v91 = XNN_INVALID_VALUE_ID; + std::array v91_dims = {{1, 7, 7, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v91_dims.size(), v91_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v91); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v91" << std::endl; + return nullptr; + } + + uint32_t v92 = XNN_INVALID_VALUE_ID; + std::array v92_dims = {{1, 7, 7, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v92_dims.size(), v92_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v92); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v92" << std::endl; + return nullptr; + } + + uint32_t v93 = XNN_INVALID_VALUE_ID; + std::array v93_dims = {{1, 7, 7, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v93_dims.size(), v93_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v93); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v93" << std::endl; + return nullptr; + } + + uint32_t v94 = XNN_INVALID_VALUE_ID; + std::array v94_dims = {{1, 7, 7, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v94_dims.size(), v94_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v94); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v94" << std::endl; + return nullptr; + } + + uint32_t v95 = XNN_INVALID_VALUE_ID; + std::array v95_dims = {{1, 1, 1, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v95_dims.size(), v95_dims.data(), + /*data=*/nullptr, + 
XNN_INVALID_VALUE_ID, /*flags=*/0, &v95); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v95" << std::endl; + return nullptr; + } + + uint32_t v96 = XNN_INVALID_VALUE_ID; + std::array v96_dims = {{1, 1, 1, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v96_dims.size(), v96_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v96); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v96" << std::endl; + return nullptr; + } + + uint32_t v97 = XNN_INVALID_VALUE_ID; + std::array v97_dims = {{1, 1, 1, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v97_dims.size(), v97_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v97); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v97" << std::endl; + return nullptr; + } + + uint32_t v98 = XNN_INVALID_VALUE_ID; + std::array v98_dims = {{1, 1, 1, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v98_dims.size(), v98_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v98); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v98" << std::endl; + return nullptr; + } + + uint32_t v99 = XNN_INVALID_VALUE_ID; + std::array v99_dims = {{1, 7, 7, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v99_dims.size(), v99_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v99); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v99" << std::endl; + return nullptr; + } + + uint32_t v100 = XNN_INVALID_VALUE_ID; + std::array v100_dims = {{1, 7, 7, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v100_dims.size(), v100_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v100); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v100" << std::endl; + return 
nullptr; + } + + uint32_t v101 = XNN_INVALID_VALUE_ID; + std::array v101_dims = {{1, 7, 7, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v101_dims.size(), v101_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v101); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v101" << std::endl; + return nullptr; + } + + uint32_t v102 = XNN_INVALID_VALUE_ID; + std::array v102_dims = {{1, 7, 7, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v102_dims.size(), v102_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v102); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v102" << std::endl; + return nullptr; + } + + uint32_t v103 = XNN_INVALID_VALUE_ID; + std::array v103_dims = {{1, 7, 7, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v103_dims.size(), v103_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v103); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v103" << std::endl; + return nullptr; + } + + uint32_t v104 = XNN_INVALID_VALUE_ID; + std::array v104_dims = {{1, 1, 1, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v104_dims.size(), v104_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v104); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v104" << std::endl; + return nullptr; + } + + uint32_t v105 = XNN_INVALID_VALUE_ID; + std::array v105_dims = {{1, 1, 1, 1024}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v105_dims.size(), v105_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v105); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v105" << std::endl; + return nullptr; + } + + uint32_t v106 = XNN_INVALID_VALUE_ID; + std::array v106_dims = {{1, 1, 1, 1024}}; + status = 
xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v106_dims.size(), v106_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v106); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v106" << std::endl; + return nullptr; + } + + uint32_t v107 = XNN_INVALID_VALUE_ID; + std::array v107_dims = {{1, 1, 1, 1024}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v107_dims.size(), v107_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v107); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v107" << std::endl; + return nullptr; + } + + uint32_t v108 = XNN_INVALID_VALUE_ID; + std::array v108_dims = {{1, 1, 1, 1001}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v108_dims.size(), v108_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v108); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v108" << std::endl; + return nullptr; + } + + uint32_t v109 = XNN_INVALID_VALUE_ID; + std::array v109_dims = {{1, 1001}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v109_dims.size(), v109_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v109); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v109" << std::endl; + return nullptr; + } + + uint32_t v110 = XNN_INVALID_VALUE_ID; + std::array v110_dims = {{1, 1001}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + v110_dims.size(), v110_dims.data(), + /*data=*/nullptr, + 0, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v110); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v110" << std::endl; + return nullptr; + } + + alignas(16) static std::array w111_data; + uint32_t w111 = XNN_INVALID_VALUE_ID; + std::array w111_dims = {{16, 3, 3, 3}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w111_dims.size(), 
w111_dims.data(), + /*data=*/w111_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w111); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w111" << std::endl; + return nullptr; + } + + alignas(16) static std::array w112_data; + uint32_t w112 = XNN_INVALID_VALUE_ID; + std::array w112_dims = {{16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w112_dims.size(), w112_dims.data(), + /*data=*/w112_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w112); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w112" << std::endl; + return nullptr; + } + + alignas(16) static std::array w113_data; + uint32_t w113 = XNN_INVALID_VALUE_ID; + std::array w113_dims = {{1, 3, 3, 16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w113_dims.size(), w113_dims.data(), + /*data=*/w113_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w113); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w113" << std::endl; + return nullptr; + } + + alignas(16) static std::array w114_data; + uint32_t w114 = XNN_INVALID_VALUE_ID; + std::array w114_dims = {{16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w114_dims.size(), w114_dims.data(), + /*data=*/w114_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w114); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w114" << std::endl; + return nullptr; + } + + alignas(16) static std::array w115_data; + uint32_t w115 = XNN_INVALID_VALUE_ID; + std::array w115_dims = {{8, 1, 1, 16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w115_dims.size(), w115_dims.data(), + /*data=*/w115_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w115); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w115" << std::endl; + return nullptr; + } + + alignas(16) static std::array w116_data; + uint32_t w116 = XNN_INVALID_VALUE_ID; + std::array 
w116_dims = {{8}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w116_dims.size(), w116_dims.data(), + /*data=*/w116_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w116); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w116" << std::endl; + return nullptr; + } + + alignas(16) static std::array w117_data; + uint32_t w117 = XNN_INVALID_VALUE_ID; + std::array w117_dims = {{16, 1, 1, 8}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w117_dims.size(), w117_dims.data(), + /*data=*/w117_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w117); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w117" << std::endl; + return nullptr; + } + + alignas(16) static std::array w118_data; + uint32_t w118 = XNN_INVALID_VALUE_ID; + std::array w118_dims = {{16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w118_dims.size(), w118_dims.data(), + /*data=*/w118_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w118); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w118" << std::endl; + return nullptr; + } + + alignas(16) static std::array w119_data; + uint32_t w119 = XNN_INVALID_VALUE_ID; + std::array w119_dims = {{1}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w119_dims.size(), w119_dims.data(), + /*data=*/w119_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w119); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w119" << std::endl; + return nullptr; + } + + alignas(16) static std::array w120_data; + uint32_t w120 = XNN_INVALID_VALUE_ID; + std::array w120_dims = {{16, 1, 1, 16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w120_dims.size(), w120_dims.data(), + /*data=*/w120_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w120); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w120" << std::endl; + return nullptr; 
+ } + + alignas(16) static std::array w121_data; + uint32_t w121 = XNN_INVALID_VALUE_ID; + std::array w121_dims = {{16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w121_dims.size(), w121_dims.data(), + /*data=*/w121_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w121); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w121" << std::endl; + return nullptr; + } + + alignas(16) static std::array w122_data; + uint32_t w122 = XNN_INVALID_VALUE_ID; + std::array w122_dims = {{72, 1, 1, 16}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w122_dims.size(), w122_dims.data(), + /*data=*/w122_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w122); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w122" << std::endl; + return nullptr; + } + + alignas(16) static std::array w123_data; + uint32_t w123 = XNN_INVALID_VALUE_ID; + std::array w123_dims = {{72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w123_dims.size(), w123_dims.data(), + /*data=*/w123_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w123); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w123" << std::endl; + return nullptr; + } + + alignas(16) static std::array w124_data; + uint32_t w124 = XNN_INVALID_VALUE_ID; + std::array w124_dims = {{1, 3, 3, 72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w124_dims.size(), w124_dims.data(), + /*data=*/w124_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w124); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w124" << std::endl; + return nullptr; + } + + alignas(16) static std::array w125_data; + uint32_t w125 = XNN_INVALID_VALUE_ID; + std::array w125_dims = {{72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w125_dims.size(), w125_dims.data(), + /*data=*/w125_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w125); + if 
(status != xnn_status_success) { + std::cerr << "failed to create tensor w125" << std::endl; + return nullptr; + } + + alignas(16) static std::array w126_data; + uint32_t w126 = XNN_INVALID_VALUE_ID; + std::array w126_dims = {{24, 1, 1, 72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w126_dims.size(), w126_dims.data(), + /*data=*/w126_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w126); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w126" << std::endl; + return nullptr; + } + + alignas(16) static std::array w127_data; + uint32_t w127 = XNN_INVALID_VALUE_ID; + std::array w127_dims = {{24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w127_dims.size(), w127_dims.data(), + /*data=*/w127_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w127); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w127" << std::endl; + return nullptr; + } + + alignas(16) static std::array w128_data; + uint32_t w128 = XNN_INVALID_VALUE_ID; + std::array w128_dims = {{88, 1, 1, 24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w128_dims.size(), w128_dims.data(), + /*data=*/w128_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w128); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w128" << std::endl; + return nullptr; + } + + alignas(16) static std::array w129_data; + uint32_t w129 = XNN_INVALID_VALUE_ID; + std::array w129_dims = {{88}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w129_dims.size(), w129_dims.data(), + /*data=*/w129_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w129); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w129" << std::endl; + return nullptr; + } + + alignas(16) static std::array w130_data; + uint32_t w130 = XNN_INVALID_VALUE_ID; + std::array w130_dims = {{1, 3, 3, 88}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + 
w130_dims.size(), w130_dims.data(), + /*data=*/w130_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w130); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w130" << std::endl; + return nullptr; + } + + alignas(16) static std::array w131_data; + uint32_t w131 = XNN_INVALID_VALUE_ID; + std::array w131_dims = {{88}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w131_dims.size(), w131_dims.data(), + /*data=*/w131_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w131); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w131" << std::endl; + return nullptr; + } + + alignas(16) static std::array w132_data; + uint32_t w132 = XNN_INVALID_VALUE_ID; + std::array w132_dims = {{24, 1, 1, 88}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w132_dims.size(), w132_dims.data(), + /*data=*/w132_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w132); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w132" << std::endl; + return nullptr; + } + + alignas(16) static std::array w133_data; + uint32_t w133 = XNN_INVALID_VALUE_ID; + std::array w133_dims = {{24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w133_dims.size(), w133_dims.data(), + /*data=*/w133_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w133); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w133" << std::endl; + return nullptr; + } + + alignas(16) static std::array w134_data; + uint32_t w134 = XNN_INVALID_VALUE_ID; + std::array w134_dims = {{96, 1, 1, 24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w134_dims.size(), w134_dims.data(), + /*data=*/w134_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w134); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w134" << std::endl; + return nullptr; + } + + alignas(16) static std::array w135_data; + uint32_t w135 = 
XNN_INVALID_VALUE_ID; + std::array w135_dims = {{96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w135_dims.size(), w135_dims.data(), + /*data=*/w135_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w135); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w135" << std::endl; + return nullptr; + } + + alignas(16) static std::array w136_data; + uint32_t w136 = XNN_INVALID_VALUE_ID; + std::array w136_dims = {{1, 5, 5, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w136_dims.size(), w136_dims.data(), + /*data=*/w136_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w136); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w136" << std::endl; + return nullptr; + } + + alignas(16) static std::array w137_data; + uint32_t w137 = XNN_INVALID_VALUE_ID; + std::array w137_dims = {{96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w137_dims.size(), w137_dims.data(), + /*data=*/w137_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w137); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w137" << std::endl; + return nullptr; + } + + alignas(16) static std::array w138_data; + uint32_t w138 = XNN_INVALID_VALUE_ID; + std::array w138_dims = {{24, 1, 1, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w138_dims.size(), w138_dims.data(), + /*data=*/w138_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w138); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w138" << std::endl; + return nullptr; + } + + alignas(16) static std::array w139_data; + uint32_t w139 = XNN_INVALID_VALUE_ID; + std::array w139_dims = {{24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w139_dims.size(), w139_dims.data(), + /*data=*/w139_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w139); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor 
w139" << std::endl; + return nullptr; + } + + alignas(16) static std::array w140_data; + uint32_t w140 = XNN_INVALID_VALUE_ID; + std::array w140_dims = {{96, 1, 1, 24}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w140_dims.size(), w140_dims.data(), + /*data=*/w140_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w140); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w140" << std::endl; + return nullptr; + } + + alignas(16) static std::array w141_data; + uint32_t w141 = XNN_INVALID_VALUE_ID; + std::array w141_dims = {{96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w141_dims.size(), w141_dims.data(), + /*data=*/w141_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w141); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w141" << std::endl; + return nullptr; + } + + alignas(16) static std::array w142_data; + uint32_t w142 = XNN_INVALID_VALUE_ID; + std::array w142_dims = {{1}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w142_dims.size(), w142_dims.data(), + /*data=*/w142_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w142); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w142" << std::endl; + return nullptr; + } + + alignas(16) static std::array w143_data; + uint32_t w143 = XNN_INVALID_VALUE_ID; + std::array w143_dims = {{40, 1, 1, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w143_dims.size(), w143_dims.data(), + /*data=*/w143_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w143); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w143" << std::endl; + return nullptr; + } + + alignas(16) static std::array w144_data; + uint32_t w144 = XNN_INVALID_VALUE_ID; + std::array w144_dims = {{40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w144_dims.size(), w144_dims.data(), + /*data=*/w144_data.data(), + 
XNN_INVALID_VALUE_ID, /*flags=*/0, &w144); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w144" << std::endl; + return nullptr; + } + + alignas(16) static std::array w145_data; + uint32_t w145 = XNN_INVALID_VALUE_ID; + std::array w145_dims = {{240, 1, 1, 40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w145_dims.size(), w145_dims.data(), + /*data=*/w145_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w145); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w145" << std::endl; + return nullptr; + } + + alignas(16) static std::array w146_data; + uint32_t w146 = XNN_INVALID_VALUE_ID; + std::array w146_dims = {{240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w146_dims.size(), w146_dims.data(), + /*data=*/w146_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w146); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w146" << std::endl; + return nullptr; + } + + alignas(16) static std::array w147_data; + uint32_t w147 = XNN_INVALID_VALUE_ID; + std::array w147_dims = {{1, 5, 5, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w147_dims.size(), w147_dims.data(), + /*data=*/w147_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w147); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w147" << std::endl; + return nullptr; + } + + alignas(16) static std::array w148_data; + uint32_t w148 = XNN_INVALID_VALUE_ID; + std::array w148_dims = {{240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w148_dims.size(), w148_dims.data(), + /*data=*/w148_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w148); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w148" << std::endl; + return nullptr; + } + + alignas(16) static std::array w149_data; + uint32_t w149 = XNN_INVALID_VALUE_ID; + std::array w149_dims = {{64, 1, 1, 240}}; + status = 
xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w149_dims.size(), w149_dims.data(), + /*data=*/w149_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w149); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w149" << std::endl; + return nullptr; + } + + alignas(16) static std::array w150_data; + uint32_t w150 = XNN_INVALID_VALUE_ID; + std::array w150_dims = {{64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w150_dims.size(), w150_dims.data(), + /*data=*/w150_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w150); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w150" << std::endl; + return nullptr; + } + + alignas(16) static std::array w151_data; + uint32_t w151 = XNN_INVALID_VALUE_ID; + std::array w151_dims = {{240, 1, 1, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w151_dims.size(), w151_dims.data(), + /*data=*/w151_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w151); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w151" << std::endl; + return nullptr; + } + + alignas(16) static std::array w152_data; + uint32_t w152 = XNN_INVALID_VALUE_ID; + std::array w152_dims = {{240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w152_dims.size(), w152_dims.data(), + /*data=*/w152_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w152); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w152" << std::endl; + return nullptr; + } + + alignas(16) static std::array w153_data; + uint32_t w153 = XNN_INVALID_VALUE_ID; + std::array w153_dims = {{1}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w153_dims.size(), w153_dims.data(), + /*data=*/w153_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w153); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w153" << std::endl; + return nullptr; + } + + alignas(16) static 
std::array w154_data; + uint32_t w154 = XNN_INVALID_VALUE_ID; + std::array w154_dims = {{40, 1, 1, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w154_dims.size(), w154_dims.data(), + /*data=*/w154_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w154); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w154" << std::endl; + return nullptr; + } + + alignas(16) static std::array w155_data; + uint32_t w155 = XNN_INVALID_VALUE_ID; + std::array w155_dims = {{40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w155_dims.size(), w155_dims.data(), + /*data=*/w155_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w155); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w155" << std::endl; + return nullptr; + } + + alignas(16) static std::array w156_data; + uint32_t w156 = XNN_INVALID_VALUE_ID; + std::array w156_dims = {{240, 1, 1, 40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w156_dims.size(), w156_dims.data(), + /*data=*/w156_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w156); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w156" << std::endl; + return nullptr; + } + + alignas(16) static std::array w157_data; + uint32_t w157 = XNN_INVALID_VALUE_ID; + std::array w157_dims = {{240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w157_dims.size(), w157_dims.data(), + /*data=*/w157_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w157); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w157" << std::endl; + return nullptr; + } + + alignas(16) static std::array w158_data; + uint32_t w158 = XNN_INVALID_VALUE_ID; + std::array w158_dims = {{1, 5, 5, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w158_dims.size(), w158_dims.data(), + /*data=*/w158_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w158); + if (status != 
xnn_status_success) { + std::cerr << "failed to create tensor w158" << std::endl; + return nullptr; + } + + alignas(16) static std::array w159_data; + uint32_t w159 = XNN_INVALID_VALUE_ID; + std::array w159_dims = {{240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w159_dims.size(), w159_dims.data(), + /*data=*/w159_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w159); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w159" << std::endl; + return nullptr; + } + + alignas(16) static std::array w160_data; + uint32_t w160 = XNN_INVALID_VALUE_ID; + std::array w160_dims = {{64, 1, 1, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w160_dims.size(), w160_dims.data(), + /*data=*/w160_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w160); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w160" << std::endl; + return nullptr; + } + + alignas(16) static std::array w161_data; + uint32_t w161 = XNN_INVALID_VALUE_ID; + std::array w161_dims = {{64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w161_dims.size(), w161_dims.data(), + /*data=*/w161_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w161); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w161" << std::endl; + return nullptr; + } + + alignas(16) static std::array w162_data; + uint32_t w162 = XNN_INVALID_VALUE_ID; + std::array w162_dims = {{240, 1, 1, 64}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w162_dims.size(), w162_dims.data(), + /*data=*/w162_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w162); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w162" << std::endl; + return nullptr; + } + + alignas(16) static std::array w163_data; + uint32_t w163 = XNN_INVALID_VALUE_ID; + std::array w163_dims = {{240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + 
w163_dims.size(), w163_dims.data(), + /*data=*/w163_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w163); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w163" << std::endl; + return nullptr; + } + + alignas(16) static std::array w164_data; + uint32_t w164 = XNN_INVALID_VALUE_ID; + std::array w164_dims = {{1}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w164_dims.size(), w164_dims.data(), + /*data=*/w164_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w164); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w164" << std::endl; + return nullptr; + } + + alignas(16) static std::array w165_data; + uint32_t w165 = XNN_INVALID_VALUE_ID; + std::array w165_dims = {{40, 1, 1, 240}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w165_dims.size(), w165_dims.data(), + /*data=*/w165_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w165); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w165" << std::endl; + return nullptr; + } + + alignas(16) static std::array w166_data; + uint32_t w166 = XNN_INVALID_VALUE_ID; + std::array w166_dims = {{40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w166_dims.size(), w166_dims.data(), + /*data=*/w166_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w166); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w166" << std::endl; + return nullptr; + } + + alignas(16) static std::array w167_data; + uint32_t w167 = XNN_INVALID_VALUE_ID; + std::array w167_dims = {{120, 1, 1, 40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w167_dims.size(), w167_dims.data(), + /*data=*/w167_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w167); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w167" << std::endl; + return nullptr; + } + + alignas(16) static std::array w168_data; + uint32_t w168 = 
XNN_INVALID_VALUE_ID; + std::array w168_dims = {{120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w168_dims.size(), w168_dims.data(), + /*data=*/w168_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w168); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w168" << std::endl; + return nullptr; + } + + alignas(16) static std::array w169_data; + uint32_t w169 = XNN_INVALID_VALUE_ID; + std::array w169_dims = {{1, 5, 5, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w169_dims.size(), w169_dims.data(), + /*data=*/w169_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w169); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w169" << std::endl; + return nullptr; + } + + alignas(16) static std::array w170_data; + uint32_t w170 = XNN_INVALID_VALUE_ID; + std::array w170_dims = {{120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w170_dims.size(), w170_dims.data(), + /*data=*/w170_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w170); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w170" << std::endl; + return nullptr; + } + + alignas(16) static std::array w171_data; + uint32_t w171 = XNN_INVALID_VALUE_ID; + std::array w171_dims = {{32, 1, 1, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w171_dims.size(), w171_dims.data(), + /*data=*/w171_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w171); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w171" << std::endl; + return nullptr; + } + + alignas(16) static std::array w172_data; + uint32_t w172 = XNN_INVALID_VALUE_ID; + std::array w172_dims = {{32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w172_dims.size(), w172_dims.data(), + /*data=*/w172_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w172); + if (status != xnn_status_success) { + std::cerr << "failed to create 
tensor w172" << std::endl; + return nullptr; + } + + alignas(16) static std::array w173_data; + uint32_t w173 = XNN_INVALID_VALUE_ID; + std::array w173_dims = {{120, 1, 1, 32}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w173_dims.size(), w173_dims.data(), + /*data=*/w173_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w173); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w173" << std::endl; + return nullptr; + } + + alignas(16) static std::array w174_data; + uint32_t w174 = XNN_INVALID_VALUE_ID; + std::array w174_dims = {{120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w174_dims.size(), w174_dims.data(), + /*data=*/w174_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w174); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w174" << std::endl; + return nullptr; + } + + alignas(16) static std::array w175_data; + uint32_t w175 = XNN_INVALID_VALUE_ID; + std::array w175_dims = {{1}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w175_dims.size(), w175_dims.data(), + /*data=*/w175_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w175); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w175" << std::endl; + return nullptr; + } + + alignas(16) static std::array w176_data; + uint32_t w176 = XNN_INVALID_VALUE_ID; + std::array w176_dims = {{48, 1, 1, 120}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w176_dims.size(), w176_dims.data(), + /*data=*/w176_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w176); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w176" << std::endl; + return nullptr; + } + + alignas(16) static std::array w177_data; + uint32_t w177 = XNN_INVALID_VALUE_ID; + std::array w177_dims = {{48}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w177_dims.size(), w177_dims.data(), + /*data=*/w177_data.data(), + 
XNN_INVALID_VALUE_ID, /*flags=*/0, &w177); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w177" << std::endl; + return nullptr; + } + + alignas(16) static std::array w178_data; + uint32_t w178 = XNN_INVALID_VALUE_ID; + std::array w178_dims = {{144, 1, 1, 48}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w178_dims.size(), w178_dims.data(), + /*data=*/w178_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w178); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w178" << std::endl; + return nullptr; + } + + alignas(16) static std::array w179_data; + uint32_t w179 = XNN_INVALID_VALUE_ID; + std::array w179_dims = {{144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w179_dims.size(), w179_dims.data(), + /*data=*/w179_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w179); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w179" << std::endl; + return nullptr; + } + + alignas(16) static std::array w180_data; + uint32_t w180 = XNN_INVALID_VALUE_ID; + std::array w180_dims = {{1, 5, 5, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w180_dims.size(), w180_dims.data(), + /*data=*/w180_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w180); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w180" << std::endl; + return nullptr; + } + + alignas(16) static std::array w181_data; + uint32_t w181 = XNN_INVALID_VALUE_ID; + std::array w181_dims = {{144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w181_dims.size(), w181_dims.data(), + /*data=*/w181_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w181); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w181" << std::endl; + return nullptr; + } + + alignas(16) static std::array w182_data; + uint32_t w182 = XNN_INVALID_VALUE_ID; + std::array w182_dims = {{40, 1, 1, 144}}; + status = 
xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w182_dims.size(), w182_dims.data(), + /*data=*/w182_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w182); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w182" << std::endl; + return nullptr; + } + + alignas(16) static std::array w183_data; + uint32_t w183 = XNN_INVALID_VALUE_ID; + std::array w183_dims = {{40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w183_dims.size(), w183_dims.data(), + /*data=*/w183_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w183); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w183" << std::endl; + return nullptr; + } + + alignas(16) static std::array w184_data; + uint32_t w184 = XNN_INVALID_VALUE_ID; + std::array w184_dims = {{144, 1, 1, 40}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w184_dims.size(), w184_dims.data(), + /*data=*/w184_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w184); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w184" << std::endl; + return nullptr; + } + + alignas(16) static std::array w185_data; + uint32_t w185 = XNN_INVALID_VALUE_ID; + std::array w185_dims = {{144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w185_dims.size(), w185_dims.data(), + /*data=*/w185_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w185); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w185" << std::endl; + return nullptr; + } + + alignas(16) static std::array w186_data; + uint32_t w186 = XNN_INVALID_VALUE_ID; + std::array w186_dims = {{1}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w186_dims.size(), w186_dims.data(), + /*data=*/w186_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w186); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w186" << std::endl; + return nullptr; + } + + alignas(16) static 
std::array w187_data; + uint32_t w187 = XNN_INVALID_VALUE_ID; + std::array w187_dims = {{48, 1, 1, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w187_dims.size(), w187_dims.data(), + /*data=*/w187_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w187); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w187" << std::endl; + return nullptr; + } + + alignas(16) static std::array w188_data; + uint32_t w188 = XNN_INVALID_VALUE_ID; + std::array w188_dims = {{48}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w188_dims.size(), w188_dims.data(), + /*data=*/w188_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w188); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w188" << std::endl; + return nullptr; + } + + alignas(16) static std::array w189_data; + uint32_t w189 = XNN_INVALID_VALUE_ID; + std::array w189_dims = {{288, 1, 1, 48}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w189_dims.size(), w189_dims.data(), + /*data=*/w189_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w189); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w189" << std::endl; + return nullptr; + } + + alignas(16) static std::array w190_data; + uint32_t w190 = XNN_INVALID_VALUE_ID; + std::array w190_dims = {{288}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w190_dims.size(), w190_dims.data(), + /*data=*/w190_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w190); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w190" << std::endl; + return nullptr; + } + + alignas(16) static std::array w191_data; + uint32_t w191 = XNN_INVALID_VALUE_ID; + std::array w191_dims = {{1, 5, 5, 288}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w191_dims.size(), w191_dims.data(), + /*data=*/w191_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w191); + if (status != 
xnn_status_success) { + std::cerr << "failed to create tensor w191" << std::endl; + return nullptr; + } + + alignas(16) static std::array w192_data; + uint32_t w192 = XNN_INVALID_VALUE_ID; + std::array w192_dims = {{288}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w192_dims.size(), w192_dims.data(), + /*data=*/w192_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w192); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w192" << std::endl; + return nullptr; + } + + alignas(16) static std::array w193_data; + uint32_t w193 = XNN_INVALID_VALUE_ID; + std::array w193_dims = {{72, 1, 1, 288}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w193_dims.size(), w193_dims.data(), + /*data=*/w193_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w193); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w193" << std::endl; + return nullptr; + } + + alignas(16) static std::array w194_data; + uint32_t w194 = XNN_INVALID_VALUE_ID; + std::array w194_dims = {{72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w194_dims.size(), w194_dims.data(), + /*data=*/w194_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w194); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w194" << std::endl; + return nullptr; + } + + alignas(16) static std::array w195_data; + uint32_t w195 = XNN_INVALID_VALUE_ID; + std::array w195_dims = {{288, 1, 1, 72}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w195_dims.size(), w195_dims.data(), + /*data=*/w195_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w195); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w195" << std::endl; + return nullptr; + } + + alignas(16) static std::array w196_data; + uint32_t w196 = XNN_INVALID_VALUE_ID; + std::array w196_dims = {{288}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + 
w196_dims.size(), w196_dims.data(), + /*data=*/w196_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w196); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w196" << std::endl; + return nullptr; + } + + alignas(16) static std::array w197_data; + uint32_t w197 = XNN_INVALID_VALUE_ID; + std::array w197_dims = {{1}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w197_dims.size(), w197_dims.data(), + /*data=*/w197_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w197); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w197" << std::endl; + return nullptr; + } + + alignas(16) static std::array w198_data; + uint32_t w198 = XNN_INVALID_VALUE_ID; + std::array w198_dims = {{96, 1, 1, 288}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w198_dims.size(), w198_dims.data(), + /*data=*/w198_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w198); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w198" << std::endl; + return nullptr; + } + + alignas(16) static std::array w199_data; + uint32_t w199 = XNN_INVALID_VALUE_ID; + std::array w199_dims = {{96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w199_dims.size(), w199_dims.data(), + /*data=*/w199_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w199); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w199" << std::endl; + return nullptr; + } + + alignas(16) static std::array w200_data; + uint32_t w200 = XNN_INVALID_VALUE_ID; + std::array w200_dims = {{576, 1, 1, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w200_dims.size(), w200_dims.data(), + /*data=*/w200_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w200); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w200" << std::endl; + return nullptr; + } + + alignas(16) static std::array w201_data; + uint32_t w201 = 
XNN_INVALID_VALUE_ID; + std::array w201_dims = {{576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w201_dims.size(), w201_dims.data(), + /*data=*/w201_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w201); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w201" << std::endl; + return nullptr; + } + + alignas(16) static std::array w202_data; + uint32_t w202 = XNN_INVALID_VALUE_ID; + std::array w202_dims = {{1, 5, 5, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w202_dims.size(), w202_dims.data(), + /*data=*/w202_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w202); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w202" << std::endl; + return nullptr; + } + + alignas(16) static std::array w203_data; + uint32_t w203 = XNN_INVALID_VALUE_ID; + std::array w203_dims = {{576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w203_dims.size(), w203_dims.data(), + /*data=*/w203_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w203); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w203" << std::endl; + return nullptr; + } + + alignas(16) static std::array w204_data; + uint32_t w204 = XNN_INVALID_VALUE_ID; + std::array w204_dims = {{144, 1, 1, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w204_dims.size(), w204_dims.data(), + /*data=*/w204_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w204); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w204" << std::endl; + return nullptr; + } + + alignas(16) static std::array w205_data; + uint32_t w205 = XNN_INVALID_VALUE_ID; + std::array w205_dims = {{144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w205_dims.size(), w205_dims.data(), + /*data=*/w205_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w205); + if (status != xnn_status_success) { + std::cerr << "failed to create 
tensor w205" << std::endl; + return nullptr; + } + + alignas(16) static std::array w206_data; + uint32_t w206 = XNN_INVALID_VALUE_ID; + std::array w206_dims = {{576, 1, 1, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w206_dims.size(), w206_dims.data(), + /*data=*/w206_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w206); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w206" << std::endl; + return nullptr; + } + + alignas(16) static std::array w207_data; + uint32_t w207 = XNN_INVALID_VALUE_ID; + std::array w207_dims = {{576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w207_dims.size(), w207_dims.data(), + /*data=*/w207_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w207); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w207" << std::endl; + return nullptr; + } + + alignas(16) static std::array w208_data; + uint32_t w208 = XNN_INVALID_VALUE_ID; + std::array w208_dims = {{1}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w208_dims.size(), w208_dims.data(), + /*data=*/w208_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w208); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w208" << std::endl; + return nullptr; + } + + alignas(16) static std::array w209_data; + uint32_t w209 = XNN_INVALID_VALUE_ID; + std::array w209_dims = {{96, 1, 1, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w209_dims.size(), w209_dims.data(), + /*data=*/w209_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w209); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w209" << std::endl; + return nullptr; + } + + alignas(16) static std::array w210_data; + uint32_t w210 = XNN_INVALID_VALUE_ID; + std::array w210_dims = {{96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w210_dims.size(), w210_dims.data(), + /*data=*/w210_data.data(), + 
XNN_INVALID_VALUE_ID, /*flags=*/0, &w210); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w210" << std::endl; + return nullptr; + } + + alignas(16) static std::array w211_data; + uint32_t w211 = XNN_INVALID_VALUE_ID; + std::array w211_dims = {{576, 1, 1, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w211_dims.size(), w211_dims.data(), + /*data=*/w211_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w211); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w211" << std::endl; + return nullptr; + } + + alignas(16) static std::array w212_data; + uint32_t w212 = XNN_INVALID_VALUE_ID; + std::array w212_dims = {{576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w212_dims.size(), w212_dims.data(), + /*data=*/w212_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w212); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w212" << std::endl; + return nullptr; + } + + alignas(16) static std::array w213_data; + uint32_t w213 = XNN_INVALID_VALUE_ID; + std::array w213_dims = {{1, 5, 5, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w213_dims.size(), w213_dims.data(), + /*data=*/w213_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w213); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w213" << std::endl; + return nullptr; + } + + alignas(16) static std::array w214_data; + uint32_t w214 = XNN_INVALID_VALUE_ID; + std::array w214_dims = {{576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w214_dims.size(), w214_dims.data(), + /*data=*/w214_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w214); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w214" << std::endl; + return nullptr; + } + + alignas(16) static std::array w215_data; + uint32_t w215 = XNN_INVALID_VALUE_ID; + std::array w215_dims = {{144, 1, 1, 576}}; + status = 
xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w215_dims.size(), w215_dims.data(), + /*data=*/w215_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w215); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w215" << std::endl; + return nullptr; + } + + alignas(16) static std::array w216_data; + uint32_t w216 = XNN_INVALID_VALUE_ID; + std::array w216_dims = {{144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w216_dims.size(), w216_dims.data(), + /*data=*/w216_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w216); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w216" << std::endl; + return nullptr; + } + + alignas(16) static std::array w217_data; + uint32_t w217 = XNN_INVALID_VALUE_ID; + std::array w217_dims = {{576, 1, 1, 144}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w217_dims.size(), w217_dims.data(), + /*data=*/w217_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w217); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w217" << std::endl; + return nullptr; + } + + alignas(16) static std::array w218_data; + uint32_t w218 = XNN_INVALID_VALUE_ID; + std::array w218_dims = {{576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w218_dims.size(), w218_dims.data(), + /*data=*/w218_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w218); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w218" << std::endl; + return nullptr; + } + + alignas(16) static std::array w219_data; + uint32_t w219 = XNN_INVALID_VALUE_ID; + std::array w219_dims = {{1}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w219_dims.size(), w219_dims.data(), + /*data=*/w219_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w219); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w219" << std::endl; + return nullptr; + } + + alignas(16) static 
std::array w220_data; + uint32_t w220 = XNN_INVALID_VALUE_ID; + std::array w220_dims = {{96, 1, 1, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w220_dims.size(), w220_dims.data(), + /*data=*/w220_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w220); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w220" << std::endl; + return nullptr; + } + + alignas(16) static std::array w221_data; + uint32_t w221 = XNN_INVALID_VALUE_ID; + std::array w221_dims = {{96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w221_dims.size(), w221_dims.data(), + /*data=*/w221_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w221); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w221" << std::endl; + return nullptr; + } + + alignas(16) static std::array w222_data; + uint32_t w222 = XNN_INVALID_VALUE_ID; + std::array w222_dims = {{576, 1, 1, 96}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w222_dims.size(), w222_dims.data(), + /*data=*/w222_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w222); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w222" << std::endl; + return nullptr; + } + + alignas(16) static std::array w223_data; + uint32_t w223 = XNN_INVALID_VALUE_ID; + std::array w223_dims = {{576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w223_dims.size(), w223_dims.data(), + /*data=*/w223_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w223); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w223" << std::endl; + return nullptr; + } + + alignas(16) static std::array w224_data; + uint32_t w224 = XNN_INVALID_VALUE_ID; + std::array w224_dims = {{1024, 1, 1, 576}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w224_dims.size(), w224_dims.data(), + /*data=*/w224_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w224); + if (status != 
xnn_status_success) { + std::cerr << "failed to create tensor w224" << std::endl; + return nullptr; + } + + alignas(16) static std::array w225_data; + uint32_t w225 = XNN_INVALID_VALUE_ID; + std::array w225_dims = {{1024}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w225_dims.size(), w225_dims.data(), + /*data=*/w225_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w225); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w225" << std::endl; + return nullptr; + } + + alignas(16) static std::array w226_data; + uint32_t w226 = XNN_INVALID_VALUE_ID; + std::array w226_dims = {{1001, 1, 1, 1024}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w226_dims.size(), w226_dims.data(), + /*data=*/w226_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w226); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w226" << std::endl; + return nullptr; + } + + alignas(16) static std::array w227_data; + uint32_t w227 = XNN_INVALID_VALUE_ID; + std::array w227_dims = {{1001}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, + w227_dims.size(), w227_dims.data(), + /*data=*/w227_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w227); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w227" << std::endl; + return nullptr; + } + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); + std::generate(w111_data.begin(), w111_data.end(), std::ref(f32rng)); + std::generate(w112_data.begin(), w112_data.end(), std::ref(f32rng)); + std::generate(w113_data.begin(), w113_data.end(), std::ref(f32rng)); + std::generate(w114_data.begin(), w114_data.end(), std::ref(f32rng)); + std::generate(w115_data.begin(), w115_data.end(), std::ref(f32rng)); + std::generate(w116_data.begin(), w116_data.end(), std::ref(f32rng)); + 
std::generate(w117_data.begin(), w117_data.end(), std::ref(f32rng)); + std::generate(w118_data.begin(), w118_data.end(), std::ref(f32rng)); + std::generate(w119_data.begin(), w119_data.end(), std::ref(f32rng)); + std::generate(w120_data.begin(), w120_data.end(), std::ref(f32rng)); + std::generate(w121_data.begin(), w121_data.end(), std::ref(f32rng)); + std::generate(w122_data.begin(), w122_data.end(), std::ref(f32rng)); + std::generate(w123_data.begin(), w123_data.end(), std::ref(f32rng)); + std::generate(w124_data.begin(), w124_data.end(), std::ref(f32rng)); + std::generate(w125_data.begin(), w125_data.end(), std::ref(f32rng)); + std::generate(w126_data.begin(), w126_data.end(), std::ref(f32rng)); + std::generate(w127_data.begin(), w127_data.end(), std::ref(f32rng)); + std::generate(w128_data.begin(), w128_data.end(), std::ref(f32rng)); + std::generate(w129_data.begin(), w129_data.end(), std::ref(f32rng)); + std::generate(w130_data.begin(), w130_data.end(), std::ref(f32rng)); + std::generate(w131_data.begin(), w131_data.end(), std::ref(f32rng)); + std::generate(w132_data.begin(), w132_data.end(), std::ref(f32rng)); + std::generate(w133_data.begin(), w133_data.end(), std::ref(f32rng)); + std::generate(w134_data.begin(), w134_data.end(), std::ref(f32rng)); + std::generate(w135_data.begin(), w135_data.end(), std::ref(f32rng)); + std::generate(w136_data.begin(), w136_data.end(), std::ref(f32rng)); + std::generate(w137_data.begin(), w137_data.end(), std::ref(f32rng)); + std::generate(w138_data.begin(), w138_data.end(), std::ref(f32rng)); + std::generate(w139_data.begin(), w139_data.end(), std::ref(f32rng)); + std::generate(w140_data.begin(), w140_data.end(), std::ref(f32rng)); + std::generate(w141_data.begin(), w141_data.end(), std::ref(f32rng)); + std::generate(w142_data.begin(), w142_data.end(), std::ref(f32rng)); + std::generate(w143_data.begin(), w143_data.end(), std::ref(f32rng)); + std::generate(w144_data.begin(), w144_data.end(), std::ref(f32rng)); + 
std::generate(w145_data.begin(), w145_data.end(), std::ref(f32rng)); + std::generate(w146_data.begin(), w146_data.end(), std::ref(f32rng)); + std::generate(w147_data.begin(), w147_data.end(), std::ref(f32rng)); + std::generate(w148_data.begin(), w148_data.end(), std::ref(f32rng)); + std::generate(w149_data.begin(), w149_data.end(), std::ref(f32rng)); + std::generate(w150_data.begin(), w150_data.end(), std::ref(f32rng)); + std::generate(w151_data.begin(), w151_data.end(), std::ref(f32rng)); + std::generate(w152_data.begin(), w152_data.end(), std::ref(f32rng)); + std::generate(w153_data.begin(), w153_data.end(), std::ref(f32rng)); + std::generate(w154_data.begin(), w154_data.end(), std::ref(f32rng)); + std::generate(w155_data.begin(), w155_data.end(), std::ref(f32rng)); + std::generate(w156_data.begin(), w156_data.end(), std::ref(f32rng)); + std::generate(w157_data.begin(), w157_data.end(), std::ref(f32rng)); + std::generate(w158_data.begin(), w158_data.end(), std::ref(f32rng)); + std::generate(w159_data.begin(), w159_data.end(), std::ref(f32rng)); + std::generate(w160_data.begin(), w160_data.end(), std::ref(f32rng)); + std::generate(w161_data.begin(), w161_data.end(), std::ref(f32rng)); + std::generate(w162_data.begin(), w162_data.end(), std::ref(f32rng)); + std::generate(w163_data.begin(), w163_data.end(), std::ref(f32rng)); + std::generate(w164_data.begin(), w164_data.end(), std::ref(f32rng)); + std::generate(w165_data.begin(), w165_data.end(), std::ref(f32rng)); + std::generate(w166_data.begin(), w166_data.end(), std::ref(f32rng)); + std::generate(w167_data.begin(), w167_data.end(), std::ref(f32rng)); + std::generate(w168_data.begin(), w168_data.end(), std::ref(f32rng)); + std::generate(w169_data.begin(), w169_data.end(), std::ref(f32rng)); + std::generate(w170_data.begin(), w170_data.end(), std::ref(f32rng)); + std::generate(w171_data.begin(), w171_data.end(), std::ref(f32rng)); + std::generate(w172_data.begin(), w172_data.end(), std::ref(f32rng)); + 
std::generate(w173_data.begin(), w173_data.end(), std::ref(f32rng)); + std::generate(w174_data.begin(), w174_data.end(), std::ref(f32rng)); + std::generate(w175_data.begin(), w175_data.end(), std::ref(f32rng)); + std::generate(w176_data.begin(), w176_data.end(), std::ref(f32rng)); + std::generate(w177_data.begin(), w177_data.end(), std::ref(f32rng)); + std::generate(w178_data.begin(), w178_data.end(), std::ref(f32rng)); + std::generate(w179_data.begin(), w179_data.end(), std::ref(f32rng)); + std::generate(w180_data.begin(), w180_data.end(), std::ref(f32rng)); + std::generate(w181_data.begin(), w181_data.end(), std::ref(f32rng)); + std::generate(w182_data.begin(), w182_data.end(), std::ref(f32rng)); + std::generate(w183_data.begin(), w183_data.end(), std::ref(f32rng)); + std::generate(w184_data.begin(), w184_data.end(), std::ref(f32rng)); + std::generate(w185_data.begin(), w185_data.end(), std::ref(f32rng)); + std::generate(w186_data.begin(), w186_data.end(), std::ref(f32rng)); + std::generate(w187_data.begin(), w187_data.end(), std::ref(f32rng)); + std::generate(w188_data.begin(), w188_data.end(), std::ref(f32rng)); + std::generate(w189_data.begin(), w189_data.end(), std::ref(f32rng)); + std::generate(w190_data.begin(), w190_data.end(), std::ref(f32rng)); + std::generate(w191_data.begin(), w191_data.end(), std::ref(f32rng)); + std::generate(w192_data.begin(), w192_data.end(), std::ref(f32rng)); + std::generate(w193_data.begin(), w193_data.end(), std::ref(f32rng)); + std::generate(w194_data.begin(), w194_data.end(), std::ref(f32rng)); + std::generate(w195_data.begin(), w195_data.end(), std::ref(f32rng)); + std::generate(w196_data.begin(), w196_data.end(), std::ref(f32rng)); + std::generate(w197_data.begin(), w197_data.end(), std::ref(f32rng)); + std::generate(w198_data.begin(), w198_data.end(), std::ref(f32rng)); + std::generate(w199_data.begin(), w199_data.end(), std::ref(f32rng)); + std::generate(w200_data.begin(), w200_data.end(), std::ref(f32rng)); + 
std::generate(w201_data.begin(), w201_data.end(), std::ref(f32rng)); + std::generate(w202_data.begin(), w202_data.end(), std::ref(f32rng)); + std::generate(w203_data.begin(), w203_data.end(), std::ref(f32rng)); + std::generate(w204_data.begin(), w204_data.end(), std::ref(f32rng)); + std::generate(w205_data.begin(), w205_data.end(), std::ref(f32rng)); + std::generate(w206_data.begin(), w206_data.end(), std::ref(f32rng)); + std::generate(w207_data.begin(), w207_data.end(), std::ref(f32rng)); + std::generate(w208_data.begin(), w208_data.end(), std::ref(f32rng)); + std::generate(w209_data.begin(), w209_data.end(), std::ref(f32rng)); + std::generate(w210_data.begin(), w210_data.end(), std::ref(f32rng)); + std::generate(w211_data.begin(), w211_data.end(), std::ref(f32rng)); + std::generate(w212_data.begin(), w212_data.end(), std::ref(f32rng)); + std::generate(w213_data.begin(), w213_data.end(), std::ref(f32rng)); + std::generate(w214_data.begin(), w214_data.end(), std::ref(f32rng)); + std::generate(w215_data.begin(), w215_data.end(), std::ref(f32rng)); + std::generate(w216_data.begin(), w216_data.end(), std::ref(f32rng)); + std::generate(w217_data.begin(), w217_data.end(), std::ref(f32rng)); + std::generate(w218_data.begin(), w218_data.end(), std::ref(f32rng)); + std::generate(w219_data.begin(), w219_data.end(), std::ref(f32rng)); + std::generate(w220_data.begin(), w220_data.end(), std::ref(f32rng)); + std::generate(w221_data.begin(), w221_data.end(), std::ref(f32rng)); + std::generate(w222_data.begin(), w222_data.end(), std::ref(f32rng)); + std::generate(w223_data.begin(), w223_data.end(), std::ref(f32rng)); + std::generate(w224_data.begin(), w224_data.end(), std::ref(f32rng)); + std::generate(w225_data.begin(), w225_data.end(), std::ref(f32rng)); + std::generate(w226_data.begin(), w226_data.end(), std::ref(f32rng)); + std::generate(w227_data.begin(), w227_data.end(), std::ref(f32rng)); + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, 
/*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/3, + /*group_output_channels=*/16, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v0, + w111, + w112, + v1, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #0" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v1, + v2, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #1" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/16, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v2, + w113, + w114, + v3, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #2" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/56, /*pooling_width=*/56, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v3, + v4, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #3" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, 
/*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/16, + /*group_output_channels=*/8, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v4, + w115, + w116, + v5, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #4" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/8, + /*group_output_channels=*/16, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v5, + w117, + w118, + v6, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #5" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v6, + w119, + v7, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #6" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v3, + v7, + v8, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #7" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/16, + /*group_output_channels=*/16, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + 
v8, + w120, + w121, + v9, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #8" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/16, + /*group_output_channels=*/72, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v9, + w122, + w123, + v10, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #9" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/72, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v10, + w124, + w125, + v11, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #10" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/72, + /*group_output_channels=*/24, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v11, + w126, + w127, + v12, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #11" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + 
/*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/24, + /*group_output_channels=*/88, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v12, + w128, + w129, + v13, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #12" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/88, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v13, + w130, + w131, + v14, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #13" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/88, + /*group_output_channels=*/24, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v14, + w132, + w133, + v15, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #14" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v15, + v12, + v16, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #15" << 
std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/24, + /*group_output_channels=*/96, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v16, + w134, + w135, + v17, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #16" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v17, + v18, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #17" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/2, /*padding_bottom=*/2, /*padding_left=*/1, + /*kernel_height=*/5, /*kernel_width=*/5, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/96, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v18, + w136, + w137, + v19, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #18" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v19, + v20, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #19" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/14, /*pooling_width=*/14, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v20, + v21, + 
/*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #20" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/96, + /*group_output_channels=*/24, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v21, + w138, + w139, + v22, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #21" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/24, + /*group_output_channels=*/96, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v22, + w140, + w141, + v23, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #22" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v23, + w142, + v24, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #23" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v20, + v24, + v25, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #24" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, 
/*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/96, + /*group_output_channels=*/40, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v25, + w143, + w144, + v26, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #25" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/40, + /*group_output_channels=*/240, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v26, + w145, + w146, + v27, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #26" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v27, + v28, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #27" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/2, /*padding_right=*/2, /*padding_bottom=*/2, /*padding_left=*/2, + /*kernel_height=*/5, /*kernel_width=*/5, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/240, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v28, + w147, + w148, + v29, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #28" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v29, + 
v30, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #29" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/14, /*pooling_width=*/14, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v30, + v31, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #30" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/240, + /*group_output_channels=*/64, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v31, + w149, + w150, + v32, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #31" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/64, + /*group_output_channels=*/240, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v32, + w151, + w152, + v33, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #32" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v33, + w153, + v34, + /*flags=*/0); + if (status != xnn_status_success) { + 
std::cerr << "failed to create node #33" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v30, + v34, + v35, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #34" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/240, + /*group_output_channels=*/40, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v35, + w154, + w155, + v36, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #35" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v36, + v26, + v37, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #36" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/40, + /*group_output_channels=*/240, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v37, + w156, + w157, + v38, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #37" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v38, + v39, + /*flags=*/0); + if 
(status != xnn_status_success) { + std::cerr << "failed to create node #38" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/2, /*padding_right=*/2, /*padding_bottom=*/2, /*padding_left=*/2, + /*kernel_height=*/5, /*kernel_width=*/5, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/240, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v39, + w158, + w159, + v40, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #39" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v40, + v41, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #40" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/14, /*pooling_width=*/14, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v41, + v42, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #41" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/240, + /*group_output_channels=*/64, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v42, + w160, + w161, + v43, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #42" << std::endl; + return nullptr; + } + + status = 
xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/64, + /*group_output_channels=*/240, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v43, + w162, + w163, + v44, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #43" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v44, + w164, + v45, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #44" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v41, + v45, + v46, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #45" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/240, + /*group_output_channels=*/40, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v46, + w165, + w166, + v47, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #46" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v47, + v37, + v48, + /*flags=*/0); + if (status != xnn_status_success) { + 
std::cerr << "failed to create node #47" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/40, + /*group_output_channels=*/120, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v48, + w167, + w168, + v49, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #48" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v49, + v50, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #49" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/2, /*padding_right=*/2, /*padding_bottom=*/2, /*padding_left=*/2, + /*kernel_height=*/5, /*kernel_width=*/5, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/120, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v50, + w169, + w170, + v51, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #50" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v51, + v52, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #51" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/14, /*pooling_width=*/14, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), 
/*output_max=*/std::numeric_limits::infinity(), + v52, + v53, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #52" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/120, + /*group_output_channels=*/32, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v53, + w171, + w172, + v54, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #53" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/32, + /*group_output_channels=*/120, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v54, + w173, + w174, + v55, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #54" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v55, + w175, + v56, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #55" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v52, + v56, + v57, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #56" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( 
+ subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/120, + /*group_output_channels=*/48, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v57, + w176, + w177, + v58, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #57" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/48, + /*group_output_channels=*/144, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v58, + w178, + w179, + v59, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #58" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v59, + v60, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #59" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/2, /*padding_right=*/2, /*padding_bottom=*/2, /*padding_left=*/2, + /*kernel_height=*/5, /*kernel_width=*/5, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/144, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v60, + w180, + w181, + v61, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #60" << std::endl; + return nullptr; + } 
+ + status = xnn_define_hardswish( + subgraph, + v61, + v62, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #61" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/14, /*pooling_width=*/14, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v62, + v63, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #62" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/144, + /*group_output_channels=*/40, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v63, + w182, + w183, + v64, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #63" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/40, + /*group_output_channels=*/144, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v64, + w184, + w185, + v65, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #64" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v65, + w186, + v66, 
+ /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #65" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v62, + v66, + v67, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #66" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/144, + /*group_output_channels=*/48, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v67, + w187, + w188, + v68, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #67" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v68, + v58, + v69, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #68" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/48, + /*group_output_channels=*/288, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v69, + w189, + w190, + v70, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #69" << std::endl; + return nullptr; + } + + status = 
xnn_define_hardswish( + subgraph, + v70, + v71, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #70" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/2, /*padding_bottom=*/2, /*padding_left=*/1, + /*kernel_height=*/5, /*kernel_width=*/5, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/288, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v71, + w191, + w192, + v72, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #71" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v72, + v73, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #72" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/7, /*pooling_width=*/7, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v73, + v74, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #73" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/288, + /*group_output_channels=*/72, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v74, + w193, + w194, + v75, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create 
node #74" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/72, + /*group_output_channels=*/288, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v75, + w195, + w196, + v76, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #75" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v76, + w197, + v77, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #76" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v73, + v77, + v78, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #77" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/288, + /*group_output_channels=*/96, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v78, + w198, + w199, + v79, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #78" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, 
/*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/96, + /*group_output_channels=*/576, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v79, + w200, + w201, + v80, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #79" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v80, + v81, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #80" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/2, /*padding_right=*/2, /*padding_bottom=*/2, /*padding_left=*/2, + /*kernel_height=*/5, /*kernel_width=*/5, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/576, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v81, + w202, + w203, + v82, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #81" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v82, + v83, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #82" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/7, /*pooling_width=*/7, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v83, + v84, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #83" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, 
/*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/576, + /*group_output_channels=*/144, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v84, + w204, + w205, + v85, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #84" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/144, + /*group_output_channels=*/576, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v85, + w206, + w207, + v86, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #85" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v86, + w208, + v87, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #86" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v83, + v87, + v88, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #87" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + 
/*group_input_channels=*/576, + /*group_output_channels=*/96, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v88, + w209, + w210, + v89, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #88" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v89, + v79, + v90, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #89" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/96, + /*group_output_channels=*/576, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v90, + w211, + w212, + v91, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #90" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v91, + v92, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #91" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/2, /*padding_right=*/2, /*padding_bottom=*/2, /*padding_left=*/2, + /*kernel_height=*/5, /*kernel_width=*/5, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/576, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v92, + w213, + w214, + v93, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node 
#92" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v93, + v94, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #93" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/7, /*pooling_width=*/7, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v94, + v95, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #94" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/576, + /*group_output_channels=*/144, + /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), + v95, + w215, + w216, + v96, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #95" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/144, + /*group_output_channels=*/576, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v96, + w217, + w218, + v97, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #96" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + 
/*output_max=*/std::numeric_limits::infinity(), + v97, + w219, + v98, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #97" << std::endl; + return nullptr; + } + + status = xnn_define_multiply2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v94, + v98, + v99, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #98" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/576, + /*group_output_channels=*/96, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v99, + w220, + w221, + v100, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #99" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v100, + v90, + v101, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #100" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/96, + /*group_output_channels=*/576, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v101, + w222, + w223, + v102, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to 
create node #101" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v102, + v103, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #102" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/7, /*pooling_width=*/7, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v103, + v104, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #103" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/576, + /*group_output_channels=*/1024, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v104, + w224, + w225, + v105, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #104" << std::endl; + return nullptr; + } + + status = xnn_define_hardswish( + subgraph, + v105, + v106, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #105" << std::endl; + return nullptr; + } + + status = xnn_define_average_pooling_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*pooling_height=*/1, /*pooling_width=*/1, + /*stride_height=*/1, /*stride_width=*/1, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v106, + v107, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #106" << 
std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/1024, + /*group_output_channels=*/1001, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v107, + w226, + w227, + v108, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #107" << std::endl; + return nullptr; + } + + status = xnn_define_copy( + subgraph, + v108, + v109, + 0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #108" << std::endl; + return nullptr; + } + + status = xnn_define_softmax( + subgraph, + v109, + v110, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #109" << std::endl; + return nullptr; + } + + return subgraph; +} + +} // namespace models diff --git a/bench/models/models.h b/bench/models/models.h new file mode 100644 index 00000000000..0e356d3f0fb --- /dev/null +++ b/bench/models/models.h @@ -0,0 +1,19 @@ +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#pragma once + +#include "xnnpack.h" + +namespace models { + +xnn_subgraph_t FP32MobileNetV1(); +xnn_subgraph_t FP32MobileNetV2(); +xnn_subgraph_t FP32MobileNetV3Large(); +xnn_subgraph_t FP32MobileNetV3Small(); + +xnn_subgraph_t QS8MobileNetV2(); + +} // namespace models diff --git a/bench/models/qs8-mobilenet-v2.cc b/bench/models/qs8-mobilenet-v2.cc new file mode 100644 index 00000000000..34f03c8bd59 --- /dev/null +++ b/bench/models/qs8-mobilenet-v2.cc @@ -0,0 +1,3542 @@ +// Copyright 2020 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +// +// Auto-generated file. Do not edit! + +#include "xnnpack.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xnnpack.h" + +// align a size up to XNN_EXTRA_BYTES +#define XNN_PAD_EXTRA_BYTES(s, t) (((s) + XNN_EXTRA_BYTES / sizeof(t) - 1) & ~(XNN_EXTRA_BYTES / sizeof(t) - 1)) + +namespace models { + +xnn_subgraph_t QS8MobileNetV2() { + xnn_status status; + xnn_subgraph_t subgraph = nullptr; + status = xnn_create_subgraph(/*num_external_values=*/2, 0, &subgraph); + if (status != xnn_status_success) { + std::cerr << "failed to create subgrpah" << std::endl; + return nullptr; + } + + uint32_t v0 = XNN_INVALID_VALUE_ID; + std::array v0_dims = {{1, 224, 224, 3}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v0_dims.size(), v0_dims.data(), + /*data=*/nullptr, + 1, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v0" << std::endl; + return nullptr; + } + + uint32_t v1 = XNN_INVALID_VALUE_ID; + std::array v1_dims = {{1, 112, 112, 32}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v1_dims.size(), v1_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v1); + if (status != 
xnn_status_success) { + std::cerr << "failed to create tensor v1" << std::endl; + return nullptr; + } + + uint32_t v2 = XNN_INVALID_VALUE_ID; + std::array v2_dims = {{1, 112, 112, 32}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v2_dims.size(), v2_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v2); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v2" << std::endl; + return nullptr; + } + + uint32_t v3 = XNN_INVALID_VALUE_ID; + std::array v3_dims = {{1, 112, 112, 16}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v3_dims.size(), v3_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v3); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v3" << std::endl; + return nullptr; + } + + uint32_t v4 = XNN_INVALID_VALUE_ID; + std::array v4_dims = {{1, 112, 112, 96}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v4_dims.size(), v4_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v4); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v4" << std::endl; + return nullptr; + } + + uint32_t v5 = XNN_INVALID_VALUE_ID; + std::array v5_dims = {{1, 56, 56, 96}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v5_dims.size(), v5_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v5); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v5" << std::endl; + return nullptr; + } + + uint32_t v6 = XNN_INVALID_VALUE_ID; + std::array v6_dims = {{1, 56, 56, 24}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v6_dims.size(), v6_dims.data(), + /*data=*/nullptr, + 
XNN_INVALID_VALUE_ID, /*flags=*/0, &v6); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v6" << std::endl; + return nullptr; + } + + uint32_t v7 = XNN_INVALID_VALUE_ID; + std::array v7_dims = {{1, 56, 56, 144}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v7_dims.size(), v7_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v7); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v7" << std::endl; + return nullptr; + } + + uint32_t v8 = XNN_INVALID_VALUE_ID; + std::array v8_dims = {{1, 56, 56, 144}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v8_dims.size(), v8_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v8); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v8" << std::endl; + return nullptr; + } + + uint32_t v9 = XNN_INVALID_VALUE_ID; + std::array v9_dims = {{1, 56, 56, 24}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v9_dims.size(), v9_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v9); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v9" << std::endl; + return nullptr; + } + + uint32_t v10 = XNN_INVALID_VALUE_ID; + std::array v10_dims = {{1, 56, 56, 24}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v10_dims.size(), v10_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v10); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v10" << std::endl; + return nullptr; + } + + uint32_t v11 = XNN_INVALID_VALUE_ID; + std::array v11_dims = {{1, 56, 56, 144}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, 
+ v11_dims.size(), v11_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v11); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v11" << std::endl; + return nullptr; + } + + uint32_t v12 = XNN_INVALID_VALUE_ID; + std::array v12_dims = {{1, 28, 28, 144}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v12_dims.size(), v12_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v12); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v12" << std::endl; + return nullptr; + } + + uint32_t v13 = XNN_INVALID_VALUE_ID; + std::array v13_dims = {{1, 28, 28, 32}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v13_dims.size(), v13_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v13); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v13" << std::endl; + return nullptr; + } + + uint32_t v14 = XNN_INVALID_VALUE_ID; + std::array v14_dims = {{1, 28, 28, 192}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v14_dims.size(), v14_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v14); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v14" << std::endl; + return nullptr; + } + + uint32_t v15 = XNN_INVALID_VALUE_ID; + std::array v15_dims = {{1, 28, 28, 192}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v15_dims.size(), v15_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v15); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v15" << std::endl; + return nullptr; + } + + uint32_t v16 = XNN_INVALID_VALUE_ID; + std::array v16_dims = {{1, 28, 28, 32}}; + status = 
xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v16_dims.size(), v16_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v16); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v16" << std::endl; + return nullptr; + } + + uint32_t v17 = XNN_INVALID_VALUE_ID; + std::array v17_dims = {{1, 28, 28, 32}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v17_dims.size(), v17_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v17); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v17" << std::endl; + return nullptr; + } + + uint32_t v18 = XNN_INVALID_VALUE_ID; + std::array v18_dims = {{1, 28, 28, 192}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v18_dims.size(), v18_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v18); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v18" << std::endl; + return nullptr; + } + + uint32_t v19 = XNN_INVALID_VALUE_ID; + std::array v19_dims = {{1, 28, 28, 192}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v19_dims.size(), v19_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v19); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v19" << std::endl; + return nullptr; + } + + uint32_t v20 = XNN_INVALID_VALUE_ID; + std::array v20_dims = {{1, 28, 28, 32}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v20_dims.size(), v20_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v20); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v20" << std::endl; + return nullptr; + } + + uint32_t v21 
= XNN_INVALID_VALUE_ID; + std::array v21_dims = {{1, 28, 28, 32}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v21_dims.size(), v21_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v21); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v21" << std::endl; + return nullptr; + } + + uint32_t v22 = XNN_INVALID_VALUE_ID; + std::array v22_dims = {{1, 28, 28, 192}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v22_dims.size(), v22_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v22); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v22" << std::endl; + return nullptr; + } + + uint32_t v23 = XNN_INVALID_VALUE_ID; + std::array v23_dims = {{1, 14, 14, 192}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v23_dims.size(), v23_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v23); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v23" << std::endl; + return nullptr; + } + + uint32_t v24 = XNN_INVALID_VALUE_ID; + std::array v24_dims = {{1, 14, 14, 64}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v24_dims.size(), v24_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v24); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v24" << std::endl; + return nullptr; + } + + uint32_t v25 = XNN_INVALID_VALUE_ID; + std::array v25_dims = {{1, 14, 14, 384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v25_dims.size(), v25_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v25); + if (status != xnn_status_success) { + std::cerr << 
"failed to create tensor v25" << std::endl; + return nullptr; + } + + uint32_t v26 = XNN_INVALID_VALUE_ID; + std::array v26_dims = {{1, 14, 14, 384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v26_dims.size(), v26_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v26); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v26" << std::endl; + return nullptr; + } + + uint32_t v27 = XNN_INVALID_VALUE_ID; + std::array v27_dims = {{1, 14, 14, 64}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v27_dims.size(), v27_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v27); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v27" << std::endl; + return nullptr; + } + + uint32_t v28 = XNN_INVALID_VALUE_ID; + std::array v28_dims = {{1, 14, 14, 64}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v28_dims.size(), v28_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v28); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v28" << std::endl; + return nullptr; + } + + uint32_t v29 = XNN_INVALID_VALUE_ID; + std::array v29_dims = {{1, 14, 14, 384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v29_dims.size(), v29_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v29); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v29" << std::endl; + return nullptr; + } + + uint32_t v30 = XNN_INVALID_VALUE_ID; + std::array v30_dims = {{1, 14, 14, 384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v30_dims.size(), v30_dims.data(), + /*data=*/nullptr, + 
XNN_INVALID_VALUE_ID, /*flags=*/0, &v30); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v30" << std::endl; + return nullptr; + } + + uint32_t v31 = XNN_INVALID_VALUE_ID; + std::array v31_dims = {{1, 14, 14, 64}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v31_dims.size(), v31_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v31); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v31" << std::endl; + return nullptr; + } + + uint32_t v32 = XNN_INVALID_VALUE_ID; + std::array v32_dims = {{1, 14, 14, 64}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v32_dims.size(), v32_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v32); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v32" << std::endl; + return nullptr; + } + + uint32_t v33 = XNN_INVALID_VALUE_ID; + std::array v33_dims = {{1, 14, 14, 384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v33_dims.size(), v33_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v33); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v33" << std::endl; + return nullptr; + } + + uint32_t v34 = XNN_INVALID_VALUE_ID; + std::array v34_dims = {{1, 14, 14, 384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v34_dims.size(), v34_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v34); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v34" << std::endl; + return nullptr; + } + + uint32_t v35 = XNN_INVALID_VALUE_ID; + std::array v35_dims = {{1, 14, 14, 64}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, 
/*zero_point=*/0, /*scale=*/1.0f, + v35_dims.size(), v35_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v35); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v35" << std::endl; + return nullptr; + } + + uint32_t v36 = XNN_INVALID_VALUE_ID; + std::array v36_dims = {{1, 14, 14, 64}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v36_dims.size(), v36_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v36); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v36" << std::endl; + return nullptr; + } + + uint32_t v37 = XNN_INVALID_VALUE_ID; + std::array v37_dims = {{1, 14, 14, 384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v37_dims.size(), v37_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v37); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v37" << std::endl; + return nullptr; + } + + uint32_t v38 = XNN_INVALID_VALUE_ID; + std::array v38_dims = {{1, 14, 14, 384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v38_dims.size(), v38_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v38); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v38" << std::endl; + return nullptr; + } + + uint32_t v39 = XNN_INVALID_VALUE_ID; + std::array v39_dims = {{1, 14, 14, 96}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v39_dims.size(), v39_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v39); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v39" << std::endl; + return nullptr; + } + + uint32_t v40 = XNN_INVALID_VALUE_ID; + std::array v40_dims = {{1, 14, 14, 576}}; 
+ status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v40_dims.size(), v40_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v40); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v40" << std::endl; + return nullptr; + } + + uint32_t v41 = XNN_INVALID_VALUE_ID; + std::array v41_dims = {{1, 14, 14, 576}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v41_dims.size(), v41_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v41); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v41" << std::endl; + return nullptr; + } + + uint32_t v42 = XNN_INVALID_VALUE_ID; + std::array v42_dims = {{1, 14, 14, 96}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v42_dims.size(), v42_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v42); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v42" << std::endl; + return nullptr; + } + + uint32_t v43 = XNN_INVALID_VALUE_ID; + std::array v43_dims = {{1, 14, 14, 96}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v43_dims.size(), v43_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v43); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v43" << std::endl; + return nullptr; + } + + uint32_t v44 = XNN_INVALID_VALUE_ID; + std::array v44_dims = {{1, 14, 14, 576}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v44_dims.size(), v44_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v44); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v44" << std::endl; + return nullptr; + } + + 
uint32_t v45 = XNN_INVALID_VALUE_ID; + std::array v45_dims = {{1, 14, 14, 576}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v45_dims.size(), v45_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v45); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v45" << std::endl; + return nullptr; + } + + uint32_t v46 = XNN_INVALID_VALUE_ID; + std::array v46_dims = {{1, 14, 14, 96}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v46_dims.size(), v46_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v46); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v46" << std::endl; + return nullptr; + } + + uint32_t v47 = XNN_INVALID_VALUE_ID; + std::array v47_dims = {{1, 14, 14, 96}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v47_dims.size(), v47_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v47); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v47" << std::endl; + return nullptr; + } + + uint32_t v48 = XNN_INVALID_VALUE_ID; + std::array v48_dims = {{1, 14, 14, 576}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v48_dims.size(), v48_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v48); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v48" << std::endl; + return nullptr; + } + + uint32_t v49 = XNN_INVALID_VALUE_ID; + std::array v49_dims = {{1, 7, 7, 576}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v49_dims.size(), v49_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v49); + if (status != xnn_status_success) { + 
std::cerr << "failed to create tensor v49" << std::endl; + return nullptr; + } + + uint32_t v50 = XNN_INVALID_VALUE_ID; + std::array v50_dims = {{1, 7, 7, 160}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v50_dims.size(), v50_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v50); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v50" << std::endl; + return nullptr; + } + + uint32_t v51 = XNN_INVALID_VALUE_ID; + std::array v51_dims = {{1, 7, 7, 960}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v51_dims.size(), v51_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v51); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v51" << std::endl; + return nullptr; + } + + uint32_t v52 = XNN_INVALID_VALUE_ID; + std::array v52_dims = {{1, 7, 7, 960}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v52_dims.size(), v52_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v52); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v52" << std::endl; + return nullptr; + } + + uint32_t v53 = XNN_INVALID_VALUE_ID; + std::array v53_dims = {{1, 7, 7, 160}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v53_dims.size(), v53_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v53); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v53" << std::endl; + return nullptr; + } + + uint32_t v54 = XNN_INVALID_VALUE_ID; + std::array v54_dims = {{1, 7, 7, 160}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v54_dims.size(), v54_dims.data(), + /*data=*/nullptr, + 
XNN_INVALID_VALUE_ID, /*flags=*/0, &v54); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v54" << std::endl; + return nullptr; + } + + uint32_t v55 = XNN_INVALID_VALUE_ID; + std::array v55_dims = {{1, 7, 7, 960}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v55_dims.size(), v55_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v55); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v55" << std::endl; + return nullptr; + } + + uint32_t v56 = XNN_INVALID_VALUE_ID; + std::array v56_dims = {{1, 7, 7, 960}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v56_dims.size(), v56_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v56); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v56" << std::endl; + return nullptr; + } + + uint32_t v57 = XNN_INVALID_VALUE_ID; + std::array v57_dims = {{1, 7, 7, 160}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v57_dims.size(), v57_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v57); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v57" << std::endl; + return nullptr; + } + + uint32_t v58 = XNN_INVALID_VALUE_ID; + std::array v58_dims = {{1, 7, 7, 160}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v58_dims.size(), v58_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v58); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v58" << std::endl; + return nullptr; + } + + uint32_t v59 = XNN_INVALID_VALUE_ID; + std::array v59_dims = {{1, 7, 7, 960}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, 
/*scale=*/1.0f, + v59_dims.size(), v59_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v59); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v59" << std::endl; + return nullptr; + } + + uint32_t v60 = XNN_INVALID_VALUE_ID; + std::array v60_dims = {{1, 7, 7, 960}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v60_dims.size(), v60_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v60); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v60" << std::endl; + return nullptr; + } + + uint32_t v61 = XNN_INVALID_VALUE_ID; + std::array v61_dims = {{1, 7, 7, 320}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v61_dims.size(), v61_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v61); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v61" << std::endl; + return nullptr; + } + + uint32_t v62 = XNN_INVALID_VALUE_ID; + std::array v62_dims = {{1, 7, 7, 1280}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v62_dims.size(), v62_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v62); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v62" << std::endl; + return nullptr; + } + + uint32_t v63 = XNN_INVALID_VALUE_ID; + std::array v63_dims = {{1, 1, 1, 1280}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v63_dims.size(), v63_dims.data(), + /*data=*/nullptr, +#if 0 + XNN_INVALID_VALUE_ID, /*flags=*/0, &v63); +#else + 0, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v63); +#endif + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v63" << std::endl; + return nullptr; + } +#if 0 + uint32_t v64 = 
XNN_INVALID_VALUE_ID; + std::array v64_dims = {{1, 1, 1, 1008}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v64_dims.size(), v64_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v64); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v64" << std::endl; + return nullptr; + } + + uint32_t v65 = XNN_INVALID_VALUE_ID; + std::array v65_dims = {{1, 1008}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v65_dims.size(), v65_dims.data(), + /*data=*/nullptr, + XNN_INVALID_VALUE_ID, /*flags=*/0, &v65); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v65" << std::endl; + return nullptr; + } + + uint32_t v66 = XNN_INVALID_VALUE_ID; + std::array v66_dims = {{1, 1008}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + v66_dims.size(), v66_dims.data(), + /*data=*/nullptr, + 0, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v66); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor v66" << std::endl; + return nullptr; + } +#endif + + alignas(16) static std::array w67_data; + uint32_t w67 = XNN_INVALID_VALUE_ID; + std::array w67_dims = {{32, 3, 3, 3}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w67_dims.size(), w67_dims.data(), + /*data=*/w67_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w67); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w67" << std::endl; + return nullptr; + } + + alignas(16) static std::array w68_data; + uint32_t w68 = XNN_INVALID_VALUE_ID; + std::array w68_dims = {{32}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w68_dims.size(), w68_dims.data(), + /*data=*/w68_data.data(), + 
XNN_INVALID_VALUE_ID, /*flags=*/0, &w68); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w68" << std::endl; + return nullptr; + } + + alignas(16) static std::array w69_data; + uint32_t w69 = XNN_INVALID_VALUE_ID; + std::array w69_dims = {{1, 3, 3, 32}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w69_dims.size(), w69_dims.data(), + /*data=*/w69_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w69); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w69" << std::endl; + return nullptr; + } + + alignas(16) static std::array w70_data; + uint32_t w70 = XNN_INVALID_VALUE_ID; + std::array w70_dims = {{32}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w70_dims.size(), w70_dims.data(), + /*data=*/w70_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w70); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w70" << std::endl; + return nullptr; + } + + alignas(16) static std::array w71_data; + uint32_t w71 = XNN_INVALID_VALUE_ID; + std::array w71_dims = {{16, 1, 1, 32}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w71_dims.size(), w71_dims.data(), + /*data=*/w71_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w71); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w71" << std::endl; + return nullptr; + } + + alignas(16) static std::array w72_data; + uint32_t w72 = XNN_INVALID_VALUE_ID; + std::array w72_dims = {{16}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w72_dims.size(), w72_dims.data(), + /*data=*/w72_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w72); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w72" << std::endl; + return nullptr; + } + 
+ alignas(16) static std::array w73_data; + uint32_t w73 = XNN_INVALID_VALUE_ID; + std::array w73_dims = {{96, 1, 1, 16}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w73_dims.size(), w73_dims.data(), + /*data=*/w73_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w73); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w73" << std::endl; + return nullptr; + } + + alignas(16) static std::array w74_data; + uint32_t w74 = XNN_INVALID_VALUE_ID; + std::array w74_dims = {{96}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w74_dims.size(), w74_dims.data(), + /*data=*/w74_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w74); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w74" << std::endl; + return nullptr; + } + + alignas(16) static std::array w75_data; + uint32_t w75 = XNN_INVALID_VALUE_ID; + std::array w75_dims = {{1, 3, 3, 96}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w75_dims.size(), w75_dims.data(), + /*data=*/w75_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w75); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w75" << std::endl; + return nullptr; + } + + alignas(16) static std::array w76_data; + uint32_t w76 = XNN_INVALID_VALUE_ID; + std::array w76_dims = {{96}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w76_dims.size(), w76_dims.data(), + /*data=*/w76_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w76); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w76" << std::endl; + return nullptr; + } + + alignas(16) static std::array w77_data; + uint32_t w77 = XNN_INVALID_VALUE_ID; + std::array w77_dims = {{24, 1, 1, 96}}; + status = 
xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w77_dims.size(), w77_dims.data(), + /*data=*/w77_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w77); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w77" << std::endl; + return nullptr; + } + + alignas(16) static std::array w78_data; + uint32_t w78 = XNN_INVALID_VALUE_ID; + std::array w78_dims = {{24}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w78_dims.size(), w78_dims.data(), + /*data=*/w78_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w78); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w78" << std::endl; + return nullptr; + } + + alignas(16) static std::array w79_data; + uint32_t w79 = XNN_INVALID_VALUE_ID; + std::array w79_dims = {{144, 1, 1, 24}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w79_dims.size(), w79_dims.data(), + /*data=*/w79_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w79); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w79" << std::endl; + return nullptr; + } + + alignas(16) static std::array w80_data; + uint32_t w80 = XNN_INVALID_VALUE_ID; + std::array w80_dims = {{144}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w80_dims.size(), w80_dims.data(), + /*data=*/w80_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w80); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w80" << std::endl; + return nullptr; + } + + alignas(16) static std::array w81_data; + uint32_t w81 = XNN_INVALID_VALUE_ID; + std::array w81_dims = {{1, 3, 3, 144}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w81_dims.size(), w81_dims.data(), + 
/*data=*/w81_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w81); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w81" << std::endl; + return nullptr; + } + + alignas(16) static std::array w82_data; + uint32_t w82 = XNN_INVALID_VALUE_ID; + std::array w82_dims = {{144}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w82_dims.size(), w82_dims.data(), + /*data=*/w82_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w82); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w82" << std::endl; + return nullptr; + } + + alignas(16) static std::array w83_data; + uint32_t w83 = XNN_INVALID_VALUE_ID; + std::array w83_dims = {{24, 1, 1, 144}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w83_dims.size(), w83_dims.data(), + /*data=*/w83_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w83); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w83" << std::endl; + return nullptr; + } + + alignas(16) static std::array w84_data; + uint32_t w84 = XNN_INVALID_VALUE_ID; + std::array w84_dims = {{24}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w84_dims.size(), w84_dims.data(), + /*data=*/w84_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w84); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w84" << std::endl; + return nullptr; + } + + alignas(16) static std::array w85_data; + uint32_t w85 = XNN_INVALID_VALUE_ID; + std::array w85_dims = {{144, 1, 1, 24}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w85_dims.size(), w85_dims.data(), + /*data=*/w85_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w85); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w85" << 
std::endl; + return nullptr; + } + + alignas(16) static std::array w86_data; + uint32_t w86 = XNN_INVALID_VALUE_ID; + std::array w86_dims = {{144}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w86_dims.size(), w86_dims.data(), + /*data=*/w86_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w86); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w86" << std::endl; + return nullptr; + } + + alignas(16) static std::array w87_data; + uint32_t w87 = XNN_INVALID_VALUE_ID; + std::array w87_dims = {{1, 3, 3, 144}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w87_dims.size(), w87_dims.data(), + /*data=*/w87_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w87); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w87" << std::endl; + return nullptr; + } + + alignas(16) static std::array w88_data; + uint32_t w88 = XNN_INVALID_VALUE_ID; + std::array w88_dims = {{144}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w88_dims.size(), w88_dims.data(), + /*data=*/w88_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w88); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w88" << std::endl; + return nullptr; + } + + alignas(16) static std::array w89_data; + uint32_t w89 = XNN_INVALID_VALUE_ID; + std::array w89_dims = {{32, 1, 1, 144}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w89_dims.size(), w89_dims.data(), + /*data=*/w89_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w89); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w89" << std::endl; + return nullptr; + } + + alignas(16) static std::array w90_data; + uint32_t w90 = XNN_INVALID_VALUE_ID; + std::array w90_dims = {{32}}; + status = 
xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w90_dims.size(), w90_dims.data(), + /*data=*/w90_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w90); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w90" << std::endl; + return nullptr; + } + + alignas(16) static std::array w91_data; + uint32_t w91 = XNN_INVALID_VALUE_ID; + std::array w91_dims = {{192, 1, 1, 32}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w91_dims.size(), w91_dims.data(), + /*data=*/w91_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w91); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w91" << std::endl; + return nullptr; + } + + alignas(16) static std::array w92_data; + uint32_t w92 = XNN_INVALID_VALUE_ID; + std::array w92_dims = {{192}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w92_dims.size(), w92_dims.data(), + /*data=*/w92_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w92); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w92" << std::endl; + return nullptr; + } + + alignas(16) static std::array w93_data; + uint32_t w93 = XNN_INVALID_VALUE_ID; + std::array w93_dims = {{1, 3, 3, 192}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w93_dims.size(), w93_dims.data(), + /*data=*/w93_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w93); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w93" << std::endl; + return nullptr; + } + + alignas(16) static std::array w94_data; + uint32_t w94 = XNN_INVALID_VALUE_ID; + std::array w94_dims = {{192}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w94_dims.size(), w94_dims.data(), + 
/*data=*/w94_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w94); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w94" << std::endl; + return nullptr; + } + + alignas(16) static std::array w95_data; + uint32_t w95 = XNN_INVALID_VALUE_ID; + std::array w95_dims = {{32, 1, 1, 192}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w95_dims.size(), w95_dims.data(), + /*data=*/w95_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w95); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w95" << std::endl; + return nullptr; + } + + alignas(16) static std::array w96_data; + uint32_t w96 = XNN_INVALID_VALUE_ID; + std::array w96_dims = {{32}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w96_dims.size(), w96_dims.data(), + /*data=*/w96_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w96); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w96" << std::endl; + return nullptr; + } + + alignas(16) static std::array w97_data; + uint32_t w97 = XNN_INVALID_VALUE_ID; + std::array w97_dims = {{192, 1, 1, 32}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w97_dims.size(), w97_dims.data(), + /*data=*/w97_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w97); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w97" << std::endl; + return nullptr; + } + + alignas(16) static std::array w98_data; + uint32_t w98 = XNN_INVALID_VALUE_ID; + std::array w98_dims = {{192}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w98_dims.size(), w98_dims.data(), + /*data=*/w98_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w98); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w98" << 
std::endl; + return nullptr; + } + + alignas(16) static std::array w99_data; + uint32_t w99 = XNN_INVALID_VALUE_ID; + std::array w99_dims = {{1, 3, 3, 192}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w99_dims.size(), w99_dims.data(), + /*data=*/w99_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w99); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w99" << std::endl; + return nullptr; + } + + alignas(16) static std::array w100_data; + uint32_t w100 = XNN_INVALID_VALUE_ID; + std::array w100_dims = {{192}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w100_dims.size(), w100_dims.data(), + /*data=*/w100_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w100); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w100" << std::endl; + return nullptr; + } + + alignas(16) static std::array w101_data; + uint32_t w101 = XNN_INVALID_VALUE_ID; + std::array w101_dims = {{32, 1, 1, 192}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w101_dims.size(), w101_dims.data(), + /*data=*/w101_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w101); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w101" << std::endl; + return nullptr; + } + + alignas(16) static std::array w102_data; + uint32_t w102 = XNN_INVALID_VALUE_ID; + std::array w102_dims = {{32}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w102_dims.size(), w102_dims.data(), + /*data=*/w102_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w102); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w102" << std::endl; + return nullptr; + } + + alignas(16) static std::array w103_data; + uint32_t w103 = XNN_INVALID_VALUE_ID; + std::array 
w103_dims = {{192, 1, 1, 32}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w103_dims.size(), w103_dims.data(), + /*data=*/w103_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w103); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w103" << std::endl; + return nullptr; + } + + alignas(16) static std::array w104_data; + uint32_t w104 = XNN_INVALID_VALUE_ID; + std::array w104_dims = {{192}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w104_dims.size(), w104_dims.data(), + /*data=*/w104_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w104); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w104" << std::endl; + return nullptr; + } + + alignas(16) static std::array w105_data; + uint32_t w105 = XNN_INVALID_VALUE_ID; + std::array w105_dims = {{1, 3, 3, 192}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w105_dims.size(), w105_dims.data(), + /*data=*/w105_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w105); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w105" << std::endl; + return nullptr; + } + + alignas(16) static std::array w106_data; + uint32_t w106 = XNN_INVALID_VALUE_ID; + std::array w106_dims = {{192}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w106_dims.size(), w106_dims.data(), + /*data=*/w106_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w106); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w106" << std::endl; + return nullptr; + } + + alignas(16) static std::array w107_data; + uint32_t w107 = XNN_INVALID_VALUE_ID; + std::array w107_dims = {{64, 1, 1, 192}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, 
/*scale=*/1.0f, + w107_dims.size(), w107_dims.data(), + /*data=*/w107_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w107); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w107" << std::endl; + return nullptr; + } + + alignas(16) static std::array w108_data; + uint32_t w108 = XNN_INVALID_VALUE_ID; + std::array w108_dims = {{64}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w108_dims.size(), w108_dims.data(), + /*data=*/w108_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w108); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w108" << std::endl; + return nullptr; + } + + alignas(16) static std::array w109_data; + uint32_t w109 = XNN_INVALID_VALUE_ID; + std::array w109_dims = {{384, 1, 1, 64}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w109_dims.size(), w109_dims.data(), + /*data=*/w109_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w109); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w109" << std::endl; + return nullptr; + } + + alignas(16) static std::array w110_data; + uint32_t w110 = XNN_INVALID_VALUE_ID; + std::array w110_dims = {{384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w110_dims.size(), w110_dims.data(), + /*data=*/w110_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w110); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w110" << std::endl; + return nullptr; + } + + alignas(16) static std::array w111_data; + uint32_t w111 = XNN_INVALID_VALUE_ID; + std::array w111_dims = {{1, 3, 3, 384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w111_dims.size(), w111_dims.data(), + /*data=*/w111_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w111); 
+ if (status != xnn_status_success) { + std::cerr << "failed to create tensor w111" << std::endl; + return nullptr; + } + + alignas(16) static std::array w112_data; + uint32_t w112 = XNN_INVALID_VALUE_ID; + std::array w112_dims = {{384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w112_dims.size(), w112_dims.data(), + /*data=*/w112_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w112); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w112" << std::endl; + return nullptr; + } + + alignas(16) static std::array w113_data; + uint32_t w113 = XNN_INVALID_VALUE_ID; + std::array w113_dims = {{64, 1, 1, 384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w113_dims.size(), w113_dims.data(), + /*data=*/w113_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w113); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w113" << std::endl; + return nullptr; + } + + alignas(16) static std::array w114_data; + uint32_t w114 = XNN_INVALID_VALUE_ID; + std::array w114_dims = {{64}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w114_dims.size(), w114_dims.data(), + /*data=*/w114_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w114); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w114" << std::endl; + return nullptr; + } + + alignas(16) static std::array w115_data; + uint32_t w115 = XNN_INVALID_VALUE_ID; + std::array w115_dims = {{384, 1, 1, 64}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w115_dims.size(), w115_dims.data(), + /*data=*/w115_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w115); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w115" << std::endl; + return nullptr; + } + + 
alignas(16) static std::array w116_data; + uint32_t w116 = XNN_INVALID_VALUE_ID; + std::array w116_dims = {{384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w116_dims.size(), w116_dims.data(), + /*data=*/w116_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w116); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w116" << std::endl; + return nullptr; + } + + alignas(16) static std::array w117_data; + uint32_t w117 = XNN_INVALID_VALUE_ID; + std::array w117_dims = {{1, 3, 3, 384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w117_dims.size(), w117_dims.data(), + /*data=*/w117_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w117); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w117" << std::endl; + return nullptr; + } + + alignas(16) static std::array w118_data; + uint32_t w118 = XNN_INVALID_VALUE_ID; + std::array w118_dims = {{384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w118_dims.size(), w118_dims.data(), + /*data=*/w118_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w118); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w118" << std::endl; + return nullptr; + } + + alignas(16) static std::array w119_data; + uint32_t w119 = XNN_INVALID_VALUE_ID; + std::array w119_dims = {{64, 1, 1, 384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w119_dims.size(), w119_dims.data(), + /*data=*/w119_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w119); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w119" << std::endl; + return nullptr; + } + + alignas(16) static std::array w120_data; + uint32_t w120 = XNN_INVALID_VALUE_ID; + std::array w120_dims = {{64}}; + status = 
xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w120_dims.size(), w120_dims.data(), + /*data=*/w120_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w120); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w120" << std::endl; + return nullptr; + } + + alignas(16) static std::array w121_data; + uint32_t w121 = XNN_INVALID_VALUE_ID; + std::array w121_dims = {{384, 1, 1, 64}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w121_dims.size(), w121_dims.data(), + /*data=*/w121_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w121); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w121" << std::endl; + return nullptr; + } + + alignas(16) static std::array w122_data; + uint32_t w122 = XNN_INVALID_VALUE_ID; + std::array w122_dims = {{384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w122_dims.size(), w122_dims.data(), + /*data=*/w122_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w122); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w122" << std::endl; + return nullptr; + } + + alignas(16) static std::array w123_data; + uint32_t w123 = XNN_INVALID_VALUE_ID; + std::array w123_dims = {{1, 3, 3, 384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w123_dims.size(), w123_dims.data(), + /*data=*/w123_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w123); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w123" << std::endl; + return nullptr; + } + + alignas(16) static std::array w124_data; + uint32_t w124 = XNN_INVALID_VALUE_ID; + std::array w124_dims = {{384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w124_dims.size(), 
w124_dims.data(), + /*data=*/w124_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w124); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w124" << std::endl; + return nullptr; + } + + alignas(16) static std::array w125_data; + uint32_t w125 = XNN_INVALID_VALUE_ID; + std::array w125_dims = {{64, 1, 1, 384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w125_dims.size(), w125_dims.data(), + /*data=*/w125_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w125); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w125" << std::endl; + return nullptr; + } + + alignas(16) static std::array w126_data; + uint32_t w126 = XNN_INVALID_VALUE_ID; + std::array w126_dims = {{64}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w126_dims.size(), w126_dims.data(), + /*data=*/w126_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w126); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w126" << std::endl; + return nullptr; + } + + alignas(16) static std::array w127_data; + uint32_t w127 = XNN_INVALID_VALUE_ID; + std::array w127_dims = {{384, 1, 1, 64}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w127_dims.size(), w127_dims.data(), + /*data=*/w127_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w127); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w127" << std::endl; + return nullptr; + } + + alignas(16) static std::array w128_data; + uint32_t w128 = XNN_INVALID_VALUE_ID; + std::array w128_dims = {{384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w128_dims.size(), w128_dims.data(), + /*data=*/w128_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w128); + if (status != 
xnn_status_success) { + std::cerr << "failed to create tensor w128" << std::endl; + return nullptr; + } + + alignas(16) static std::array w129_data; + uint32_t w129 = XNN_INVALID_VALUE_ID; + std::array w129_dims = {{1, 3, 3, 384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w129_dims.size(), w129_dims.data(), + /*data=*/w129_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w129); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w129" << std::endl; + return nullptr; + } + + alignas(16) static std::array w130_data; + uint32_t w130 = XNN_INVALID_VALUE_ID; + std::array w130_dims = {{384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w130_dims.size(), w130_dims.data(), + /*data=*/w130_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w130); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w130" << std::endl; + return nullptr; + } + + alignas(16) static std::array w131_data; + uint32_t w131 = XNN_INVALID_VALUE_ID; + std::array w131_dims = {{96, 1, 1, 384}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w131_dims.size(), w131_dims.data(), + /*data=*/w131_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w131); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w131" << std::endl; + return nullptr; + } + + alignas(16) static std::array w132_data; + uint32_t w132 = XNN_INVALID_VALUE_ID; + std::array w132_dims = {{96}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w132_dims.size(), w132_dims.data(), + /*data=*/w132_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w132); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w132" << std::endl; + return nullptr; + } + + alignas(16) static 
std::array w133_data; + uint32_t w133 = XNN_INVALID_VALUE_ID; + std::array w133_dims = {{576, 1, 1, 96}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w133_dims.size(), w133_dims.data(), + /*data=*/w133_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w133); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w133" << std::endl; + return nullptr; + } + + alignas(16) static std::array w134_data; + uint32_t w134 = XNN_INVALID_VALUE_ID; + std::array w134_dims = {{576}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w134_dims.size(), w134_dims.data(), + /*data=*/w134_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w134); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w134" << std::endl; + return nullptr; + } + + alignas(16) static std::array w135_data; + uint32_t w135 = XNN_INVALID_VALUE_ID; + std::array w135_dims = {{1, 3, 3, 576}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w135_dims.size(), w135_dims.data(), + /*data=*/w135_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w135); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w135" << std::endl; + return nullptr; + } + + alignas(16) static std::array w136_data; + uint32_t w136 = XNN_INVALID_VALUE_ID; + std::array w136_dims = {{576}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w136_dims.size(), w136_dims.data(), + /*data=*/w136_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w136); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w136" << std::endl; + return nullptr; + } + + alignas(16) static std::array w137_data; + uint32_t w137 = XNN_INVALID_VALUE_ID; + std::array w137_dims = {{96, 1, 1, 576}}; + status = 
xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w137_dims.size(), w137_dims.data(), + /*data=*/w137_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w137); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w137" << std::endl; + return nullptr; + } + + alignas(16) static std::array w138_data; + uint32_t w138 = XNN_INVALID_VALUE_ID; + std::array w138_dims = {{96}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w138_dims.size(), w138_dims.data(), + /*data=*/w138_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w138); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w138" << std::endl; + return nullptr; + } + + alignas(16) static std::array w139_data; + uint32_t w139 = XNN_INVALID_VALUE_ID; + std::array w139_dims = {{576, 1, 1, 96}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w139_dims.size(), w139_dims.data(), + /*data=*/w139_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w139); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w139" << std::endl; + return nullptr; + } + + alignas(16) static std::array w140_data; + uint32_t w140 = XNN_INVALID_VALUE_ID; + std::array w140_dims = {{576}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w140_dims.size(), w140_dims.data(), + /*data=*/w140_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w140); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w140" << std::endl; + return nullptr; + } + + alignas(16) static std::array w141_data; + uint32_t w141 = XNN_INVALID_VALUE_ID; + std::array w141_dims = {{1, 3, 3, 576}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w141_dims.size(), 
w141_dims.data(), + /*data=*/w141_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w141); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w141" << std::endl; + return nullptr; + } + + alignas(16) static std::array w142_data; + uint32_t w142 = XNN_INVALID_VALUE_ID; + std::array w142_dims = {{576}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w142_dims.size(), w142_dims.data(), + /*data=*/w142_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w142); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w142" << std::endl; + return nullptr; + } + + alignas(16) static std::array w143_data; + uint32_t w143 = XNN_INVALID_VALUE_ID; + std::array w143_dims = {{96, 1, 1, 576}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w143_dims.size(), w143_dims.data(), + /*data=*/w143_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w143); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w143" << std::endl; + return nullptr; + } + + alignas(16) static std::array w144_data; + uint32_t w144 = XNN_INVALID_VALUE_ID; + std::array w144_dims = {{96}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w144_dims.size(), w144_dims.data(), + /*data=*/w144_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w144); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w144" << std::endl; + return nullptr; + } + + alignas(16) static std::array w145_data; + uint32_t w145 = XNN_INVALID_VALUE_ID; + std::array w145_dims = {{576, 1, 1, 96}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w145_dims.size(), w145_dims.data(), + /*data=*/w145_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w145); + if (status != 
xnn_status_success) { + std::cerr << "failed to create tensor w145" << std::endl; + return nullptr; + } + + alignas(16) static std::array w146_data; + uint32_t w146 = XNN_INVALID_VALUE_ID; + std::array w146_dims = {{576}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w146_dims.size(), w146_dims.data(), + /*data=*/w146_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w146); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w146" << std::endl; + return nullptr; + } + + alignas(16) static std::array w147_data; + uint32_t w147 = XNN_INVALID_VALUE_ID; + std::array w147_dims = {{1, 3, 3, 576}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w147_dims.size(), w147_dims.data(), + /*data=*/w147_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w147); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w147" << std::endl; + return nullptr; + } + + alignas(16) static std::array w148_data; + uint32_t w148 = XNN_INVALID_VALUE_ID; + std::array w148_dims = {{576}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w148_dims.size(), w148_dims.data(), + /*data=*/w148_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w148); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w148" << std::endl; + return nullptr; + } + + alignas(16) static std::array w149_data; + uint32_t w149 = XNN_INVALID_VALUE_ID; + std::array w149_dims = {{160, 1, 1, 576}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w149_dims.size(), w149_dims.data(), + /*data=*/w149_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w149); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w149" << std::endl; + return nullptr; + } + + alignas(16) 
static std::array w150_data; + uint32_t w150 = XNN_INVALID_VALUE_ID; + std::array w150_dims = {{160}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w150_dims.size(), w150_dims.data(), + /*data=*/w150_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w150); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w150" << std::endl; + return nullptr; + } + + alignas(16) static std::array w151_data; + uint32_t w151 = XNN_INVALID_VALUE_ID; + std::array w151_dims = {{960, 1, 1, 160}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w151_dims.size(), w151_dims.data(), + /*data=*/w151_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w151); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w151" << std::endl; + return nullptr; + } + + alignas(16) static std::array w152_data; + uint32_t w152 = XNN_INVALID_VALUE_ID; + std::array w152_dims = {{960}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w152_dims.size(), w152_dims.data(), + /*data=*/w152_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w152); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w152" << std::endl; + return nullptr; + } + + alignas(16) static std::array w153_data; + uint32_t w153 = XNN_INVALID_VALUE_ID; + std::array w153_dims = {{1, 3, 3, 960}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w153_dims.size(), w153_dims.data(), + /*data=*/w153_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w153); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w153" << std::endl; + return nullptr; + } + + alignas(16) static std::array w154_data; + uint32_t w154 = XNN_INVALID_VALUE_ID; + std::array w154_dims = {{960}}; + status = 
xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w154_dims.size(), w154_dims.data(), + /*data=*/w154_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w154); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w154" << std::endl; + return nullptr; + } + + alignas(16) static std::array w155_data; + uint32_t w155 = XNN_INVALID_VALUE_ID; + std::array w155_dims = {{160, 1, 1, 960}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w155_dims.size(), w155_dims.data(), + /*data=*/w155_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w155); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w155" << std::endl; + return nullptr; + } + + alignas(16) static std::array w156_data; + uint32_t w156 = XNN_INVALID_VALUE_ID; + std::array w156_dims = {{160}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w156_dims.size(), w156_dims.data(), + /*data=*/w156_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w156); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w156" << std::endl; + return nullptr; + } + + alignas(16) static std::array w157_data; + uint32_t w157 = XNN_INVALID_VALUE_ID; + std::array w157_dims = {{960, 1, 1, 160}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w157_dims.size(), w157_dims.data(), + /*data=*/w157_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w157); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w157" << std::endl; + return nullptr; + } + + alignas(16) static std::array w158_data; + uint32_t w158 = XNN_INVALID_VALUE_ID; + std::array w158_dims = {{960}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w158_dims.size(), 
w158_dims.data(), + /*data=*/w158_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w158); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w158" << std::endl; + return nullptr; + } + + alignas(16) static std::array w159_data; + uint32_t w159 = XNN_INVALID_VALUE_ID; + std::array w159_dims = {{1, 3, 3, 960}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w159_dims.size(), w159_dims.data(), + /*data=*/w159_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w159); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w159" << std::endl; + return nullptr; + } + + alignas(16) static std::array w160_data; + uint32_t w160 = XNN_INVALID_VALUE_ID; + std::array w160_dims = {{960}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w160_dims.size(), w160_dims.data(), + /*data=*/w160_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w160); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w160" << std::endl; + return nullptr; + } + + alignas(16) static std::array w161_data; + uint32_t w161 = XNN_INVALID_VALUE_ID; + std::array w161_dims = {{160, 1, 1, 960}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w161_dims.size(), w161_dims.data(), + /*data=*/w161_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w161); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w161" << std::endl; + return nullptr; + } + + alignas(16) static std::array w162_data; + uint32_t w162 = XNN_INVALID_VALUE_ID; + std::array w162_dims = {{160}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w162_dims.size(), w162_dims.data(), + /*data=*/w162_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w162); + if (status != 
xnn_status_success) { + std::cerr << "failed to create tensor w162" << std::endl; + return nullptr; + } + + alignas(16) static std::array w163_data; + uint32_t w163 = XNN_INVALID_VALUE_ID; + std::array w163_dims = {{960, 1, 1, 160}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w163_dims.size(), w163_dims.data(), + /*data=*/w163_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w163); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w163" << std::endl; + return nullptr; + } + + alignas(16) static std::array w164_data; + uint32_t w164 = XNN_INVALID_VALUE_ID; + std::array w164_dims = {{960}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w164_dims.size(), w164_dims.data(), + /*data=*/w164_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w164); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w164" << std::endl; + return nullptr; + } + + alignas(16) static std::array w165_data; + uint32_t w165 = XNN_INVALID_VALUE_ID; + std::array w165_dims = {{1, 3, 3, 960}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w165_dims.size(), w165_dims.data(), + /*data=*/w165_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w165); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w165" << std::endl; + return nullptr; + } + + alignas(16) static std::array w166_data; + uint32_t w166 = XNN_INVALID_VALUE_ID; + std::array w166_dims = {{960}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w166_dims.size(), w166_dims.data(), + /*data=*/w166_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w166); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w166" << std::endl; + return nullptr; + } + + alignas(16) 
static std::array w167_data; + uint32_t w167 = XNN_INVALID_VALUE_ID; + std::array w167_dims = {{320, 1, 1, 960}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w167_dims.size(), w167_dims.data(), + /*data=*/w167_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w167); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w167" << std::endl; + return nullptr; + } + + alignas(16) static std::array w168_data; + uint32_t w168 = XNN_INVALID_VALUE_ID; + std::array w168_dims = {{320}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w168_dims.size(), w168_dims.data(), + /*data=*/w168_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w168); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w168" << std::endl; + return nullptr; + } + + alignas(16) static std::array w169_data; + uint32_t w169 = XNN_INVALID_VALUE_ID; + std::array w169_dims = {{1280, 1, 1, 320}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w169_dims.size(), w169_dims.data(), + /*data=*/w169_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w169); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w169" << std::endl; + return nullptr; + } + + alignas(16) static std::array w170_data; + uint32_t w170 = XNN_INVALID_VALUE_ID; + std::array w170_dims = {{1280}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w170_dims.size(), w170_dims.data(), + /*data=*/w170_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w170); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w170" << std::endl; + return nullptr; + } + + alignas(16) static std::array w171_data; + uint32_t w171 = XNN_INVALID_VALUE_ID; + std::array w171_dims = {{1008, 1, 1, 1280}}; + 
status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + w171_dims.size(), w171_dims.data(), + /*data=*/w171_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w171); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w171" << std::endl; + return nullptr; + } + + alignas(16) static std::array w172_data; + uint32_t w172 = XNN_INVALID_VALUE_ID; + std::array w172_dims = {{1008}}; + status = xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + w172_dims.size(), w172_dims.data(), + /*data=*/w172_data.data(), + XNN_INVALID_VALUE_ID, /*flags=*/0, &w172); + if (status != xnn_status_success) { + std::cerr << "failed to create tensor w172" << std::endl; + return nullptr; + } + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); + std::generate(w67_data.begin(), w67_data.end(), std::ref(f32rng)); + std::generate(w68_data.begin(), w68_data.end(), std::ref(f32rng)); + std::generate(w69_data.begin(), w69_data.end(), std::ref(f32rng)); + std::generate(w70_data.begin(), w70_data.end(), std::ref(f32rng)); + std::generate(w71_data.begin(), w71_data.end(), std::ref(f32rng)); + std::generate(w72_data.begin(), w72_data.end(), std::ref(f32rng)); + std::generate(w73_data.begin(), w73_data.end(), std::ref(f32rng)); + std::generate(w74_data.begin(), w74_data.end(), std::ref(f32rng)); + std::generate(w75_data.begin(), w75_data.end(), std::ref(f32rng)); + std::generate(w76_data.begin(), w76_data.end(), std::ref(f32rng)); + std::generate(w77_data.begin(), w77_data.end(), std::ref(f32rng)); + std::generate(w78_data.begin(), w78_data.end(), std::ref(f32rng)); + std::generate(w79_data.begin(), w79_data.end(), std::ref(f32rng)); + std::generate(w80_data.begin(), w80_data.end(), std::ref(f32rng)); + std::generate(w81_data.begin(), w81_data.end(), 
std::ref(f32rng)); + std::generate(w82_data.begin(), w82_data.end(), std::ref(f32rng)); + std::generate(w83_data.begin(), w83_data.end(), std::ref(f32rng)); + std::generate(w84_data.begin(), w84_data.end(), std::ref(f32rng)); + std::generate(w85_data.begin(), w85_data.end(), std::ref(f32rng)); + std::generate(w86_data.begin(), w86_data.end(), std::ref(f32rng)); + std::generate(w87_data.begin(), w87_data.end(), std::ref(f32rng)); + std::generate(w88_data.begin(), w88_data.end(), std::ref(f32rng)); + std::generate(w89_data.begin(), w89_data.end(), std::ref(f32rng)); + std::generate(w90_data.begin(), w90_data.end(), std::ref(f32rng)); + std::generate(w91_data.begin(), w91_data.end(), std::ref(f32rng)); + std::generate(w92_data.begin(), w92_data.end(), std::ref(f32rng)); + std::generate(w93_data.begin(), w93_data.end(), std::ref(f32rng)); + std::generate(w94_data.begin(), w94_data.end(), std::ref(f32rng)); + std::generate(w95_data.begin(), w95_data.end(), std::ref(f32rng)); + std::generate(w96_data.begin(), w96_data.end(), std::ref(f32rng)); + std::generate(w97_data.begin(), w97_data.end(), std::ref(f32rng)); + std::generate(w98_data.begin(), w98_data.end(), std::ref(f32rng)); + std::generate(w99_data.begin(), w99_data.end(), std::ref(f32rng)); + std::generate(w100_data.begin(), w100_data.end(), std::ref(f32rng)); + std::generate(w101_data.begin(), w101_data.end(), std::ref(f32rng)); + std::generate(w102_data.begin(), w102_data.end(), std::ref(f32rng)); + std::generate(w103_data.begin(), w103_data.end(), std::ref(f32rng)); + std::generate(w104_data.begin(), w104_data.end(), std::ref(f32rng)); + std::generate(w105_data.begin(), w105_data.end(), std::ref(f32rng)); + std::generate(w106_data.begin(), w106_data.end(), std::ref(f32rng)); + std::generate(w107_data.begin(), w107_data.end(), std::ref(f32rng)); + std::generate(w108_data.begin(), w108_data.end(), std::ref(f32rng)); + std::generate(w109_data.begin(), w109_data.end(), std::ref(f32rng)); + 
std::generate(w110_data.begin(), w110_data.end(), std::ref(f32rng)); + std::generate(w111_data.begin(), w111_data.end(), std::ref(f32rng)); + std::generate(w112_data.begin(), w112_data.end(), std::ref(f32rng)); + std::generate(w113_data.begin(), w113_data.end(), std::ref(f32rng)); + std::generate(w114_data.begin(), w114_data.end(), std::ref(f32rng)); + std::generate(w115_data.begin(), w115_data.end(), std::ref(f32rng)); + std::generate(w116_data.begin(), w116_data.end(), std::ref(f32rng)); + std::generate(w117_data.begin(), w117_data.end(), std::ref(f32rng)); + std::generate(w118_data.begin(), w118_data.end(), std::ref(f32rng)); + std::generate(w119_data.begin(), w119_data.end(), std::ref(f32rng)); + std::generate(w120_data.begin(), w120_data.end(), std::ref(f32rng)); + std::generate(w121_data.begin(), w121_data.end(), std::ref(f32rng)); + std::generate(w122_data.begin(), w122_data.end(), std::ref(f32rng)); + std::generate(w123_data.begin(), w123_data.end(), std::ref(f32rng)); + std::generate(w124_data.begin(), w124_data.end(), std::ref(f32rng)); + std::generate(w125_data.begin(), w125_data.end(), std::ref(f32rng)); + std::generate(w126_data.begin(), w126_data.end(), std::ref(f32rng)); + std::generate(w127_data.begin(), w127_data.end(), std::ref(f32rng)); + std::generate(w128_data.begin(), w128_data.end(), std::ref(f32rng)); + std::generate(w129_data.begin(), w129_data.end(), std::ref(f32rng)); + std::generate(w130_data.begin(), w130_data.end(), std::ref(f32rng)); + std::generate(w131_data.begin(), w131_data.end(), std::ref(f32rng)); + std::generate(w132_data.begin(), w132_data.end(), std::ref(f32rng)); + std::generate(w133_data.begin(), w133_data.end(), std::ref(f32rng)); + std::generate(w134_data.begin(), w134_data.end(), std::ref(f32rng)); + std::generate(w135_data.begin(), w135_data.end(), std::ref(f32rng)); + std::generate(w136_data.begin(), w136_data.end(), std::ref(f32rng)); + std::generate(w137_data.begin(), w137_data.end(), std::ref(f32rng)); + 
std::generate(w138_data.begin(), w138_data.end(), std::ref(f32rng)); + std::generate(w139_data.begin(), w139_data.end(), std::ref(f32rng)); + std::generate(w140_data.begin(), w140_data.end(), std::ref(f32rng)); + std::generate(w141_data.begin(), w141_data.end(), std::ref(f32rng)); + std::generate(w142_data.begin(), w142_data.end(), std::ref(f32rng)); + std::generate(w143_data.begin(), w143_data.end(), std::ref(f32rng)); + std::generate(w144_data.begin(), w144_data.end(), std::ref(f32rng)); + std::generate(w145_data.begin(), w145_data.end(), std::ref(f32rng)); + std::generate(w146_data.begin(), w146_data.end(), std::ref(f32rng)); + std::generate(w147_data.begin(), w147_data.end(), std::ref(f32rng)); + std::generate(w148_data.begin(), w148_data.end(), std::ref(f32rng)); + std::generate(w149_data.begin(), w149_data.end(), std::ref(f32rng)); + std::generate(w150_data.begin(), w150_data.end(), std::ref(f32rng)); + std::generate(w151_data.begin(), w151_data.end(), std::ref(f32rng)); + std::generate(w152_data.begin(), w152_data.end(), std::ref(f32rng)); + std::generate(w153_data.begin(), w153_data.end(), std::ref(f32rng)); + std::generate(w154_data.begin(), w154_data.end(), std::ref(f32rng)); + std::generate(w155_data.begin(), w155_data.end(), std::ref(f32rng)); + std::generate(w156_data.begin(), w156_data.end(), std::ref(f32rng)); + std::generate(w157_data.begin(), w157_data.end(), std::ref(f32rng)); + std::generate(w158_data.begin(), w158_data.end(), std::ref(f32rng)); + std::generate(w159_data.begin(), w159_data.end(), std::ref(f32rng)); + std::generate(w160_data.begin(), w160_data.end(), std::ref(f32rng)); + std::generate(w161_data.begin(), w161_data.end(), std::ref(f32rng)); + std::generate(w162_data.begin(), w162_data.end(), std::ref(f32rng)); + std::generate(w163_data.begin(), w163_data.end(), std::ref(f32rng)); + std::generate(w164_data.begin(), w164_data.end(), std::ref(f32rng)); + std::generate(w165_data.begin(), w165_data.end(), std::ref(f32rng)); + 
std::generate(w166_data.begin(), w166_data.end(), std::ref(f32rng)); + std::generate(w167_data.begin(), w167_data.end(), std::ref(f32rng)); + std::generate(w168_data.begin(), w168_data.end(), std::ref(f32rng)); + std::generate(w169_data.begin(), w169_data.end(), std::ref(f32rng)); + std::generate(w170_data.begin(), w170_data.end(), std::ref(f32rng)); + std::generate(w171_data.begin(), w171_data.end(), std::ref(f32rng)); + std::generate(w172_data.begin(), w172_data.end(), std::ref(f32rng)); + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/3, + /*group_output_channels=*/32, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v0, + w67, + w68, + v1, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #0" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/32, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v1, + w69, + w70, + v2, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #1" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/32, + /*group_output_channels=*/16, + 
/*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v2, + w71, + w72, + v3, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #2" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/16, + /*group_output_channels=*/96, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v3, + w73, + w74, + v4, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #3" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/96, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v4, + w75, + w76, + v5, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #4" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/96, + /*group_output_channels=*/24, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v5, + w77, + w78, + v6, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #5" << std::endl; + return nullptr; + } + + status = 
xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/24, + /*group_output_channels=*/144, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v6, + w79, + w80, + v7, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #6" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/144, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v7, + w81, + w82, + v8, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #7" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/144, + /*group_output_channels=*/24, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v8, + w83, + w84, + v9, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #8" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v9, + v6, + v10, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #9" << std::endl; + return nullptr; + 
} + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/24, + /*group_output_channels=*/144, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v10, + w85, + w86, + v11, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #10" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/144, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v11, + w87, + w88, + v12, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #11" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/144, + /*group_output_channels=*/32, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v12, + w89, + w90, + v13, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #12" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, 
/*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/32, + /*group_output_channels=*/192, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v13, + w91, + w92, + v14, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #13" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/192, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v14, + w93, + w94, + v15, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #14" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/192, + /*group_output_channels=*/32, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v15, + w95, + w96, + v16, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #15" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v16, + v13, + v17, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #16" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, 
/*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/32, + /*group_output_channels=*/192, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v17, + w97, + w98, + v18, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #17" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/192, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v18, + w99, + w100, + v19, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #18" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/192, + /*group_output_channels=*/32, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v19, + w101, + w102, + v20, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #19" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v20, + v17, + v21, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #20" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, 
/*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/32, + /*group_output_channels=*/192, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v21, + w103, + w104, + v22, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #21" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/192, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v22, + w105, + w106, + v23, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #22" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/192, + /*group_output_channels=*/64, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v23, + w107, + w108, + v24, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #23" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/64, + /*group_output_channels=*/384, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v24, + w109, + 
w110, + v25, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #24" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/384, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v25, + w111, + w112, + v26, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #25" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/384, + /*group_output_channels=*/64, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v26, + w113, + w114, + v27, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #26" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v27, + v24, + v28, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #27" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/64, + /*group_output_channels=*/384, + /*output_min=*/0.0f, 
/*output_max=*/6.0f, + v28, + w115, + w116, + v29, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #28" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/384, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v29, + w117, + w118, + v30, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #29" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/384, + /*group_output_channels=*/64, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v30, + w119, + w120, + v31, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #30" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v31, + v28, + v32, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #31" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/64, + 
/*group_output_channels=*/384, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v32, + w121, + w122, + v33, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #32" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/384, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v33, + w123, + w124, + v34, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #33" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/384, + /*group_output_channels=*/64, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v34, + w125, + w126, + v35, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #34" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v35, + v32, + v36, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #35" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, 
+ /*group_input_channels=*/64, + /*group_output_channels=*/384, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v36, + w127, + w128, + v37, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #36" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/384, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v37, + w129, + w130, + v38, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #37" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/384, + /*group_output_channels=*/96, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v38, + w131, + w132, + v39, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #38" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/96, + /*group_output_channels=*/576, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v39, + w133, + w134, + v40, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #39" << std::endl; + return 
nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/576, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v40, + w135, + w136, + v41, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #40" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/576, + /*group_output_channels=*/96, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v41, + w137, + w138, + v42, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #41" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v42, + v39, + v43, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #42" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/96, + /*group_output_channels=*/576, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v43, + w139, + w140, + v44, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create 
node #43" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/576, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v44, + w141, + w142, + v45, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #44" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/576, + /*group_output_channels=*/96, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v45, + w143, + w144, + v46, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #45" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v46, + v43, + v47, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #46" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/96, + /*group_output_channels=*/576, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v47, + w145, + w146, + v48, + /*flags=*/0); + if (status != xnn_status_success) { 
+ std::cerr << "failed to create node #47" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/0, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/2, /*subsampling_width=*/2, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/576, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v48, + w147, + w148, + v49, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #48" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/576, + /*group_output_channels=*/160, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v49, + w149, + w150, + v50, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #49" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/160, + /*group_output_channels=*/960, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v50, + w151, + w152, + v51, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #50" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, 
/*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/960, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v51, + w153, + w154, + v52, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #51" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/960, + /*group_output_channels=*/160, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v52, + w155, + w156, + v53, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #52" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v53, + v50, + v54, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #53" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/160, + /*group_output_channels=*/960, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v54, + w157, + w158, + v55, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #54" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, /*padding_right=*/1, /*padding_bottom=*/1, 
/*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/960, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v55, + w159, + w160, + v56, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #55" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/960, + /*group_output_channels=*/160, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v56, + w161, + w162, + v57, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #56" << std::endl; + return nullptr; + } + + status = xnn_define_add2( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), + /*output_max=*/std::numeric_limits::infinity(), + v57, + v54, + v58, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #57" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/160, + /*group_output_channels=*/960, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v58, + w163, + w164, + v59, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #58" << std::endl; + return nullptr; + } + + status = xnn_define_depthwise_convolution_2d( + subgraph, + /*padding_top=*/1, 
/*padding_right=*/1, /*padding_bottom=*/1, /*padding_left=*/1, + /*kernel_height=*/3, /*kernel_width=*/3, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*depth_multiplier=*/1, + /*input_channels=*/960, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v59, + w165, + w166, + v60, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #59" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/960, + /*group_output_channels=*/320, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v60, + w167, + w168, + v61, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #60" << std::endl; + return nullptr; + } + + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/320, + /*group_output_channels=*/1280, + /*output_min=*/0.0f, /*output_max=*/6.0f, + v61, + w169, + w170, + v62, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #61" << std::endl; + return nullptr; + } + + status = xnn_define_global_average_pooling_2d( + subgraph, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v62, + v63, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #62" << std::endl; + return nullptr; + } + +#if 0 + // These last few ops 
cause a weird crash. + status = xnn_define_convolution_2d( + subgraph, + /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, + /*kernel_height=*/1, /*kernel_width=*/1, + /*subsampling_height=*/1, /*subsampling_width=*/1, + /*dilation_height=*/1, /*dilation_width=*/1, + /*groups=*/1, + /*group_input_channels=*/1280, + /*group_output_channels=*/1008, + /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + v63, + w171, + w172, + v64, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #63" << std::endl; + return nullptr; + } + + status = xnn_define_copy( + subgraph, + v64, + v65, + 0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #64" << std::endl; + return nullptr; + } + + // Supposed to be softmax, qint8 not supported + status = xnn_define_copy( + subgraph, + v65, + v66, + /*flags=*/0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #65" << std::endl; + return nullptr; + } +#endif + return subgraph; +} + +} // namespace models diff --git a/bench/qs8-dwconv-e2e.cc b/bench/qs8-dwconv-e2e.cc deleted file mode 100644 index 0502493056f..00000000000 --- a/bench/qs8-dwconv-e2e.cc +++ /dev/null @@ -1,1937 +0,0 @@ -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include -#include -#include - -#include -#include "bench/end2end.h" -#include "bench/utils.h" - -#include "xnnpack.h" -#include "xnnpack/config.h" -#include "xnnpack/dwconv.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/models.h" - - -static void DWConvEnd2EndBenchmark( - benchmark::State& state, - models::ExecutionPlanFactory model_factory, - xnn_qs8_dwconv_minmax_unipass_ukernel_fn dwconv, - xnn_init_qs8_conv_minmax_params_fn init_params, - uint8_t channel_tile, uint8_t primary_tile, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if (isa_check != nullptr && !isa_check(state)) { - return; - } - if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) { - state.SkipWithError("failed to initialize XNNPACK"); - return; - } - - struct xnn_dwconv_config* dwconv_config = xnn_init_qs8_dwconv_config(); - if (dwconv_config == nullptr) { - state.SkipWithError("failed to initialize QS8 DWCONV config"); - return; - } - - // Save dwconv_config so that we can modify it for the benchmark and later restore it. - struct xnn_dwconv_config saved_dwconv_params[XNN_MAX_QS8_DWCONV_UKERNELS]; - memcpy(saved_dwconv_params, dwconv_config, sizeof(saved_dwconv_params)); - - // Override microkernels chosen in xnn_initialize - for (size_t i = 0; i < XNN_MAX_QS8_DWCONV_UKERNELS; i++) { - // Replace only the microkernel the matching kernel size. - if (dwconv_config[i].primary_tile == primary_tile) { - // Note: do not directly assign to dwconv_config[i] because it breaks older gcc. 
- dwconv_config[i].minmax.unipass = xnn_dwconv_unipass_ukernel_fn(dwconv); - dwconv_config[i].channel_tile = channel_tile; - dwconv_config[i].channel_subtile = channel_tile; - dwconv_config[i].channel_round = 1; - dwconv_config[i].primary_tile = primary_tile; - dwconv_config[i].init.qs8 = init_params; - break; - } - } - - auto execution_plan = model_factory(nullptr); - if (execution_plan.empty()) { - state.SkipWithError("failed to create a model"); - return; - } - - for (auto _ : state) { - for (const std::unique_ptr& op : execution_plan) { - xnn_status status = xnn_run_operator(op.get(), nullptr); - if (status != xnn_status_success) { - state.SkipWithError("failed to run a model"); - return; - } - } - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - // Restore dwconv_config to original state as defined in init.c. - memcpy(dwconv_config, saved_dwconv_params, sizeof(saved_dwconv_params)); -} - - -static void DWConvEnd2EndBenchmark( - benchmark::State& state, - models::ExecutionPlanFactory model_factory, - xnn_qs8_dwconv_minmax_multipass_ukernel_fn dwconv, - xnn_init_qs8_conv_minmax_params_fn init_params, - uint8_t channel_tile, uint8_t channel_subtile, uint8_t channel_round, - uint8_t primary_tile, uint8_t middle_tile, uint8_t last_tile, - uint8_t primary_tile_to_replace, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if (isa_check != nullptr && !isa_check(state)) { - return; - } - if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) { - state.SkipWithError("failed to initialize XNNPACK"); - return; - } - - struct xnn_dwconv_config* dwconv_config = xnn_init_qs8_dwconv_config(); - if (dwconv_config == nullptr) { - state.SkipWithError("failed to initialize qs8 DWCONV config"); - return; - } - - // Save dwconv_config so that we can modify it for the benchmark and later restore it. 
- struct xnn_dwconv_config saved_dwconv_params[XNN_MAX_QS8_DWCONV_UKERNELS]; - memcpy(saved_dwconv_params, dwconv_config, sizeof(saved_dwconv_params)); - - bool found = false; - for (size_t i = 0; i < XNN_MAX_QS8_DWCONV_UKERNELS; i++) { - if (dwconv_config[i].primary_tile == primary_tile_to_replace) { - found = true; - } else if (dwconv_config[i].last_tile != 0) { - // Found a multipass microkernel, replace it. - found = true; - } - } - - if (!found) { - state.SkipWithError("can't find unipass with specified primary tile to replace or multipass to replace"); - return; - } - - // Override microkernels chosen in xnn_initialize - for (size_t i = 0; i < XNN_MAX_QS8_DWCONV_UKERNELS; i++) { - // Replace only the microkernel the matching kernel size. - if (dwconv_config[i].primary_tile == primary_tile_to_replace || - dwconv_config[i].last_tile != 0) { - // Replace either when the primary_tile_to_replace matches, or replace the - // first multipass dwconv microkernel we find. - // TODO(zhin): support specifying target multipass dwconv to replace. - std::memset(&dwconv_config[i], 0, sizeof(dwconv_config[i])); - - // Note: do not directly assign to dwconv_config[i] because it breaks older gcc. 
- dwconv_config[i].minmax.multipass = xnn_dwconv_multipass_ukernel_fn(dwconv); - dwconv_config[i].channel_tile = channel_tile; - dwconv_config[i].channel_subtile = channel_subtile; - dwconv_config[i].channel_round = channel_round; - dwconv_config[i].primary_tile = primary_tile; - dwconv_config[i].middle_tile = middle_tile; - dwconv_config[i].last_tile = last_tile; - dwconv_config[i].init.qs8 = init_params; - break; - } - } - - auto execution_plan = model_factory(nullptr); - if (execution_plan.empty()) { - state.SkipWithError("failed to create a model"); - return; - } - - for (auto _ : state) { - for (const std::unique_ptr& op : execution_plan) { - xnn_status status = xnn_run_operator(op.get(), nullptr); - if (status != xnn_status_success) { - state.SkipWithError("failed to run a model"); - return; - } - } - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - // Restore dwconv_config to original state as defined in init.c. 
- memcpy(dwconv_config, saved_dwconv_params, sizeof(saved_dwconv_params)); -} - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - static void qs8_dwconv_9p8c__neon_mul8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mul8_ld64, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qs8_dwconv_9p16c__neon_mul8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8_ld64, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qs8_dwconv_9p16c__neon_mul8_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8_ld128, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qs8_dwconv_9p8c__neon_mla8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mla8_ld64, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qs8_dwconv_9p16c__neon_mla8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mla8_ld64, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qs8_dwconv_9p16c__neon_mla8_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - 
xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mla8_ld128, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qs8_dwconv_9p8c__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qs8_dwconv_9p16c__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qs8_dwconv_9p32c__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_9p32c__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON); - } - - static void qs8_dwconv_25p8c__neon_mul8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul8_ld64, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - 8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qs8_dwconv_25p16c__neon_mul8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul8_ld64, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qs8_dwconv_25p16c__neon_mul8_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - 
DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul8_ld128, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qs8_dwconv_25p8c__neon_mla8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mla8_ld64, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - 8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qs8_dwconv_25p16c__neon_mla8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mla8_ld64, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qs8_dwconv_25p16c__neon_mla8_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mla8_ld128, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qs8_dwconv_25p8c__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - 8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qs8_dwconv_25p16c__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qs8_dwconv_25p32c__neon_mul16(benchmark::State& state, 
models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_25p32c__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - 32 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON); - } - - static void qs8_dwconv_5f5m5l8c8s8r__neon_mul8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mul8_ld64, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_5f5m5l8c8s8r__neon_mla8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mla8_ld64, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_5f5m5l16c8s8r__neon_mul8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul8_ld64, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_5f5m5l16c8s8r__neon_mla8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mla8_ld64, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - 
/*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_5f5m5l16c8s8r__neon_mul8_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul8_ld128, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_5f5m5l16c8s8r__neon_mla8_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mla8_ld128, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_5f5m5l8c8s8r__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_5f5m5l16c8s8r__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_5f5m5l32c8s8r__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_5f5m5l32c8s8r__neon_mul16, - 
xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - - static void qs8_dwconv_6f6m7l8c8s8r__neon_mul8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mul8_ld64, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_6f6m7l8c8s8r__neon_mla8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mla8_ld64, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_6f6m7l16c8s8r__neon_mul8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul8_ld64, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_6f6m7l16c8s8r__neon_mla8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mla8_ld64, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void 
qs8_dwconv_6f6m7l16c8s8r__neon_mul8_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul8_ld128, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_6f6m7l16c8s8r__neon_mla8_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mla8_ld128, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_6f6m7l8c8s8r__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_6f6m7l16c8s8r__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_6f6m7l32c8s8r__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_6f6m7l32c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - 
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - - static void qs8_dwconv_8f8m9l8c8s8r__neon_mul8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mul8_ld64, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_8f8m9l8c8s8r__neon_mla8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mla8_ld64, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_8f8m9l16c8s8r__neon_mul8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul8_ld64, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_8f8m9l16c8s8r__neon_mla8_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mla8_ld64, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_8f8m9l16c8s8r__neon_mul8_ld128(benchmark::State& state, 
models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul8_ld128, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_8f8m9l16c8s8r__neon_mla8_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mla8_ld128, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_8f8m9l8c8s8r__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_8f8m9l16c8s8r__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_8f8m9l32c8s8r__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_rndnu_ukernel_8f8m9l32c8s8r__neon_mul16, - xnn_init_qs8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/8, - 
/*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - - BENCHMARK_QS8_END2END(qs8_dwconv_9p8c__neon_mul8_ld64); - BENCHMARK_QS8_END2END(qs8_dwconv_9p16c__neon_mul8_ld64); - BENCHMARK_QS8_END2END(qs8_dwconv_9p16c__neon_mul8_ld128); - BENCHMARK_QS8_END2END(qs8_dwconv_9p8c__neon_mla8_ld64); - BENCHMARK_QS8_END2END(qs8_dwconv_9p16c__neon_mla8_ld64); - BENCHMARK_QS8_END2END(qs8_dwconv_9p16c__neon_mla8_ld128); - BENCHMARK_QS8_END2END(qs8_dwconv_9p8c__neon_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_9p16c__neon_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_9p32c__neon_mul16); - - BENCHMARK_QS8_END2END(qs8_dwconv_25p8c__neon_mul8_ld64); - BENCHMARK_QS8_END2END(qs8_dwconv_25p16c__neon_mul8_ld64); - BENCHMARK_QS8_END2END(qs8_dwconv_25p16c__neon_mul8_ld128); - BENCHMARK_QS8_END2END(qs8_dwconv_25p8c__neon_mla8_ld64); - BENCHMARK_QS8_END2END(qs8_dwconv_25p16c__neon_mla8_ld64); - BENCHMARK_QS8_END2END(qs8_dwconv_25p16c__neon_mla8_ld128); - BENCHMARK_QS8_END2END(qs8_dwconv_25p8c__neon_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_25p16c__neon_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_25p32c__neon_mul16); - - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l8c8s8r__neon_mul8_ld64); - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l8c8s8r__neon_mla8_ld64); - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l16c8s8r__neon_mul8_ld64); - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l16c8s8r__neon_mla8_ld64); - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l16c8s8r__neon_mul8_ld128); - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l16c8s8r__neon_mla8_ld128); - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l8c8s8r__neon_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l16c8s8r__neon_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l32c8s8r__neon_mul16); - - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l8c8s8r__neon_mul8_ld64); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l8c8s8r__neon_mla8_ld64); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l16c8s8r__neon_mul8_ld64); - 
BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l16c8s8r__neon_mla8_ld64); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l16c8s8r__neon_mul8_ld128); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l16c8s8r__neon_mla8_ld128); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l8c8s8r__neon_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l16c8s8r__neon_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l32c8s8r__neon_mul16); - - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l8c8s8r__neon_mul8_ld64); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l8c8s8r__neon_mla8_ld64); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l16c8s8r__neon_mul8_ld64); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l16c8s8r__neon_mla8_ld64); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l16c8s8r__neon_mul8_ld128); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l16c8s8r__neon_mla8_ld128); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l8c8s8r__neon_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l16c8s8r__neon_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l32c8s8r__neon_mul16); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - static void qs8_dwconv_9p16c__avx512skx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX512SKX); - } - static void qs8_dwconv_9p32c__avx512skx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX512SKX); - } - static void qs8_dwconv_9p16c__avx2_mul16_vpmovsx(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_vpmovsx, - 
xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2); - } - static void qs8_dwconv_9p32c__avx2_mul16_vpmovsx(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_vpmovsx, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2); - } - static void qs8_dwconv_9p16c__avx2_mul16_vpunpck(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2); - } - static void qs8_dwconv_9p32c__avx2_mul16_vpunpck(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2); - } - static void qs8_dwconv_9p16c__avx2_mul16_add16_vpunpck(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_add16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2); - } - static void qs8_dwconv_9p32c__avx2_mul16_add16_vpunpck(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul16_add16_vpunpck, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2); - } - static void qs8_dwconv_9p8c__avx2_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - 
DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__avx2_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2); - } - static void qs8_dwconv_9p16c__avx2_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2); - } - static void qs8_dwconv_9p32c__avx2_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2); - } - static void qs8_dwconv_9p8c__avx_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX); - } - static void qs8_dwconv_9p16c__avx_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX); - } - static void qs8_dwconv_9p8c__avx_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX); - } - static void qs8_dwconv_9p16c__avx_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - 
DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX); - } - static void qs8_dwconv_9p8c__avx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX); - } - static void qs8_dwconv_9p16c__avx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX); - } - static void qs8_dwconv_9p8c__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_9p16c__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_9p8c__sse41_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_9p16c__sse41_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - 
DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_9p8c__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_9p16c__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_9p8c__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 9 /* primary tile */); - } - static void qs8_dwconv_9p16c__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */); - } - static void qs8_dwconv_9p8c__sse2_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 9 /* primary tile */); - } - static void qs8_dwconv_9p16c__sse2_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - 
xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */); - } - - static void qs8_dwconv_25p8c__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 25 /* primary tile */); - } - static void qs8_dwconv_25p16c__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 25 /* primary tile */); - } - static void qs8_dwconv_25p8c__sse2_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 25 /* primary tile */); - } - static void qs8_dwconv_25p16c__sse2_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 25 /* primary tile */); - } - static void qs8_dwconv_25p8c__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_25p16c__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 
25 /* primary tile */, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_25p8c__sse41_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_25p16c__sse41_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_25p8c__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_25p16c__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckSSE41); - } - - static void qs8_dwconv_5f5m5l8c8s8r__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_5f5m5l16c8s8r__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - 
xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_6f6m7l8c8s8r__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_6f6m7l16c8s8r__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_8f8m9l8c8s8r__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_8f8m9l16c8s8r__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - - static void 
qs8_dwconv_5f5m5l8c8s8r__sse2_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_5f5m5l16c8s8r__sse2_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_6f6m7l8c8s8r__sse2_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_6f6m7l16c8s8r__sse2_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_8f8m9l8c8s8r__sse2_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16_add16, - 
xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_8f8m9l16c8s8r__sse2_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - - static void qs8_dwconv_5f5m5l8c8s8r__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_5f5m5l16c8s8r__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_6f6m7l8c8s8r__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, 
benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_6f6m7l16c8s8r__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_8f8m9l8c8s8r__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_8f8m9l16c8s8r__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - - static void qs8_dwconv_5f5m5l8c8s8r__sse41_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_5f5m5l16c8s8r__sse41_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory 
model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_6f6m7l8c8s8r__sse41_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_6f6m7l16c8s8r__sse41_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_8f8m9l8c8s8r__sse41_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_8f8m9l16c8s8r__sse41_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - 
xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - - static void qs8_dwconv_5f5m5l8c4s4r__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__sse41_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_5f5m5l16c4s4r__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__sse41_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_6f6m7l8c4s4r__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__sse41_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_6f6m7l16c4s4r__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__sse41_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/4, 
/*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_8f8m9l8c4s4r__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__sse41_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qs8_dwconv_8f8m9l16c4s4r__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__sse41_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - - static void qs8_dwconv_5f5m5l8c4s4r__avx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__avx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckAVX); - } - static void qs8_dwconv_5f5m5l16c4s4r__avx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__avx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckAVX); - } - static void 
qs8_dwconv_6f6m7l8c4s4r__avx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__avx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckAVX); - } - static void qs8_dwconv_6f6m7l16c4s4r__avx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__avx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckAVX); - } - static void qs8_dwconv_8f8m9l8c4s4r__avx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__avx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckAVX); - } - static void qs8_dwconv_8f8m9l16c4s4r__avx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__avx_mul32, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckAVX); - } - - static void qs8_dwconv_5f5m5l16c16s16r__avx2_mul16_vpmovsx(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - 
xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s16r__avx2_mul16_vpmovsx, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/16, /*channel_round=*/16, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckAVX2); - } - static void qs8_dwconv_5f5m5l32c16s16r__avx2_mul16_vpmovsx(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s16r__avx2_mul16_vpmovsx, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/16, /*channel_round=*/16, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckAVX2); - } - static void qs8_dwconv_6f6m7l16c16s16r__avx2_mul16_vpmovsx(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s16r__avx2_mul16_vpmovsx, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/16, /*channel_round=*/16, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckAVX2); - } - static void qs8_dwconv_6f6m7l32c16s16r__avx2_mul16_vpmovsx(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s16r__avx2_mul16_vpmovsx, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/16, /*channel_round=*/16, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckAVX2); - } - static void qs8_dwconv_8f8m9l16c16s16r__avx2_mul16_vpmovsx(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s16r__avx2_mul16_vpmovsx, - 
xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/16, /*channel_round=*/16, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckAVX2); - } - static void qs8_dwconv_8f8m9l32c16s16r__avx2_mul16_vpmovsx(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpmovsx, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/16, /*channel_round=*/16, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckAVX2); - } - - BENCHMARK_QS8_END2END(qs8_dwconv_9p16c__avx512skx_mul32); - BENCHMARK_QS8_END2END(qs8_dwconv_9p32c__avx512skx_mul32); - - BENCHMARK_QS8_END2END(qs8_dwconv_9p16c__avx2_mul16_vpmovsx); - BENCHMARK_QS8_END2END(qs8_dwconv_9p32c__avx2_mul16_vpmovsx); - BENCHMARK_QS8_END2END(qs8_dwconv_9p16c__avx2_mul16_vpunpck); - BENCHMARK_QS8_END2END(qs8_dwconv_9p32c__avx2_mul16_vpunpck); - BENCHMARK_QS8_END2END(qs8_dwconv_9p16c__avx2_mul16_add16_vpunpck); - BENCHMARK_QS8_END2END(qs8_dwconv_9p32c__avx2_mul16_add16_vpunpck); - BENCHMARK_QS8_END2END(qs8_dwconv_9p8c__avx2_mul32); - BENCHMARK_QS8_END2END(qs8_dwconv_9p16c__avx2_mul32); - BENCHMARK_QS8_END2END(qs8_dwconv_9p32c__avx2_mul32); - - BENCHMARK_QS8_END2END(qs8_dwconv_9p8c__avx_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_9p16c__avx_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_9p8c__avx_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_9p16c__avx_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_9p8c__avx_mul32); - BENCHMARK_QS8_END2END(qs8_dwconv_9p16c__avx_mul32); - - BENCHMARK_QS8_END2END(qs8_dwconv_9p8c__sse41_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_9p16c__sse41_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_9p8c__sse41_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_9p16c__sse41_mul16_add16); - 
BENCHMARK_QS8_END2END(qs8_dwconv_9p8c__sse41_mul32); - BENCHMARK_QS8_END2END(qs8_dwconv_9p16c__sse41_mul32); - - BENCHMARK_QS8_END2END(qs8_dwconv_9p8c__sse2_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_9p16c__sse2_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_9p8c__sse2_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_9p16c__sse2_mul16_add16); - - BENCHMARK_QS8_END2END(qs8_dwconv_25p8c__sse2_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_25p16c__sse2_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_25p8c__sse2_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_25p16c__sse2_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_25p8c__sse41_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_25p16c__sse41_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_25p8c__sse41_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_25p16c__sse41_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_25p8c__sse41_mul32); - BENCHMARK_QS8_END2END(qs8_dwconv_25p16c__sse41_mul32); - - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l8c8s8r__sse2_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l16c8s8r__sse2_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l8c8s8r__sse2_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l16c8s8r__sse2_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l8c8s8r__sse2_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l16c8s8r__sse2_mul16); - - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l8c8s8r__sse2_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l16c8s8r__sse2_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l8c8s8r__sse2_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l16c8s8r__sse2_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l8c8s8r__sse2_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l16c8s8r__sse2_mul16_add16); - - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l8c8s8r__sse41_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l16c8s8r__sse41_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l8c8s8r__sse41_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l16c8s8r__sse41_mul16); - 
BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l8c8s8r__sse41_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l16c8s8r__sse41_mul16); - - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l8c8s8r__sse41_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l16c8s8r__sse41_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l8c8s8r__sse41_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l16c8s8r__sse41_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l8c8s8r__sse41_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l16c8s8r__sse41_mul16_add16); - - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l8c4s4r__sse41_mul32); - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l16c4s4r__sse41_mul32); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l8c4s4r__sse41_mul32); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l16c4s4r__sse41_mul32); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l8c4s4r__sse41_mul32); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l16c4s4r__sse41_mul32); - - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l8c4s4r__avx_mul32); - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l16c4s4r__avx_mul32); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l8c4s4r__avx_mul32); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l16c4s4r__avx_mul32); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l8c4s4r__avx_mul32); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l16c4s4r__avx_mul32); - - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l16c16s16r__avx2_mul16_vpmovsx); - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l32c16s16r__avx2_mul16_vpmovsx); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l16c16s16r__avx2_mul16_vpmovsx); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l32c16s16r__avx2_mul16_vpmovsx); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l16c16s16r__avx2_mul16_vpmovsx); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l32c16s16r__avx2_mul16_vpmovsx); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - static void qs8_dwconv_9p8c__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - 
xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 9 /* primary tile */); - } - static void qs8_dwconv_9p16c__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */); - } - - static void qs8_dwconv_9p8c__wasmsimd_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 9 /* primary tile */); - } - static void qs8_dwconv_9p16c__wasmsimd_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */); - } - - static void qs8_dwconv_25p8c__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 25 /* primary tile */); - } - static void qs8_dwconv_25p16c__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 25 /* primary tile */); - } - - static void qs8_dwconv_25p8c__wasmsimd_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 8 /* 
channel tile */, 25 /* primary tile */); - } - static void qs8_dwconv_25p16c__wasmsimd_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 25 /* primary tile */); - } - - static void qs8_dwconv_5f5m5l8c8s8r__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_5f5m5l16c8s8r__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_5f5m5l8c8s8r__wasmsimd_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_5f5m5l16c8s8r__wasmsimd_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__wasmsimd_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, 
/*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - - static void qs8_dwconv_6f6m7l8c8s8r__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_6f6m7l16c8s8r__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_6f6m7l8c8s8r__wasmsimd_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_6f6m7l16c8s8r__wasmsimd_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__wasmsimd_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - - static void qs8_dwconv_8f8m9l8c8s8r__wasmsimd_mul16(benchmark::State& state, 
models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_8f8m9l16c8s8r__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_8f8m9l8c8s8r__wasmsimd_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_8f8m9l16c8s8r__wasmsimd_mul16_add16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__wasmsimd_mul16_add16, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - - BENCHMARK_QS8_END2END(qs8_dwconv_9p8c__wasmsimd_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_9p16c__wasmsimd_mul16); - - BENCHMARK_QS8_END2END(qs8_dwconv_9p8c__wasmsimd_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_9p16c__wasmsimd_mul16_add16); - - BENCHMARK_QS8_END2END(qs8_dwconv_25p8c__wasmsimd_mul16); - 
BENCHMARK_QS8_END2END(qs8_dwconv_25p16c__wasmsimd_mul16); - - BENCHMARK_QS8_END2END(qs8_dwconv_25p8c__wasmsimd_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_25p16c__wasmsimd_mul16_add16); - - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l8c8s8r__wasmsimd_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l16c8s8r__wasmsimd_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l8c8s8r__wasmsimd_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l16c8s8r__wasmsimd_mul16_add16); - - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l8c8s8r__wasmsimd_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l16c8s8r__wasmsimd_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l8c8s8r__wasmsimd_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l16c8s8r__wasmsimd_mul16_add16); - - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l8c8s8r__wasmsimd_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l16c8s8r__wasmsimd_mul16); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l8c8s8r__wasmsimd_mul16_add16); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l16c8s8r__wasmsimd_mul16_add16); - -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - static void qs8_dwconv_9p1c__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 1 /* channel tile */, 9 /* primary tile */); - } - static void qs8_dwconv_9p2c__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 2 /* channel tile */, 9 /* primary tile */); - } - static void qs8_dwconv_9p4c__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__wasm_fmagic, - 
xnn_init_qs8_conv_minmax_fp32_scalar_params, - 4 /* channel tile */, 9 /* primary tile */); - } - - static void qs8_dwconv_25p1c__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 1 /* channel tile */, 25 /* primary tile */); - } - static void qs8_dwconv_25p2c__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 2 /* channel tile */, 25 /* primary tile */); - } - static void qs8_dwconv_25p4c__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 4 /* channel tile */, 25 /* primary tile */); - } - - static void qs8_dwconv_5f5m5l1c1s1r__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_5f5m5l2c1s1r__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_5f5m5l4c1s1r__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - 
DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - - static void qs8_dwconv_6f6m7l1c1s1r__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_6f6m7l2c1s1r__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_6f6m7l4c1s1r__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - - static void qs8_dwconv_8f8m9l1c1s1r__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - 
/*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_8f8m9l2c1s1r__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qs8_dwconv_8f8m9l4c1s1r__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__wasm_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - - BENCHMARK_QS8_END2END(qs8_dwconv_9p1c__wasm_fmagic); - BENCHMARK_QS8_END2END(qs8_dwconv_9p2c__wasm_fmagic); - BENCHMARK_QS8_END2END(qs8_dwconv_9p4c__wasm_fmagic); - - BENCHMARK_QS8_END2END(qs8_dwconv_25p1c__wasm_fmagic); - BENCHMARK_QS8_END2END(qs8_dwconv_25p2c__wasm_fmagic); - BENCHMARK_QS8_END2END(qs8_dwconv_25p4c__wasm_fmagic); - - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l1c1s1r__wasm_fmagic); - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l2c1s1r__wasm_fmagic); - BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l4c1s1r__wasm_fmagic); - - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l1c1s1r__wasm_fmagic); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l2c1s1r__wasm_fmagic); - BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l4c1s1r__wasm_fmagic); - - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l1c1s1r__wasm_fmagic); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l2c1s1r__wasm_fmagic); - BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l4c1s1r__wasm_fmagic); -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -static void qs8_dwconv_9p1c__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - 
DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 1 /* channel tile */, 9 /* primary tile */); -} -static void qs8_dwconv_9p2c__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 2 /* channel tile */, 9 /* primary tile */); -} -static void qs8_dwconv_9p4c__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 4 /* channel tile */, 9 /* primary tile */); -} - -static void qs8_dwconv_9p1c__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 1 /* channel tile */, 9 /* primary tile */); -} -static void qs8_dwconv_9p2c__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 2 /* channel tile */, 9 /* primary tile */); -} -static void qs8_dwconv_9p4c__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 4 /* channel tile */, 9 /* primary tile */); -} - -static void qs8_dwconv_9p1c__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 1 /* channel tile */, 9 /* primary tile */); 
-} -static void qs8_dwconv_9p2c__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 2 /* channel tile */, 9 /* primary tile */); -} -static void qs8_dwconv_9p4c__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 4 /* channel tile */, 9 /* primary tile */); -} - -static void qs8_dwconv_25p1c__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 1 /* channel tile */, 25 /* primary tile */); -} -static void qs8_dwconv_25p2c__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 2 /* channel tile */, 25 /* primary tile */); -} -static void qs8_dwconv_25p4c__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 4 /* channel tile */, 25 /* primary tile */); -} - -static void qs8_dwconv_25p1c__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 1 /* channel tile */, 25 /* primary tile */); -} -static void qs8_dwconv_25p2c__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - 
xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 2 /* channel tile */, 25 /* primary tile */); -} -static void qs8_dwconv_25p4c__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 4 /* channel tile */, 25 /* primary tile */); -} - -static void qs8_dwconv_25p1c__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 1 /* channel tile */, 25 /* primary tile */); -} -static void qs8_dwconv_25p2c__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 2 /* channel tile */, 25 /* primary tile */); -} -static void qs8_dwconv_25p4c__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - 4 /* channel tile */, 25 /* primary tile */); -} - -static void qs8_dwconv_5f5m5l1c1s1r__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_5f5m5l2c1s1r__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - 
xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_5f5m5l4c1s1r__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_5f5m5l1c1s1r__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_5f5m5l2c1s1r__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_5f5m5l4c1s1r__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); -} -static 
void qs8_dwconv_5f5m5l1c1s1r__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_5f5m5l2c1s1r__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_5f5m5l4c1s1r__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); -} - -static void qs8_dwconv_6f6m7l1c1s1r__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_6f6m7l2c1s1r__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, 
/*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_6f6m7l4c1s1r__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_6f6m7l1c1s1r__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_6f6m7l2c1s1r__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_6f6m7l4c1s1r__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_6f6m7l1c1s1r__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - 
DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_6f6m7l2c1s1r__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_6f6m7l4c1s1r__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); -} - -static void qs8_dwconv_8f8m9l1c1s1r__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_8f8m9l2c1s1r__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - 
/*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_8f8m9l4c1s1r__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_fmagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_8f8m9l1c1s1r__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_8f8m9l2c1s1r__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_8f8m9l4c1s1r__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_imagic, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_8f8m9l1c1s1r__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_lrintf, - 
xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_8f8m9l2c1s1r__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); -} -static void qs8_dwconv_8f8m9l4c1s1r__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_lrintf, - xnn_init_qs8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); -} - -BENCHMARK_QS8_END2END(qs8_dwconv_9p1c__scalar_fmagic); -BENCHMARK_QS8_END2END(qs8_dwconv_9p2c__scalar_fmagic); -BENCHMARK_QS8_END2END(qs8_dwconv_9p4c__scalar_fmagic); - -BENCHMARK_QS8_END2END(qs8_dwconv_9p1c__scalar_imagic); -BENCHMARK_QS8_END2END(qs8_dwconv_9p2c__scalar_imagic); -BENCHMARK_QS8_END2END(qs8_dwconv_9p4c__scalar_imagic); - -BENCHMARK_QS8_END2END(qs8_dwconv_9p1c__scalar_lrintf); -BENCHMARK_QS8_END2END(qs8_dwconv_9p2c__scalar_lrintf); -BENCHMARK_QS8_END2END(qs8_dwconv_9p4c__scalar_lrintf); - -BENCHMARK_QS8_END2END(qs8_dwconv_25p1c__scalar_fmagic); -BENCHMARK_QS8_END2END(qs8_dwconv_25p2c__scalar_fmagic); -BENCHMARK_QS8_END2END(qs8_dwconv_25p4c__scalar_fmagic); - -BENCHMARK_QS8_END2END(qs8_dwconv_25p1c__scalar_imagic); -BENCHMARK_QS8_END2END(qs8_dwconv_25p2c__scalar_imagic); -BENCHMARK_QS8_END2END(qs8_dwconv_25p4c__scalar_imagic); - -BENCHMARK_QS8_END2END(qs8_dwconv_25p1c__scalar_lrintf); 
-BENCHMARK_QS8_END2END(qs8_dwconv_25p2c__scalar_lrintf); -BENCHMARK_QS8_END2END(qs8_dwconv_25p4c__scalar_lrintf); - -BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l1c1s1r__scalar_fmagic); -BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l2c1s1r__scalar_fmagic); -BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l4c1s1r__scalar_fmagic); -BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l1c1s1r__scalar_imagic); -BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l2c1s1r__scalar_imagic); -BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l4c1s1r__scalar_imagic); -BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l1c1s1r__scalar_lrintf); -BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l2c1s1r__scalar_lrintf); -BENCHMARK_QS8_END2END(qs8_dwconv_5f5m5l4c1s1r__scalar_lrintf); - -BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l1c1s1r__scalar_fmagic); -BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l2c1s1r__scalar_fmagic); -BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l4c1s1r__scalar_fmagic); -BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l1c1s1r__scalar_imagic); -BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l2c1s1r__scalar_imagic); -BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l4c1s1r__scalar_imagic); -BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l1c1s1r__scalar_lrintf); -BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l2c1s1r__scalar_lrintf); -BENCHMARK_QS8_END2END(qs8_dwconv_6f6m7l4c1s1r__scalar_lrintf); - -BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l1c1s1r__scalar_fmagic); -BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l2c1s1r__scalar_fmagic); -BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l4c1s1r__scalar_fmagic); -BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l1c1s1r__scalar_imagic); -BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l2c1s1r__scalar_imagic); -BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l4c1s1r__scalar_imagic); -BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l1c1s1r__scalar_lrintf); -BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l2c1s1r__scalar_lrintf); -BENCHMARK_QS8_END2END(qs8_dwconv_8f8m9l4c1s1r__scalar_lrintf); - - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/qs8-gemm-e2e.cc b/bench/qs8-gemm-e2e.cc deleted file mode 100644 
index ee31dea04d1..00000000000 --- a/bench/qs8-gemm-e2e.cc +++ /dev/null @@ -1,2541 +0,0 @@ -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include -#include - -#include -#include "bench/end2end.h" -#include "bench/utils.h" - -#include "xnnpack.h" -#include "xnnpack/config.h" -#include "xnnpack/gemm.h" -#include "xnnpack/igemm.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/models.h" -#include "xnnpack/pack.h" - - -static void GEMMEnd2EndBenchmark( - benchmark::State& state, - models::ExecutionPlanFactory model_factory, - xnn_qs8_qc8w_gemm_minmax_ukernel_fn gemm, - xnn_qs8_qc8w_igemm_minmax_ukernel_fn igemm, - xnn_qs8_qc8w_gemm_minmax_ukernel_fn gemm1, - xnn_qs8_qc8w_igemm_minmax_ukernel_fn igemm1, - xnn_init_qs8_qc8w_conv_minmax_params_fn init_params, - uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if (isa_check != nullptr && !isa_check(state)) { - return; - } - if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) { - state.SkipWithError("failed to initialize XNNPACK"); - return; - } - - struct xnn_gemm_config* gemm_config = xnn_init_qs8_qc8w_gemm_config(); - assert(gemm_config != nullptr); - - // Override microkernels chosen in xnn_initialize - std::memset(gemm_config, 0, sizeof(struct xnn_gemm_config)); - gemm_config->minmax.gemm[mr-1] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_fn(gemm)); - gemm_config->minmax.igemm[mr-1] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_fn(igemm)); - gemm_config->minmax.gemm[0] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_fn(gemm1)); - gemm_config->minmax.igemm[0] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_fn(igemm1)); - gemm_config->init.qs8_qc8w = init_params; - gemm_config->mr = mr; - gemm_config->nr = nr; - 
gemm_config->log2_kr = log2_kr; - gemm_config->log2_sr = log2_sr; - gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w; - gemm_config->pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w; - gemm_config->pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w; - gemm_config->pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w; - - auto execution_plan = model_factory(nullptr); - if (execution_plan.empty()) { - state.SkipWithError("failed to create a model"); - return; - } - - for (auto _ : state) { - for (const std::unique_ptr& op : execution_plan) { - xnn_status status = xnn_run_operator(op.get(), nullptr); - if (status != xnn_status_success) { - state.SkipWithError("failed to run a model"); - return; - } - } - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } -} - - -#if XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - static void qs8_qc8w_gemm_4x8c4__asm_aarch32_neondot_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c4__asm_aarch32_neondot_cortex_a55, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c4__asm_aarch32_neondot_cortex_a55, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c4__neondot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c4__neondot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckNEONDOT); - } - static void qs8_qc8w_gemm_4x8c4__asm_aarch32_neondot_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c4__asm_aarch32_neondot_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c4__asm_aarch32_neondot_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c4__neondot, - 
xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c4__neondot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckNEONDOT); - } - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x8c4__asm_aarch32_neondot_cortex_a55) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x8c4__asm_aarch32_neondot_ld64) -#endif // XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - - -#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - static void qs8_qc8w_gemm_4x8__asm_aarch32_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_4x8__asm_aarch32_neon_mlal_lane_cortex_a53_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53_prfm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_4x8__asm_aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - 
xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a7, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a7, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_4x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_4x8__asm_aarch32_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neon_mlal_lane_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neon_mlal_lane_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_4x8__asm_aarch32_neon_mlal_lane_ld64_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - 
xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neon_mlal_lane_ld64_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neon_mlal_lane_ld64_prfm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x8__asm_aarch32_neon_mlal_lane_cortex_a53_prfm) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x8__asm_aarch32_neon_mlal_lane_cortex_a53) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x8__asm_aarch32_neon_mlal_lane_cortex_a7) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x8__asm_aarch32_neon_mlal_lane_ld64_prfm) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x8__asm_aarch32_neon_mlal_lane_ld64) -#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - - -#if XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY - static void qs8_qc8w_gemm_4x16c4__asm_aarch64_neondot_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__asm_aarch64_neondot_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__neondot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckNEONDOT); - } - static void qs8_qc8w_gemm_4x16c4__asm_aarch64_neondot_ld32(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_ld32, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__neondot, - 
xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__asm_aarch64_neondot_ld32, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__neondot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckNEONDOT); - } - static void qs8_qc8w_gemm_4x16c4__asm_aarch64_neondot_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__asm_aarch64_neondot_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__neondot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckNEONDOT); - } - static void qs8_qc8w_gemm_4x16c4__asm_aarch64_neondot_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__asm_aarch64_neondot_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__neondot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckNEONDOT); - } - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x16c4__asm_aarch64_neondot_cortex_a55) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x16c4__asm_aarch64_neondot_ld32) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x16c4__asm_aarch64_neondot_ld64) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x16c4__asm_aarch64_neondot_ld128) -#endif // XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY - - -#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY - static void qs8_qc8w_gemm_4x16__asm_aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory 
model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a53, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a53, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_4x16__asm_aarch64_neon_mlal_lane_cortex_a53_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a53_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a53_prfm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_4x16__asm_aarch64_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16__asm_aarch64_neon_mlal_lane_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16__asm_aarch64_neon_mlal_lane_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_4x16__asm_aarch64_neon_mlal_lane_ld64_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16__asm_aarch64_neon_mlal_lane_ld64_prfm, - 
xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16__asm_aarch64_neon_mlal_lane_ld64_prfm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_2x8c8__asm_aarch64_neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, 8 /*nr=*/, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_2x8c8__asm_aarch64_neon_mlal_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal_prfm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal_prfm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, 8 /*nr=*/, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_2x8c8__asm_aarch64_neon_mlal_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal_cortex_a53, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal, - 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, 8 /*nr=*/, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_2x8c8__asm_aarch64_neon_mlal_cortex_a53_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal_cortex_a53_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal_cortex_a53_prfm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal_cortex_a53_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal_cortex_a53_prfm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, 8 /*nr=*/, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x16__asm_aarch64_neon_mlal_lane_cortex_a53_prfm) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x16__asm_aarch64_neon_mlal_lane_cortex_a53) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x16__asm_aarch64_neon_mlal_lane_ld64_prfm) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x16__asm_aarch64_neon_mlal_lane_ld64) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c8__asm_aarch64_neon_mlal_cortex_a53_prfm) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c8__asm_aarch64_neon_mlal_cortex_a53) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c8__asm_aarch64_neon_mlal_prfm) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c8__asm_aarch64_neon_mlal) -#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY - -#if XNN_ENABLE_ARM_I8MM && XNN_ARCH_ARM64 - static void qs8_qc8w_gemm_2x8c8__neoni8mm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c8__neoni8mm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c8__neoni8mm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__neoni8mm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__neoni8mm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, 8 /*nr=*/, /*log2_kr=*/3, 
/*log2_sr=*/0, - benchmark::utils::CheckNEONI8MM); - } - static void qs8_qc8w_gemm_4x8c8__neoni8mm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__neoni8mm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__neoni8mm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__neoni8mm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__neoni8mm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, 8 /*nr=*/, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckNEONI8MM); - } - static void qs8_qc8w_gemm_6x8c8__neoni8mm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_6x8c8__neoni8mm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_6x8c8__neoni8mm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__neoni8mm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__neoni8mm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/6, 8 /*nr=*/, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckNEONI8MM); - } - static void qs8_qc8w_gemm_8x8c8__neoni8mm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_8x8c8__neoni8mm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_8x8c8__neoni8mm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__neoni8mm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__neoni8mm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/8, 8 /*nr=*/, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckNEONI8MM); - } - static void qs8_qc8w_gemm_2x16c8__neoni8mm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x16c8__neoni8mm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x16c8__neoni8mm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__neoni8mm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__neoni8mm, - 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/16, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckNEONI8MM); - } - static void qs8_qc8w_gemm_4x16c8__neoni8mm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c8__neoni8mm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c8__neoni8mm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__neoni8mm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__neoni8mm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckNEONI8MM); - } - static void qs8_qc8w_gemm_6x16c8__neoni8mm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_6x16c8__neoni8mm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_6x16c8__neoni8mm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__neoni8mm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__neoni8mm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/6, /*nr=*/16, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckNEONI8MM); - } - static void qs8_qc8w_gemm_8x16c8__neoni8mm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_8x16c8__neoni8mm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_8x16c8__neoni8mm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__neoni8mm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__neoni8mm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/8, /*nr=*/16, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckNEONI8MM); - } - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c8__neoni8mm); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x8c8__neoni8mm); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_6x8c8__neoni8mm); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_8x8c8__neoni8mm); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x16c8__neoni8mm); - 
BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x16c8__neoni8mm); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_6x16c8__neoni8mm); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_8x16c8__neoni8mm); -#endif // XNN_ENABLE_ARM_I8MM && XNN_ARCH_ARM64 - -#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - static void qs8_qc8w_gemm_4x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c4__neondot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c4__neondot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c4__neondot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c4__neondot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, 8 /*nr=*/, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckNEONDOT); - } - static void qs8_qc8w_gemm_6x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_6x8c4__neondot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_6x8c4__neondot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c4__neondot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c4__neondot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/6, 8 /*nr=*/, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckNEONDOT); - } - static void qs8_qc8w_gemm_8x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_8x8c4__neondot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_8x8c4__neondot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c4__neondot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c4__neondot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/8, 8 /*nr=*/, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckNEONDOT); - } - static void qs8_qc8w_gemm_4x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - 
xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__neondot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__neondot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__neondot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__neondot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckNEONDOT); - } - static void qs8_qc8w_gemm_6x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_6x16c4__neondot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_6x16c4__neondot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__neondot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__neondot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/6, /*nr=*/16, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckNEONDOT); - } - static void qs8_qc8w_gemm_8x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_8x16c4__neondot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_8x16c4__neondot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__neondot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__neondot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/8, /*nr=*/16, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckNEONDOT); - } - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x8c4__neondot); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_6x8c4__neondot); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_8x8c4__neondot); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x16c4__neondot); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_6x16c4__neondot); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_8x16c4__neondot); -#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - static void qs8_qc8w_gemm_2x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, 
model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8__neon_mlal_lane, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8__neon_mlal_lane, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, 8 /*nr=*/, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_2x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_3x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, 8 /*nr=*/, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_3x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - 
benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_4x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, 8 /*nr=*/, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_4x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_6x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/6, 8 /*nr=*/, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_6x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, - 
xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/6, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_2x8__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8__neon_mlal_lane_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8__neon_mlal_lane_prfm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane_prfm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, 8 /*nr=*/, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_2x16__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane_prfm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_3x8__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane_prfm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, 8 /*nr=*/, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_3x16__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory 
model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16__neon_mlal_lane_prfm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane_prfm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_4x8__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane_prfm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, 8 /*nr=*/, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_4x16__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane_prfm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_6x8__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane_prfm, - 
xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane_prfm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/6, 8 /*nr=*/, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_6x16__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane_prfm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/6, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_2x8c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, 8 /*nr=*/, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_2x8c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld1r, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld1r, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, 8 /*nr=*/, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_2x8c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, 
model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, 8 /*nr=*/, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_2x8c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, 8 /*nr=*/, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_2x8c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, 8 /*nr=*/, /*log2_kr=*/1, /*log2_sr=*/2, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_2x8c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, 8 /*nr=*/, /*log2_kr=*/2, /*log2_sr=*/0, - 
benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_2x8c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, 8 /*nr=*/, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_2x8c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, 8 /*nr=*/, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qs8_qc8w_gemm_2x8c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, 8 /*nr=*/, /*log2_kr=*/2, /*log2_sr=*/1, - benchmark::utils::CheckNEON); - } - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c4__neon_mlal_dup); - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c4__neon_mlal_ld1r); - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c4__neon_mlal_ld2r); - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c4s2__neon_mlal); - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c2__neon_mlal_dup); - - 
BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c2__neon_mlal_ld1r); - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c2__neon_mlal_ld2r); - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c2s4__neon_mlal); - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8__neon_mlal_lane); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x16__neon_mlal_lane); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x8__neon_mlal_lane); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x16__neon_mlal_lane); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x8__neon_mlal_lane); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x16__neon_mlal_lane); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_6x8__neon_mlal_lane); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_6x16__neon_mlal_lane); - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8__neon_mlal_lane_prfm); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x16__neon_mlal_lane_prfm); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x8__neon_mlal_lane_prfm); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x16__neon_mlal_lane_prfm); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x8__neon_mlal_lane_prfm); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x16__neon_mlal_lane_prfm); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_6x8__neon_mlal_lane_prfm); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_6x16__neon_mlal_lane_prfm); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM - static void qs8_qc8w_gemm_1x1c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x1c4__armsimd32, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x1c4__armsimd32, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x1c4__armsimd32, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x1c4__armsimd32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/1, 1 /*nr=*/, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckARMV6); - } - static void qs8_qc8w_gemm_2x1c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x1c4__armsimd32, - 
xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x1c4__armsimd32, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x1c4__armsimd32, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x1c4__armsimd32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, 1 /*nr=*/, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckARMV6); - } - static void qs8_qc8w_gemm_1x2c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x2c4__armsimd32, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x2c4__armsimd32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/1, 2 /*nr=*/, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckARMV6); - } - static void qs8_qc8w_gemm_2x2c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x2c4__armsimd32, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x2c4__armsimd32, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, 2 /*nr=*/, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckARMV6); - } - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_1x1c4__armsimd32); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x1c4__armsimd32); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_1x2c4__armsimd32); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x2c4__armsimd32); -#endif // XNN_ARCH_ARM - -#if XNN_ENABLE_AVXVNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) - - static void qs8_qc8w_gemm_1x8c8__avxvnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avxvnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avxvnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avxvnni, - 
xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avxvnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/1, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVXVNNI); - } - static void qs8_qc8w_gemm_2x8c8__avxvnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c8__avxvnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c8__avxvnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avxvnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avxvnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVXVNNI); - } - static void qs8_qc8w_gemm_3x8c8__avxvnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x8c8__avxvnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x8c8__avxvnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avxvnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avxvnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVXVNNI); - } - static void qs8_qc8w_gemm_4x8c8__avxvnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__avxvnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__avxvnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avxvnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avxvnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVXVNNI); - } - static void qs8_qc8w_gemm_5x8c8__avxvnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_5x8c8__avxvnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_5x8c8__avxvnni, - 
xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avxvnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avxvnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/5, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVXVNNI); - } - static void qs8_qc8w_gemm_5x8c8__avxvnni_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_5x8c8__avxvnni_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_5x8c8__avxvnni_prfm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avxvnni_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avxvnni_prfm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/5, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVXVNNI); - } - static void qs8_qc8w_gemm_6x8c8__avxvnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_6x8c8__avxvnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_6x8c8__avxvnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avxvnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avxvnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/6, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVXVNNI); - } - static void qs8_qc8w_gemm_7x8c8__avxvnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x8c8__avxvnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x8c8__avxvnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avxvnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avxvnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/7, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVXVNNI); - } - static void qs8_qc8w_gemm_8x8c8__avxvnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - 
xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_8x8c8__avxvnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_8x8c8__avxvnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avxvnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avxvnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/8, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVXVNNI); - } - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_1x8c8__avxvnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c8__avxvnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x8c8__avxvnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x8c8__avxvnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_5x8c8__avxvnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_5x8c8__avxvnni_prfm); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_6x8c8__avxvnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_7x8c8__avxvnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_8x8c8__avxvnni); -#endif // XNN_ENABLE_AVXVNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) - -#if XNN_ENABLE_AVXVNNIINT8 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) - static void qs8_qc8w_gemm_5x8c8__avxvnniint8_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_5x8c8__avxvnniint8_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_5x8c8__avxvnniint8_prfm, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avxvnniint8_prfm, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avxvnniint8_prfm, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/5, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVXVNNIINT8); - } - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_5x8c8__avxvnniint8_prfm); -#endif // XNN_ENABLE_AVXVNNIINT8 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - static void qs8_qc8w_gemm_1x16c4__avx512vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__avx512vnni, - 
xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__avx512vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__avx512vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/1, /*nr=*/16, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckAVX512VNNI); - } - static void qs8_qc8w_gemm_4x16c4__avx512vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__avx512vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__avx512vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckAVX512VNNI); - } - static void qs8_qc8w_gemm_5x16c4__avx512vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_5x16c4__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_5x16c4__avx512vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__avx512vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/5, /*nr=*/16, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckAVX512VNNI); - } - static void qs8_qc8w_gemm_7x16c4__avx512vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x16c4__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x16c4__avx512vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__avx512vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/7, /*nr=*/16, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckAVX512VNNI); - } - static void qs8_qc8w_gemm_8x16c4__avx512vnni(benchmark::State& 
state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_8x16c4__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_8x16c4__avx512vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__avx512vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/8, /*nr=*/16, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckAVX512VNNI); - } - static void qs8_qc8w_gemm_9x16c4__avx512vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_9x16c4__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_9x16c4__avx512vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__avx512vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/9, /*nr=*/16, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckAVX512VNNI); - } - static void qs8_qc8w_gemm_10x16c4__avx512vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_10x16c4__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_10x16c4__avx512vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__avx512vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/10, /*nr=*/16, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckAVX512VNNI); - } - static void qs8_qc8w_gemm_12x16c4__avx512vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_12x16c4__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_12x16c4__avx512vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__avx512vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - 
/*mr=*/12, /*nr=*/16, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckAVX512VNNI); - } - static void qs8_qc8w_gemm_14x16c4__avx512vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_14x16c4__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_14x16c4__avx512vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__avx512vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/14, /*nr=*/16, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckAVX512VNNI); - } - static void qs8_qc8w_gemm_1x16c8__avx512vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/1, /*nr=*/16, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX512VNNI); - } - static void qs8_qc8w_gemm_5x16c8__avx512vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_5x16c8__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_5x16c8__avx512vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/5, /*nr=*/16, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX512VNNI); - } - static void qs8_qc8w_gemm_7x16c8__avx512vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x16c8__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x16c8__avx512vnni, - 
xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/7, /*nr=*/16, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX512VNNI); - } - static void qs8_qc8w_gemm_8x16c8__avx512vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_8x16c8__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_8x16c8__avx512vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/8, /*nr=*/16, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX512VNNI); - } - static void qs8_qc8w_gemm_9x16c8__avx512vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_9x16c8__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_9x16c8__avx512vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/9, /*nr=*/16, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX512VNNI); - } - static void qs8_qc8w_gemm_10x16c8__avx512vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_10x16c8__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_10x16c8__avx512vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/10, /*nr=*/16, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX512VNNI); - } - static void qs8_qc8w_gemm_12x16c8__avx512vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - 
GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_12x16c8__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_12x16c8__avx512vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/12, /*nr=*/16, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX512VNNI); - } - static void qs8_qc8w_gemm_14x16c8__avx512vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_14x16c8__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_14x16c8__avx512vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/14, /*nr=*/16, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX512VNNI); - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#if XNN_ENABLE_AVX256VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) - static void qs8_qc8w_gemm_1x8c8__avx256vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx256vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx256vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx256vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx256vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/1, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX256VNNI); - } - static void qs8_qc8w_gemm_5x8c8__avx256vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_5x8c8__avx256vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_5x8c8__avx256vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx256vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx256vnni, - 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/5, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX256VNNI); - } - static void qs8_qc8w_gemm_7x8c8__avx256vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x8c8__avx256vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x8c8__avx256vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx256vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx256vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/7, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX256VNNI); - } - static void qs8_qc8w_gemm_8x8c8__avx256vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_8x8c8__avx256vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_8x8c8__avx256vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx256vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx256vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/8, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX256VNNI); - } - static void qs8_qc8w_gemm_9x8c8__avx256vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_9x8c8__avx256vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_9x8c8__avx256vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx256vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx256vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/9, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX256VNNI); - } - static void qs8_qc8w_gemm_10x8c8__avx256vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_10x8c8__avx256vnni, - 
xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_10x8c8__avx256vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx256vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx256vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/10, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX256VNNI); - } - static void qs8_qc8w_gemm_12x8c8__avx256vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_12x8c8__avx256vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_12x8c8__avx256vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx256vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx256vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/12, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX256VNNI); - } - static void qs8_qc8w_gemm_14x8c8__avx256vnni(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_14x8c8__avx256vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_14x8c8__avx256vnni, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx256vnni, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx256vnni, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/14, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX256VNNI); - } -#endif // XNN_ENABLE_AVX256VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - static void qs8_qc8w_gemm_1x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512skx, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512skx, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/1, /*nr=*/16, /*log2_kr=*/3, /*log2_sr=*/0, - 
benchmark::utils::CheckAVX512F); - } - static void qs8_qc8w_gemm_5x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_5x16c8__avx512skx, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_5x16c8__avx512skx, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512skx, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/5, /*nr=*/16, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX512F); - } - static void qs8_qc8w_gemm_7x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x16c8__avx512skx, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x16c8__avx512skx, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512skx, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/7, /*nr=*/16, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX512F); - } - static void qs8_qc8w_gemm_8x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_8x16c8__avx512skx, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_8x16c8__avx512skx, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512skx, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/8, /*nr=*/16, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX512F); - } - static void qs8_qc8w_gemm_2x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c8__avx2, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c8__avx2, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx2, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx2, - 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX2); - } - static void qs8_qc8w_gemm_3x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x8c8__avx2, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x8c8__avx2, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx2, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx2, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX2); - } - static void qs8_qc8w_gemm_2x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void qs8_qc8w_gemm_2x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void qs8_qc8w_gemm_3x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, - 
xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void qs8_qc8w_gemm_3x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void qs8_qc8w_gemm_4x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void qs8_qc8w_gemm_4x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void qs8_qc8w_gemm_2x4c2s4__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64, - 
xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2, - benchmark::utils::CheckAVX); - } - static void qs8_qc8w_gemm_2x4c2s4__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2, - benchmark::utils::CheckAVX); - } - static void qs8_qc8w_gemm_3x4c2s4__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c2s4__avx_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c2s4__avx_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2, - benchmark::utils::CheckAVX); - } - static void qs8_qc8w_gemm_3x4c2s4__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c2s4__avx_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c2s4__avx_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2, - benchmark::utils::CheckAVX); - } - static void qs8_qc8w_gemm_4x4c2s4__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory 
model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2, - benchmark::utils::CheckAVX); - } - static void qs8_qc8w_gemm_4x4c2s4__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c2s4__avx_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2, - benchmark::utils::CheckAVX); - } - static void qs8_qc8w_gemm_2x4c8__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void qs8_qc8w_gemm_2x4c8__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void 
qs8_qc8w_gemm_3x4c8__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void qs8_qc8w_gemm_3x4c8__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void qs8_qc8w_gemm_2x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckSSE41); - } - static void qs8_qc8w_gemm_2x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, 
/*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckSSE41); - } - static void qs8_qc8w_gemm_3x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckSSE41); - } - static void qs8_qc8w_gemm_3x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckSSE41); - } - static void qs8_qc8w_gemm_4x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckSSE41); - } - static void qs8_qc8w_gemm_4x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, - 
xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckSSE41); - } - static void qs8_qc8w_gemm_2x4c2s4__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2, - benchmark::utils::CheckSSE41); - } - static void qs8_qc8w_gemm_2x4c2s4__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2, - benchmark::utils::CheckSSE41); - } - static void qs8_qc8w_gemm_3x4c2s4__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2, - benchmark::utils::CheckSSE41); - } - static void qs8_qc8w_gemm_3x4c2s4__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - 
xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2, - benchmark::utils::CheckSSE41); - } - static void qs8_qc8w_gemm_4x4c2s4__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2, - benchmark::utils::CheckSSE41); - } - static void qs8_qc8w_gemm_4x4c2s4__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2, - benchmark::utils::CheckSSE41); - } - static void qs8_qc8w_gemm_2x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckSSE41); - } - static 
void qs8_qc8w_gemm_2x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckSSE41); - } - static void qs8_qc8w_gemm_3x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckSSE41); - } - static void qs8_qc8w_gemm_3x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckSSE41); - } - static void qs8_qc8w_gemm_2x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, - 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0); - } - static void qs8_qc8w_gemm_2x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0); - } - static void qs8_qc8w_gemm_3x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0); - } - static void qs8_qc8w_gemm_3x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0); - } - static void qs8_qc8w_gemm_4x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, - 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0); - } - static void qs8_qc8w_gemm_4x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0); - } - static void qs8_qc8w_gemm_2x4c2s4__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2); - } - static void qs8_qc8w_gemm_2x4c2s4__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2); - } - static void qs8_qc8w_gemm_3x4c2s4__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64, - 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2); - } - static void qs8_qc8w_gemm_3x4c2s4__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2); - } - static void qs8_qc8w_gemm_4x4c2s4__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c2s4__sse2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c2s4__sse2_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2); - } - static void qs8_qc8w_gemm_4x4c2s4__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c2s4__sse2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c2s4__sse2_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2); - } - static void qs8_qc8w_gemm_2x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, - 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0); - } - static void qs8_qc8w_gemm_2x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0); - } - static void qs8_qc8w_gemm_3x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0); - } - static void qs8_qc8w_gemm_3x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0); - } - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_1x16c8__avx512vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_5x16c8__avx512vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_7x16c8__avx512vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_8x16c8__avx512vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_9x16c8__avx512vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_10x16c8__avx512vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_12x16c8__avx512vnni); - 
BENCHMARK_QS8_END2END(qs8_qc8w_gemm_14x16c8__avx512vnni); - -#if XNN_ENABLE_AVX256VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_1x8c8__avx256vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_5x8c8__avx256vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_7x8c8__avx256vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_8x8c8__avx256vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_9x8c8__avx256vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_10x8c8__avx256vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_12x8c8__avx256vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_14x8c8__avx256vnni); -#endif // XNN_ENABLE_AVX256VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_1x16c4__avx512vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x16c4__avx512vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_5x16c4__avx512vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_7x16c4__avx512vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_8x16c4__avx512vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_9x16c4__avx512vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_10x16c4__avx512vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_12x16c4__avx512vnni); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_14x16c4__avx512vnni); - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_1x16c8__avx512skx); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_5x16c8__avx512skx); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_7x16c8__avx512skx); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_8x16c8__avx512skx); - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c8__avx2); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x8c8__avx2); - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c2__avx_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c2__avx_ld128); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c2__avx_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c2__avx_ld128); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4c2__avx_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4c2__avx_ld128); - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c2s4__avx_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c2s4__avx_ld128); - 
BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c2s4__avx_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c2s4__avx_ld128); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4c2s4__avx_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4c2s4__avx_ld128); - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c8__avx_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c8__avx_ld128); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c8__avx_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c8__avx_ld128); - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c2__sse41_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c2__sse41_ld128); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c2__sse41_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c2__sse41_ld128); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4c2__sse41_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4c2__sse41_ld128); - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c2s4__sse41_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c2s4__sse41_ld128); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c2s4__sse41_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c2s4__sse41_ld128); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4c2s4__sse41_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4c2s4__sse41_ld128); - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c8__sse41_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c8__sse41_ld128); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c8__sse41_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c8__sse41_ld128); - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c2__sse2_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c2__sse2_ld128); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c2__sse2_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c2__sse2_ld128); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4c2__sse2_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4c2__sse2_ld128); - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c2s4__sse2_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c2s4__sse2_ld128); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c2s4__sse2_ld64); - 
BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c2s4__sse2_ld128); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4c2s4__sse2_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4c2s4__sse2_ld128); - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c8__sse2_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c8__sse2_ld128); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c8__sse2_ld64); - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c8__sse2_ld128); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMRELAXEDSIMD - static void qs8_qc8w_gemm_2x4c16__wasmsdot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c16__wasmsdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c16__wasmsdot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c16__wasmsdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c16__wasmsdot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/4, /*log2_sr=*/0, - benchmark::utils::CheckWAsmSDOT); - } - static void qs8_qc8w_gemm_3x4c16__wasmsdot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c16__wasmsdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c16__wasmsdot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c16__wasmsdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c16__wasmsdot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/4, /*log2_sr=*/0, - benchmark::utils::CheckWAsmSDOT); - } - static void qs8_qc8w_gemm_4x4c16__wasmsdot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c16__wasmsdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c16__wasmsdot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c16__wasmsdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c16__wasmsdot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/4, 
/*log2_sr=*/0, - benchmark::utils::CheckWAsmSDOT); - } - - static void qs8_qc8w_gemm_2x8c16__wasmsdot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c16__wasmsdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c16__wasmsdot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c16__wasmsdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c16__wasmsdot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/8, /*log2_kr=*/4, /*log2_sr=*/0, - benchmark::utils::CheckWAsmSDOT); - } - static void qs8_qc8w_gemm_3x8c16__wasmsdot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x8c16__wasmsdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x8c16__wasmsdot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c16__wasmsdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c16__wasmsdot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/8, /*log2_kr=*/4, /*log2_sr=*/0, - benchmark::utils::CheckWAsmSDOT); - } - static void qs8_qc8w_gemm_4x8c16__wasmsdot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c16__wasmsdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c16__wasmsdot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c16__wasmsdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c16__wasmsdot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/4, /*log2_sr=*/0, - benchmark::utils::CheckWAsmSDOT); - } - - static void qs8_qc8w_gemm_2x4c16__wasmusdot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c16__wasmusdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c16__wasmusdot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c16__wasmusdot, - 
xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c16__wasmusdot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/4, /*log2_sr=*/0, - benchmark::utils::CheckWAsmUSDOT); - } - static void qs8_qc8w_gemm_3x4c16__wasmusdot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c16__wasmusdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c16__wasmusdot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c16__wasmusdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c16__wasmusdot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/4, /*log2_sr=*/0, - benchmark::utils::CheckWAsmUSDOT); - } - static void qs8_qc8w_gemm_4x4c16__wasmusdot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c16__wasmusdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c16__wasmusdot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c16__wasmusdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c16__wasmusdot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/4, /*log2_sr=*/0, - benchmark::utils::CheckWAsmUSDOT); - } - - static void qs8_qc8w_gemm_2x8c16__wasmusdot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c16__wasmusdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c16__wasmusdot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c16__wasmusdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c16__wasmusdot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/8, /*log2_kr=*/4, /*log2_sr=*/0, - benchmark::utils::CheckWAsmUSDOT); - } - static void qs8_qc8w_gemm_3x8c16__wasmusdot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x8c16__wasmusdot, - 
xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x8c16__wasmusdot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c16__wasmusdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c16__wasmusdot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/8, /*log2_kr=*/4, /*log2_sr=*/0, - benchmark::utils::CheckWAsmUSDOT); - } - static void qs8_qc8w_gemm_4x8c16__wasmusdot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c16__wasmusdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c16__wasmusdot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c16__wasmusdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c16__wasmusdot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/4, /*log2_sr=*/0, - benchmark::utils::CheckWAsmUSDOT); - } - - static void qs8_qc8w_gemm_2x8c8__wasmusdot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c8__wasmusdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c8__wasmusdot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmusdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__wasmusdot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckWAsmUSDOT); - } - static void qs8_qc8w_gemm_3x8c8__wasmusdot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x8c8__wasmusdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x8c8__wasmusdot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmusdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__wasmusdot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckWAsmUSDOT); - } - static void qs8_qc8w_gemm_4x8c8__wasmusdot(benchmark::State& state, models::ExecutionPlanFactory 
model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__wasmusdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__wasmusdot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmusdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__wasmusdot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckWAsmUSDOT); - } - - static void qs8_qc8w_gemm_2x8c8__wasmusdot_u2(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c8__wasmusdot_u2, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c8__wasmusdot_u2, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmusdot_u2, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__wasmusdot_u2, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckWAsmUSDOT); - } - static void qs8_qc8w_gemm_3x8c8__wasmusdot_u2(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x8c8__wasmusdot_u2, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x8c8__wasmusdot_u2, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmusdot_u2, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__wasmusdot_u2, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckWAsmUSDOT); - } - static void qs8_qc8w_gemm_4x8c8__wasmusdot_u2(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__wasmusdot_u2, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__wasmusdot_u2, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmusdot_u2, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__wasmusdot_u2, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/3, 
/*log2_sr=*/0, - benchmark::utils::CheckWAsmUSDOT); - } - - static void qs8_qc8w_gemm_2x8c8__wasmsdot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c8__wasmsdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c8__wasmsdot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmsdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__wasmsdot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckWAsmSDOT); - } - static void qs8_qc8w_gemm_3x8c8__wasmsdot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x8c8__wasmsdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x8c8__wasmsdot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmsdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__wasmsdot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckWAsmSDOT); - } - static void qs8_qc8w_gemm_4x8c8__wasmsdot(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__wasmsdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__wasmsdot, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmsdot, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__wasmsdot, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckWAsmSDOT); - } - - static void qs8_qc8w_gemm_2x8c8__wasmsdot_u2(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c8__wasmsdot_u2, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c8__wasmsdot_u2, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmsdot_u2, - 
xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__wasmsdot_u2, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckWAsmSDOT); - } - static void qs8_qc8w_gemm_3x8c8__wasmsdot_u2(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x8c8__wasmsdot_u2, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x8c8__wasmsdot_u2, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmsdot_u2, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__wasmsdot_u2, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckWAsmSDOT); - } - static void qs8_qc8w_gemm_4x8c8__wasmsdot_u2(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__wasmsdot_u2, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__wasmsdot_u2, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmsdot_u2, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__wasmsdot_u2, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckWAsmSDOT); - } - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c16__wasmsdot) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c16__wasmsdot) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4c16__wasmsdot) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c16__wasmsdot) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x8c16__wasmsdot) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x8c16__wasmsdot) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c16__wasmusdot) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c16__wasmusdot) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4c16__wasmusdot) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c16__wasmusdot) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x8c16__wasmusdot) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x8c16__wasmusdot) - - 
BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c8__wasmusdot) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x8c8__wasmusdot) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x8c8__wasmusdot) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c8__wasmusdot_u2) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x8c8__wasmusdot_u2) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x8c8__wasmusdot_u2) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c8__wasmsdot) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x8c8__wasmsdot) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x8c8__wasmsdot) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x8c8__wasmsdot_u2) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x8c8__wasmsdot_u2) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x8c8__wasmsdot_u2) -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - static void qs8_qc8w_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1); - } - static void qs8_qc8w_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1); - } - static void qs8_qc8w_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, 
model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1); - } - static void qs8_qc8w_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1); - } - static void qs8_qc8w_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1); - } - static void qs8_qc8w_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1); - } - 
static void qs8_qc8w_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2); - } - static void qs8_qc8w_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2); - } - static void qs8_qc8w_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2); - } - static void qs8_qc8w_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, - 
xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2); - } - static void qs8_qc8w_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2); - } - static void qs8_qc8w_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2); - } - static void qs8_qc8w_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/3); - } - static void qs8_qc8w_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, 
models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/3); - } - static void qs8_qc8w_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/3); - } - static void qs8_qc8w_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/3); - } - static void qs8_qc8w_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, - 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/3); - } - static void qs8_qc8w_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/3); - } - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c2__wasmsimd_dot16x2_ld64) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c2__wasmsimd_dot16x2_ld128) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c2__wasmsimd_dot16x2_ld64) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c2__wasmsimd_dot16x2_ld128) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4c2__wasmsimd_dot16x2_ld64) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4c2__wasmsimd_dot16x2_ld128) - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c2s4__wasmsimd_dot16x2_ld64) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c2s4__wasmsimd_dot16x2_ld128) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c2s4__wasmsimd_dot16x2_ld64) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c2s4__wasmsimd_dot16x2_ld128) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4c2s4__wasmsimd_dot16x2_ld64) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4c2s4__wasmsimd_dot16x2_ld128) - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c8__wasmsimd_dot16x2_ld64) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4c8__wasmsimd_dot16x2_ld128) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c8__wasmsimd_dot16x2_ld64) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4c8__wasmsimd_dot16x2_ld128) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4c8__wasmsimd_dot16x2_ld64) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4c8__wasmsimd_dot16x2_ld128) -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || 
XNN_ARCH_WASMRELAXEDSIMD - static void qs8_qc8w_gemm_2x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x2__wasm_fmagic, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/2); - } - static void qs8_qc8w_gemm_3x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x2__wasm_fmagic, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/2); - } - static void qs8_qc8w_gemm_4x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x2__wasm_fmagic, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/2); - } - static void qs8_qc8w_gemm_2x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4__wasm_fmagic, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4); - } - static void qs8_qc8w_gemm_3x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, 
model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4__wasm_fmagic, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4); - } - static void qs8_qc8w_gemm_4x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4); - } - - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x2__wasm_fmagic) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x2__wasm_fmagic) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x2__wasm_fmagic) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4__wasm_fmagic) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4__wasm_fmagic) - BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4__wasm_fmagic) -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -static void qs8_qc8w_gemm_2x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x2__scalar_fmagic, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/2); -} -static void qs8_qc8w_gemm_3x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x2__scalar_fmagic, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, - 
xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/2); -} -static void qs8_qc8w_gemm_4x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x2__scalar_fmagic, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/2); -} -static void qs8_qc8w_gemm_2x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4__scalar_fmagic, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4); -} -static void qs8_qc8w_gemm_3x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4__scalar_fmagic, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4); -} -static void qs8_qc8w_gemm_4x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4__scalar_fmagic, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4); 
-} - -static void qs8_qc8w_gemm_2x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x2__scalar_imagic, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x2__scalar_imagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x2__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/2); -} -static void qs8_qc8w_gemm_3x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x2__scalar_imagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x2__scalar_imagic, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x2__scalar_imagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x2__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/2); -} -static void qs8_qc8w_gemm_4x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x2__scalar_imagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x2__scalar_imagic, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x2__scalar_imagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x2__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/2); -} -static void qs8_qc8w_gemm_2x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4__scalar_imagic, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4); -} -static void qs8_qc8w_gemm_3x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - 
GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4__scalar_imagic, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4); -} -static void qs8_qc8w_gemm_4x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4__scalar_imagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4__scalar_imagic, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4__scalar_imagic, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4); -} - -static void qs8_qc8w_gemm_2x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x2__scalar_lrintf, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x2__scalar_lrintf, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/2); -} -static void qs8_qc8w_gemm_3x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x2__scalar_lrintf, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/2); -} -static void qs8_qc8w_gemm_4x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, - 
xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x2__scalar_lrintf, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/2); -} -static void qs8_qc8w_gemm_2x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4__scalar_lrintf, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4); -} -static void qs8_qc8w_gemm_3x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4); -} -static void qs8_qc8w_gemm_4x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4__scalar_lrintf, - xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, - xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf, - xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4); -} - -BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x2__scalar_fmagic) -BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x2__scalar_fmagic) -BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x2__scalar_fmagic) -BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4__scalar_fmagic) -BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4__scalar_fmagic) -BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4__scalar_fmagic) 
- -BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x2__scalar_imagic) -BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x2__scalar_imagic) -BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x2__scalar_imagic) -BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4__scalar_imagic) -BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4__scalar_imagic) -BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4__scalar_imagic) - -BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x2__scalar_lrintf) -BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x2__scalar_lrintf) -BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x2__scalar_lrintf) -BENCHMARK_QS8_END2END(qs8_qc8w_gemm_2x4__scalar_lrintf) -BENCHMARK_QS8_END2END(qs8_qc8w_gemm_3x4__scalar_lrintf) -BENCHMARK_QS8_END2END(qs8_qc8w_gemm_4x4__scalar_lrintf) - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/qu8-dwconv-e2e.cc b/bench/qu8-dwconv-e2e.cc deleted file mode 100644 index d39b372d5d5..00000000000 --- a/bench/qu8-dwconv-e2e.cc +++ /dev/null @@ -1,1453 +0,0 @@ -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include -#include -#include - -#include "bench/end2end.h" -#include "bench/utils.h" -#include - -#include "xnnpack.h" -#include "xnnpack/config.h" -#include "xnnpack/dwconv.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/models.h" -#include "xnnpack/params.h" - - -static void DWConvEnd2EndBenchmark( - benchmark::State& state, - models::ExecutionPlanFactory model_factory, - xnn_qu8_dwconv_minmax_unipass_ukernel_fn dwconv, - xnn_init_qu8_conv_minmax_params_fn init_params, - uint8_t channel_tile, uint8_t primary_tile, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if (isa_check != nullptr && !isa_check(state)) { - return; - } - if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) { - state.SkipWithError("failed to initialize XNNPACK"); - return; - } - - struct xnn_dwconv_config* dwconv_config = xnn_init_qu8_dwconv_config(); - if (dwconv_config == nullptr) { - state.SkipWithError("failed to initialize QU8 DWCONV config"); - return; - } - - // Save dwconv_config so that we can modify it for the benchmark and later restore it. - struct xnn_dwconv_config saved_dwconv_params[XNN_MAX_QU8_DWCONV_UKERNELS]; - memcpy(saved_dwconv_params, dwconv_config, sizeof(saved_dwconv_params)); - - // Override microkernels chosen in xnn_initialize - for (size_t i = 0; i < XNN_MAX_QU8_DWCONV_UKERNELS; i++) { - // Replace only the microkernel the matching kernel size. - if (dwconv_config[i].primary_tile == primary_tile) { - // Note: do not directly assign to dwconv_config[i] because it breaks older gcc. 
- dwconv_config[i].minmax.unipass = xnn_dwconv_unipass_ukernel_fn(dwconv); - dwconv_config[i].channel_tile = channel_tile; - dwconv_config[i].channel_subtile = channel_tile; - dwconv_config[i].channel_round = 1; - dwconv_config[i].primary_tile = primary_tile; - dwconv_config[i].init.qu8 = init_params; - break; - } - } - - auto execution_plan = model_factory(nullptr); - if (execution_plan.empty()) { - state.SkipWithError("failed to create a model"); - return; - } - - for (auto _ : state) { - for (const std::unique_ptr& op : execution_plan) { - xnn_status status = xnn_run_operator(op.get(), nullptr); - if (status != xnn_status_success) { - state.SkipWithError("failed to run a model"); - return; - } - } - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - // Restore dwconv_config to original state as defined in init.c. - memcpy(dwconv_config, saved_dwconv_params, sizeof(saved_dwconv_params)); -} - -static void DWConvEnd2EndBenchmark( - benchmark::State& state, - models::ExecutionPlanFactory model_factory, - xnn_qu8_dwconv_minmax_multipass_ukernel_fn dwconv, - xnn_init_qu8_conv_minmax_params_fn init_params, - uint8_t channel_tile, uint8_t channel_subtile, uint8_t channel_round, - uint8_t primary_tile, uint8_t middle_tile, uint8_t last_tile, - uint8_t primary_tile_to_replace, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if (isa_check != nullptr && !isa_check(state)) { - return; - } - if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) { - state.SkipWithError("failed to initialize XNNPACK"); - return; - } - - struct xnn_dwconv_config* dwconv_config = xnn_init_qu8_dwconv_config(); - if (dwconv_config == nullptr) { - state.SkipWithError("failed to initialize qu8 DWCONV config"); - return; - } - - // Save dwconv_config so that we can modify it for the benchmark and later restore it. 
- struct xnn_dwconv_config saved_dwconv_params[XNN_MAX_QU8_DWCONV_UKERNELS]; - memcpy(saved_dwconv_params, dwconv_config, sizeof(saved_dwconv_params)); - - bool found = false; - for (size_t i = 0; i < XNN_MAX_QU8_DWCONV_UKERNELS; i++) { - if (dwconv_config[i].primary_tile == primary_tile_to_replace) { - found = true; - } else if (dwconv_config[i].last_tile != 0) { - // Found a multipass microkernel, replace it. - found = true; - } - } - - if (!found) { - state.SkipWithError("can't replace with multipass"); - return; - } - - // Override microkernels chosen in xnn_initialize - for (size_t i = 0; i < XNN_MAX_QU8_DWCONV_UKERNELS; i++) { - // Replace only the microkernel the matching kernel size. - if (dwconv_config[i].primary_tile == primary_tile_to_replace || - dwconv_config[i].last_tile != 0) { - // Replace either when the primary_tile_to_replace matches, or replace the - // first multipass dwconv microkernel we find. - // TODO(zhin): support specifying target multipass dwconv to replace. - std::memset(&dwconv_config[i], 0, sizeof(dwconv_config[i])); - - // Note: do not directly assign to dwconv_config[i] because it breaks older gcc. 
- dwconv_config[i].minmax.multipass = xnn_dwconv_multipass_ukernel_fn(dwconv); - dwconv_config[i].channel_tile = channel_tile; - dwconv_config[i].channel_subtile = channel_subtile; - dwconv_config[i].channel_round = channel_round; - dwconv_config[i].primary_tile = primary_tile; - dwconv_config[i].middle_tile = middle_tile; - dwconv_config[i].last_tile = last_tile; - dwconv_config[i].init.qu8 = init_params; - break; - } - } - - auto execution_plan = model_factory(nullptr); - if (execution_plan.empty()) { - state.SkipWithError("failed to create a model"); - return; - } - - for (auto _ : state) { - for (const std::unique_ptr& op : execution_plan) { - xnn_status status = xnn_run_operator(op.get(), nullptr); - if (status != xnn_status_success) { - state.SkipWithError("failed to run a model"); - return; - } - } - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - // Restore dwconv_config to original state as defined in init.c. 
- memcpy(dwconv_config, saved_dwconv_params, sizeof(saved_dwconv_params)); -} - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - static void qu8_dwconv_9p8c__neon_mul8(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qu8_dwconv_9p16c__neon_mul8(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qu8_dwconv_9p32c__neon_mul8(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_9p32c__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qu8_dwconv_9p8c__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_9p8c__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qu8_dwconv_9p16c__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qu8_dwconv_9p32c__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_9p32c__neon_mul16, - 
xnn_init_qu8_conv_minmax_rndnu_scalar_params, - 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON); - } - - static void qu8_dwconv_25p8c__neon_mul8(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - 8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qu8_dwconv_25p16c__neon_mul8(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qu8_dwconv_25p32c__neon_mul8(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_25p32c__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - 32 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qu8_dwconv_25p8c__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - 8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qu8_dwconv_25p16c__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qu8_dwconv_25p32c__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_25p32c__neon_mul16, - 
xnn_init_qu8_conv_minmax_rndnu_scalar_params, - 32 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON); - } - static void qu8_dwconv_5f5m5l8c8s8r__neon_mul8(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_5f5m5l16c8s8r__neon_mul8(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_5f5m5l32c8s8r__neon_mul8(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l32c8s8r__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - - static void qu8_dwconv_6f6m7l8c8s8r__neon_mul8(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_6f6m7l16c8s8r__neon_mul8(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - 
xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_6f6m7l32c8s8r__neon_mul8(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l32c8s8r__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - - static void qu8_dwconv_8f8m9l8c8s8r__neon_mul8(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_8f8m9l16c8s8r__neon_mul8(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_8f8m9l32c8s8r__neon_mul8(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l32c8s8r__neon_mul8, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - - - static 
void qu8_dwconv_5f5m5l8c8s8r__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l8c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_5f5m5l16c8s8r__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l16c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_5f5m5l32c8s8r__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_5f5m5l32c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - - static void qu8_dwconv_6f6m7l8c8s8r__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l8c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_6f6m7l16c8s8r__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l16c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, 
/*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_6f6m7l32c8s8r__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_6f6m7l32c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - - static void qu8_dwconv_8f8m9l8c8s8r__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l8c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_8f8m9l16c8s8r__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l16c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_8f8m9l32c8s8r__neon_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_rndnu_ukernel_8f8m9l32c8s8r__neon_mul16, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - - BENCHMARK_QU8_END2END(qu8_dwconv_9p8c__neon_mul8); - BENCHMARK_QU8_END2END(qu8_dwconv_9p16c__neon_mul8); - 
BENCHMARK_QU8_END2END(qu8_dwconv_9p32c__neon_mul8); - BENCHMARK_QU8_END2END(qu8_dwconv_9p8c__neon_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_9p16c__neon_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_9p32c__neon_mul16); - - BENCHMARK_QU8_END2END(qu8_dwconv_25p8c__neon_mul8); - BENCHMARK_QU8_END2END(qu8_dwconv_25p16c__neon_mul8); - BENCHMARK_QU8_END2END(qu8_dwconv_25p32c__neon_mul8); - BENCHMARK_QU8_END2END(qu8_dwconv_25p8c__neon_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_25p16c__neon_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_25p32c__neon_mul16); - - BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l8c8s8r__neon_mul8); - BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l16c8s8r__neon_mul8); - BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l32c8s8r__neon_mul8); - - BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l8c8s8r__neon_mul8); - BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l16c8s8r__neon_mul8); - BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l32c8s8r__neon_mul8); - - BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l8c8s8r__neon_mul8); - BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l16c8s8r__neon_mul8); - BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l32c8s8r__neon_mul8); - - BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l8c8s8r__neon_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l16c8s8r__neon_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l32c8s8r__neon_mul16); - - BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l8c8s8r__neon_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l16c8s8r__neon_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l32c8s8r__neon_mul16); - - BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l8c8s8r__neon_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l16c8s8r__neon_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l32c8s8r__neon_mul16); - -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - static void qu8_dwconv_9p16c__avx512skx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32, - 
xnn_init_qu8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX512SKX); - } - static void qu8_dwconv_9p32c__avx512skx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX512SKX); - } - static void qu8_dwconv_9p8c__avx2_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__avx2_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2); - } - static void qu8_dwconv_9p16c__avx2_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2); - } - static void qu8_dwconv_9p32c__avx2_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__avx2_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2); - } - static void qu8_dwconv_9p8c__avx_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX); - } - static void qu8_dwconv_9p16c__avx_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16, - 
xnn_init_qu8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX); - } - static void qu8_dwconv_9p8c__avx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__avx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX); - } - static void qu8_dwconv_9p16c__avx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX); - } - static void qu8_dwconv_9p8c__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41); - } - static void qu8_dwconv_9p16c__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41); - } - static void qu8_dwconv_9p8c__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41); - } - static void qu8_dwconv_9p16c__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__sse41_mul32, - 
xnn_init_qu8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41); - } - static void qu8_dwconv_9p8c__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 9 /* primary tile */); - } - static void qu8_dwconv_9p16c__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__sse2_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */); - } - - static void qu8_dwconv_25p8c__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 25 /* primary tile */); - } - static void qu8_dwconv_25p16c__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__sse2_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 25 /* primary tile */); - } - static void qu8_dwconv_25p8c__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 25 /* primary tile */); - } - static void qu8_dwconv_25p16c__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 25 /* primary tile */); - } - static void qu8_dwconv_25p8c__sse41_mul32(benchmark::State& 
state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 25 /* primary tile */); - } - static void qu8_dwconv_25p16c__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__sse41_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 25 /* primary tile */); - } - - static void qu8_dwconv_5f5m5l8c8s8r__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse2_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_5f5m5l16c8s8r__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse2_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_6f6m7l8c8s8r__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse2_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_6f6m7l16c8s8r__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - 
xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse2_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_8f8m9l8c8s8r__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse2_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_8f8m9l16c8s8r__sse2_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse2_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - - static void qu8_dwconv_5f5m5l8c8s8r__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__sse41_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qu8_dwconv_5f5m5l16c8s8r__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c8s8r__sse41_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - 
/*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qu8_dwconv_6f6m7l8c8s8r__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__sse41_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qu8_dwconv_6f6m7l16c8s8r__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c8s8r__sse41_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qu8_dwconv_8f8m9l8c8s8r__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__sse41_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qu8_dwconv_8f8m9l16c8s8r__sse41_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__sse41_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - - static void qu8_dwconv_5f5m5l8c4s4r__sse41_mul32(benchmark::State& state, 
models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__sse41_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qu8_dwconv_5f5m5l16c4s4r__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__sse41_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qu8_dwconv_6f6m7l8c4s4r__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__sse41_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qu8_dwconv_6f6m7l16c4s4r__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__sse41_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qu8_dwconv_8f8m9l8c4s4r__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__sse41_mul32, - 
xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - static void qu8_dwconv_8f8m9l16c4s4r__sse41_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__sse41_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckSSE41); - } - - static void qu8_dwconv_5f5m5l8c4s4r__avx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c4s4r__avx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckAVX); - } - static void qu8_dwconv_5f5m5l16c4s4r__avx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c4s4r__avx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckAVX); - } - static void qu8_dwconv_6f6m7l8c4s4r__avx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c4s4r__avx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - 
/*primary_tile_to_replace=*/25, benchmark::utils::CheckAVX); - } - static void qu8_dwconv_6f6m7l16c4s4r__avx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c4s4r__avx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckAVX); - } - static void qu8_dwconv_8f8m9l8c4s4r__avx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c4s4r__avx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckAVX); - } - static void qu8_dwconv_8f8m9l16c4s4r__avx_mul32(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__avx_mul32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/4, /*channel_round=*/4, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25, benchmark::utils::CheckAVX); - } - - BENCHMARK_QU8_END2END(qu8_dwconv_9p16c__avx512skx_mul32); - BENCHMARK_QU8_END2END(qu8_dwconv_9p32c__avx512skx_mul32); - - BENCHMARK_QU8_END2END(qu8_dwconv_9p8c__avx2_mul32); - BENCHMARK_QU8_END2END(qu8_dwconv_9p16c__avx2_mul32); - BENCHMARK_QU8_END2END(qu8_dwconv_9p32c__avx2_mul32); - - BENCHMARK_QU8_END2END(qu8_dwconv_9p8c__avx_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_9p16c__avx_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_9p8c__avx_mul32); - BENCHMARK_QU8_END2END(qu8_dwconv_9p16c__avx_mul32); - - BENCHMARK_QU8_END2END(qu8_dwconv_9p8c__sse41_mul16); - 
BENCHMARK_QU8_END2END(qu8_dwconv_9p16c__sse41_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_9p8c__sse41_mul32); - BENCHMARK_QU8_END2END(qu8_dwconv_9p16c__sse41_mul32); - - BENCHMARK_QU8_END2END(qu8_dwconv_9p8c__sse2_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_9p16c__sse2_mul16); - - BENCHMARK_QU8_END2END(qu8_dwconv_25p8c__sse2_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_25p16c__sse2_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_25p8c__sse41_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_25p16c__sse41_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_25p8c__sse41_mul32); - BENCHMARK_QU8_END2END(qu8_dwconv_25p16c__sse41_mul32); - - BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l8c8s8r__sse2_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l16c8s8r__sse2_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l8c8s8r__sse2_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l16c8s8r__sse2_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l8c8s8r__sse2_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l16c8s8r__sse2_mul16); - - BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l8c8s8r__sse41_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l16c8s8r__sse41_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l8c8s8r__sse41_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l16c8s8r__sse41_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l8c8s8r__sse41_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l16c8s8r__sse41_mul16); - - BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l8c4s4r__sse41_mul32); - BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l16c4s4r__sse41_mul32); - BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l8c4s4r__sse41_mul32); - BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l16c4s4r__sse41_mul32); - BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l8c4s4r__sse41_mul32); - BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l16c4s4r__sse41_mul32); - - BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l8c4s4r__avx_mul32); - BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l16c4s4r__avx_mul32); - BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l8c4s4r__avx_mul32); - 
BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l16c4s4r__avx_mul32); - BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l8c4s4r__avx_mul32); - BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l16c4s4r__avx_mul32); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - static void qu8_dwconv_9p8c__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 9 /* primary tile */); - } - static void qu8_dwconv_9p16c__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 9 /* primary tile */); - } - - static void qu8_dwconv_25p8c__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 8 /* channel tile */, 25 /* primary tile */); - } - static void qu8_dwconv_25p16c__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 16 /* channel tile */, 25 /* primary tile */); - } - - static void qu8_dwconv_5f5m5l8c8s8r__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void 
qu8_dwconv_5f5m5l16c8s8r__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - - static void qu8_dwconv_6f6m7l8c8s8r__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_6f6m7l16c8s8r__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l8c8s8r__wasmsimd_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - - static void qu8_dwconv_8f8m9l8c8s8r__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_8f8m9l16c8s8r__wasmsimd_mul16(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l8c8s8r__wasmsimd_mul16, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/8, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - - BENCHMARK_QU8_END2END(qu8_dwconv_9p8c__wasmsimd_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_9p16c__wasmsimd_mul16); - - BENCHMARK_QU8_END2END(qu8_dwconv_25p8c__wasmsimd_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_25p16c__wasmsimd_mul16); - - BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l8c8s8r__wasmsimd_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l16c8s8r__wasmsimd_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l8c8s8r__wasmsimd_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l16c8s8r__wasmsimd_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l8c8s8r__wasmsimd_mul16); - BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l16c8s8r__wasmsimd_mul16); -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - static void qu8_dwconv_9p1c__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 1 /* channel tile */, 9 /* primary tile */); - } - static void qu8_dwconv_9p2c__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 2 /* channel tile */, 9 /* primary tile */); - } - static void qu8_dwconv_9p4c__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 4 /* channel tile */, 9 /* primary tile */); - } - - static void qu8_dwconv_25p1c__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - 
xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 1 /* channel tile */, 25 /* primary tile */); - } - static void qu8_dwconv_25p2c__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 2 /* channel tile */, 25 /* primary tile */); - } - static void qu8_dwconv_25p4c__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 4 /* channel tile */, 25 /* primary tile */); - } - - static void qu8_dwconv_5f5m5l1c1s1r__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_5f5m5l2c1s1r__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_5f5m5l4c1s1r__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, 
/*last_tile=*/5, - /*primary_tile_to_replace=*/25); - } - - static void qu8_dwconv_6f6m7l1c1s1r__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_6f6m7l2c1s1r__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_6f6m7l4c1s1r__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); - } - - static void qu8_dwconv_8f8m9l1c1s1r__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_8f8m9l2c1s1r__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__wasm_fmagic, - 
xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - static void qu8_dwconv_8f8m9l4c1s1r__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); - } - - BENCHMARK_QU8_END2END(qu8_dwconv_9p1c__wasm_fmagic); - BENCHMARK_QU8_END2END(qu8_dwconv_9p2c__wasm_fmagic); - BENCHMARK_QU8_END2END(qu8_dwconv_9p4c__wasm_fmagic); - - BENCHMARK_QU8_END2END(qu8_dwconv_25p1c__wasm_fmagic); - BENCHMARK_QU8_END2END(qu8_dwconv_25p2c__wasm_fmagic); - BENCHMARK_QU8_END2END(qu8_dwconv_25p4c__wasm_fmagic); - - BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l1c1s1r__wasm_fmagic); - BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l2c1s1r__wasm_fmagic); - BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l4c1s1r__wasm_fmagic); - - BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l1c1s1r__wasm_fmagic); - BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l2c1s1r__wasm_fmagic); - BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l4c1s1r__wasm_fmagic); - - BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l1c1s1r__wasm_fmagic); - BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l2c1s1r__wasm_fmagic); - BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l4c1s1r__wasm_fmagic); - -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -static void qu8_dwconv_9p1c__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 1 /* channel tile */, 9 /* primary tile */); -} -static void qu8_dwconv_9p2c__scalar_fmagic(benchmark::State& state, 
models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 2 /* channel tile */, 9 /* primary tile */); -} -static void qu8_dwconv_9p4c__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 4 /* channel tile */, 9 /* primary tile */); -} - -static void qu8_dwconv_9p1c__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 1 /* channel tile */, 9 /* primary tile */); -} -static void qu8_dwconv_9p2c__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 2 /* channel tile */, 9 /* primary tile */); -} -static void qu8_dwconv_9p4c__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 4 /* channel tile */, 9 /* primary tile */); -} - -static void qu8_dwconv_9p1c__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 1 /* channel tile */, 9 /* primary tile */); -} -static void qu8_dwconv_9p2c__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 2 /* 
channel tile */, 9 /* primary tile */); -} -static void qu8_dwconv_9p4c__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 4 /* channel tile */, 9 /* primary tile */); -} - -static void qu8_dwconv_25p1c__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 1 /* channel tile */, 25 /* primary tile */); -} -static void qu8_dwconv_25p2c__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 2 /* channel tile */, 25 /* primary tile */); -} -static void qu8_dwconv_25p4c__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 4 /* channel tile */, 25 /* primary tile */); -} - -static void qu8_dwconv_25p1c__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 1 /* channel tile */, 25 /* primary tile */); -} -static void qu8_dwconv_25p2c__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 2 /* channel tile */, 25 /* primary tile */); -} -static void qu8_dwconv_25p4c__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - 
DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 4 /* channel tile */, 25 /* primary tile */); -} - -static void qu8_dwconv_25p1c__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 1 /* channel tile */, 25 /* primary tile */); -} -static void qu8_dwconv_25p2c__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 2 /* channel tile */, 25 /* primary tile */); -} -static void qu8_dwconv_25p4c__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - 4 /* channel tile */, 25 /* primary tile */); -} - -static void qu8_dwconv_5f5m5l1c1s1r__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_5f5m5l2c1s1r__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); -} -static void 
qu8_dwconv_5f5m5l4c1s1r__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_5f5m5l1c1s1r__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_5f5m5l2c1s1r__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_5f5m5l4c1s1r__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_5f5m5l1c1s1r__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l1c1s1r__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, 
/*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_5f5m5l2c1s1r__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l2c1s1r__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_5f5m5l4c1s1r__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l4c1s1r__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5, - /*primary_tile_to_replace=*/25); -} - -static void qu8_dwconv_6f6m7l1c1s1r__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_6f6m7l2c1s1r__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_6f6m7l4c1s1r__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - 
DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_6f6m7l1c1s1r__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_6f6m7l2c1s1r__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_6f6m7l4c1s1r__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_6f6m7l1c1s1r__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l1c1s1r__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - 
/*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_6f6m7l2c1s1r__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l2c1s1r__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_6f6m7l4c1s1r__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l4c1s1r__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7, - /*primary_tile_to_replace=*/25); -} - -static void qu8_dwconv_8f8m9l1c1s1r__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_8f8m9l2c1s1r__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_8f8m9l4c1s1r__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_fmagic, - 
xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_8f8m9l1c1s1r__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_8f8m9l2c1s1r__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_8f8m9l4c1s1r__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_8f8m9l1c1s1r__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l1c1s1r__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/1, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_8f8m9l2c1s1r__scalar_lrintf(benchmark::State& state, 
models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l2c1s1r__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/2, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); -} -static void qu8_dwconv_8f8m9l4c1s1r__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - DWConvEnd2EndBenchmark(state, model, - xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l4c1s1r__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*channel_tile=*/4, /*channel_subtile=*/1, /*channel_round=*/1, - /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9, - /*primary_tile_to_replace=*/25); -} - -BENCHMARK_QU8_END2END(qu8_dwconv_9p1c__scalar_fmagic); -BENCHMARK_QU8_END2END(qu8_dwconv_9p2c__scalar_fmagic); -BENCHMARK_QU8_END2END(qu8_dwconv_9p4c__scalar_fmagic); - -BENCHMARK_QU8_END2END(qu8_dwconv_9p1c__scalar_imagic); -BENCHMARK_QU8_END2END(qu8_dwconv_9p2c__scalar_imagic); -BENCHMARK_QU8_END2END(qu8_dwconv_9p4c__scalar_imagic); - -BENCHMARK_QU8_END2END(qu8_dwconv_9p1c__scalar_lrintf); -BENCHMARK_QU8_END2END(qu8_dwconv_9p2c__scalar_lrintf); -BENCHMARK_QU8_END2END(qu8_dwconv_9p4c__scalar_lrintf); - -BENCHMARK_QU8_END2END(qu8_dwconv_25p1c__scalar_fmagic); -BENCHMARK_QU8_END2END(qu8_dwconv_25p2c__scalar_fmagic); -BENCHMARK_QU8_END2END(qu8_dwconv_25p4c__scalar_fmagic); - -BENCHMARK_QU8_END2END(qu8_dwconv_25p1c__scalar_imagic); -BENCHMARK_QU8_END2END(qu8_dwconv_25p2c__scalar_imagic); -BENCHMARK_QU8_END2END(qu8_dwconv_25p4c__scalar_imagic); - -BENCHMARK_QU8_END2END(qu8_dwconv_25p1c__scalar_lrintf); -BENCHMARK_QU8_END2END(qu8_dwconv_25p2c__scalar_lrintf); -BENCHMARK_QU8_END2END(qu8_dwconv_25p4c__scalar_lrintf); - -BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l1c1s1r__scalar_fmagic); -BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l2c1s1r__scalar_fmagic); 
-BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l4c1s1r__scalar_fmagic); -BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l1c1s1r__scalar_imagic); -BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l2c1s1r__scalar_imagic); -BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l4c1s1r__scalar_imagic); -BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l1c1s1r__scalar_lrintf); -BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l2c1s1r__scalar_lrintf); -BENCHMARK_QU8_END2END(qu8_dwconv_5f5m5l4c1s1r__scalar_lrintf); - -BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l1c1s1r__scalar_fmagic); -BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l2c1s1r__scalar_fmagic); -BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l4c1s1r__scalar_fmagic); -BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l1c1s1r__scalar_imagic); -BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l2c1s1r__scalar_imagic); -BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l4c1s1r__scalar_imagic); -BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l1c1s1r__scalar_lrintf); -BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l2c1s1r__scalar_lrintf); -BENCHMARK_QU8_END2END(qu8_dwconv_6f6m7l4c1s1r__scalar_lrintf); - -BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l1c1s1r__scalar_fmagic); -BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l2c1s1r__scalar_fmagic); -BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l4c1s1r__scalar_fmagic); -BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l1c1s1r__scalar_imagic); -BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l2c1s1r__scalar_imagic); -BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l4c1s1r__scalar_imagic); -BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l1c1s1r__scalar_lrintf); -BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l2c1s1r__scalar_lrintf); -BENCHMARK_QU8_END2END(qu8_dwconv_8f8m9l4c1s1r__scalar_lrintf); - - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/qu8-gemm-e2e.cc b/bench/qu8-gemm-e2e.cc deleted file mode 100644 index 8e3eab73959..00000000000 --- a/bench/qu8-gemm-e2e.cc +++ /dev/null @@ -1,1171 +0,0 @@ -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of 
this source tree. - -#include -#include -#include -#include -#include -#include -#include - -#include -#include "bench/end2end.h" -#include "bench/utils.h" - -#include "xnnpack.h" -#include "xnnpack/config.h" -#include "xnnpack/gemm.h" -#include "xnnpack/igemm.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/models.h" -#include "xnnpack/pack.h" - - -static void GEMMEnd2EndBenchmark( - benchmark::State& state, - models::ExecutionPlanFactory model_factory, - xnn_qu8_gemm_minmax_ukernel_fn gemm, - xnn_qu8_igemm_minmax_ukernel_fn igemm, - xnn_qu8_gemm_minmax_ukernel_fn gemm1, - xnn_qu8_igemm_minmax_ukernel_fn igemm1, - xnn_init_qu8_conv_minmax_params_fn init_params, - uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ - if (isa_check != nullptr && !isa_check(state)) { - return; - } - if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) { - state.SkipWithError("failed to initialize XNNPACK"); - return; - } - - struct xnn_gemm_config* gemm_config = xnn_init_qu8_gemm_config(); - assert(gemm_config != nullptr); - - // Override microkernels chosen in xnn_initialize - std::memset(gemm_config, 0, sizeof(struct xnn_gemm_config)); - gemm_config->minmax.gemm[mr-1] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_fn(gemm)); - gemm_config->minmax.igemm[mr-1] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_fn(igemm)); - gemm_config->minmax.gemm[0] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_fn(gemm1)); - gemm_config->minmax.igemm[0] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_fn(igemm1)); - gemm_config->init.qu8 = init_params; - gemm_config->mr = mr; - gemm_config->nr = nr; - gemm_config->log2_kr = log2_kr; - gemm_config->log2_sr = log2_sr; - gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qu8_gemm_goi_w; - - auto execution_plan = model_factory(nullptr); - if (execution_plan.empty()) { - state.SkipWithError("failed to create a 
model"); - return; - } - - for (auto _ : state) { - for (const std::unique_ptr& op : execution_plan) { - xnn_status status = xnn_run_operator(op.get(), nullptr); - if (status != xnn_status_success) { - state.SkipWithError("failed to run a model"); - return; - } - } - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } -} - -#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - static void qu8_gemm_4x8__asm_aarch32_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53, - xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53, - xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7, - xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qu8_gemm_4x8__asm_aarch32_neon_mlal_lane_cortex_a53_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53_prfm, - xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53_prfm, - xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm, - xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qu8_gemm_4x8__asm_aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a7, - 
xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a7, - xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7, - xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qu8_gemm_4x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm, - xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm, - xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm, - xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qu8_gemm_4x8__asm_aarch32_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_ld64, - xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_ld64, - xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qu8_gemm_4x8__asm_aarch32_neon_mlal_lane_ld64_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_ld64_prfm, - xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_ld64_prfm, - xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, - 
xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - - BENCHMARK_QU8_END2END(qu8_gemm_4x8__asm_aarch32_neon_mlal_lane_cortex_a53_prfm) - BENCHMARK_QU8_END2END(qu8_gemm_4x8__asm_aarch32_neon_mlal_lane_cortex_a53) - BENCHMARK_QU8_END2END(qu8_gemm_4x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm) - BENCHMARK_QU8_END2END(qu8_gemm_4x8__asm_aarch32_neon_mlal_lane_cortex_a7) - BENCHMARK_QU8_END2END(qu8_gemm_4x8__asm_aarch32_neon_mlal_lane_ld64_prfm) - BENCHMARK_QU8_END2END(qu8_gemm_4x8__asm_aarch32_neon_mlal_lane_ld64) -#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY - -#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY - static void qu8_gemm_4x16__asm_aarch64_neon_mlal_lane_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a75, - xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a75, - xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qu8_gemm_4x16__asm_aarch64_neon_mlal_lane_cortex_a75_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a75_prfm, - xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a75_prfm, - xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void 
qu8_gemm_4x16__asm_aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a53, - xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a53, - xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qu8_gemm_4x16__asm_aarch64_neon_mlal_lane_cortex_a53_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a53_prfm, - xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a53_prfm, - xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qu8_gemm_4x16__asm_aarch64_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_ld64, - xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_ld64, - xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qu8_gemm_4x16__asm_aarch64_neon_mlal_lane_ld64_prfm(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_ld64_prfm, - 
xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_ld64_prfm, - xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - - BENCHMARK_QU8_END2END(qu8_gemm_4x16__asm_aarch64_neon_mlal_lane_cortex_a75); - BENCHMARK_QU8_END2END(qu8_gemm_4x16__asm_aarch64_neon_mlal_lane_cortex_a75_prfm); - BENCHMARK_QU8_END2END(qu8_gemm_4x16__asm_aarch64_neon_mlal_lane_cortex_a53); - BENCHMARK_QU8_END2END(qu8_gemm_4x16__asm_aarch64_neon_mlal_lane_cortex_a53_prfm); - BENCHMARK_QU8_END2END(qu8_gemm_4x16__asm_aarch64_neon_mlal_lane_ld64); - BENCHMARK_QU8_END2END(qu8_gemm_4x16__asm_aarch64_neon_mlal_lane_ld64_prfm); -#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - static void qu8_gemm_2x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, - xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*mr=*/2, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qu8_gemm_3x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, - xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*mr=*/3, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qu8_gemm_4x8__neon_mlal_lane(benchmark::State& state, 
models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, - xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*mr=*/4, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qu8_gemm_6x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, - xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*mr=*/6, /*nr=*/8, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qu8_gemm_2x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, - xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*mr=*/2, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qu8_gemm_3x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, - xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*mr=*/3, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void 
qu8_gemm_4x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, - xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*mr=*/4, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - static void qu8_gemm_6x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, - xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, - xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, - xnn_init_qu8_conv_minmax_rndnu_scalar_params, - /*mr=*/6, /*nr=*/16, /*log2_kr=*/0, /*log2_sr=*/0, - benchmark::utils::CheckNEON); - } - - BENCHMARK_QU8_END2END(qu8_gemm_2x8__neon_mlal_lane); - BENCHMARK_QU8_END2END(qu8_gemm_3x8__neon_mlal_lane); - BENCHMARK_QU8_END2END(qu8_gemm_4x8__neon_mlal_lane); - BENCHMARK_QU8_END2END(qu8_gemm_6x8__neon_mlal_lane); - BENCHMARK_QU8_END2END(qu8_gemm_2x16__neon_mlal_lane); - BENCHMARK_QU8_END2END(qu8_gemm_3x16__neon_mlal_lane); - BENCHMARK_QU8_END2END(qu8_gemm_4x16__neon_mlal_lane); - BENCHMARK_QU8_END2END(qu8_gemm_6x16__neon_mlal_lane); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM - static void qu8_gemm_1x1c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32, - xnn_qu8_igemm_minmax_fp32_ukernel_1x1c4__armsimd32, - xnn_qu8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32, - xnn_qu8_igemm_minmax_fp32_ukernel_1x1c4__armsimd32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/1, 1 /*nr=*/, /*log2_kr=*/2, /*log2_sr=*/0, - 
benchmark::utils::CheckARMV6); - } - static void qu8_gemm_2x1c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x1c4__armsimd32, - xnn_qu8_igemm_minmax_fp32_ukernel_2x1c4__armsimd32, - xnn_qu8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32, - xnn_qu8_igemm_minmax_fp32_ukernel_1x1c4__armsimd32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, 1 /*nr=*/, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckARMV6); - } - static void qu8_gemm_1x2c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, - xnn_qu8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32, - xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, - xnn_qu8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/1, 2 /*nr=*/, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckARMV6); - } - static void qu8_gemm_2x2c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, - xnn_qu8_igemm_minmax_fp32_ukernel_2x2c4__armsimd32, - xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, - xnn_qu8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, 2 /*nr=*/, /*log2_kr=*/2, /*log2_sr=*/0, - benchmark::utils::CheckARMV6); - } - - BENCHMARK_QU8_END2END(qu8_gemm_1x1c4__armsimd32); - BENCHMARK_QU8_END2END(qu8_gemm_2x1c4__armsimd32); - BENCHMARK_QU8_END2END(qu8_gemm_1x2c4__armsimd32); - BENCHMARK_QU8_END2END(qu8_gemm_2x2c4__armsimd32); -#endif // XNN_ARCH_ARM - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - static void qu8_gemm_2x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x8c8__avx2, - 
xnn_qu8_igemm_minmax_fp32_ukernel_2x8c8__avx2, - xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2, - xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX2); - } - static void qu8_gemm_3x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2, - xnn_qu8_igemm_minmax_fp32_ukernel_3x8c8__avx2, - xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2, - xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/8, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX2); - } - - static void qu8_gemm_2x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void qu8_gemm_2x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void qu8_gemm_3x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, - 
xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void qu8_gemm_3x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void qu8_gemm_4x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void qu8_gemm_4x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - - - static void qu8_gemm_2x4c8__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, - 
xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void qu8_gemm_2x4c8__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void qu8_gemm_3x4c8__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - static void qu8_gemm_3x4c8__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckAVX); - } - - static void qu8_gemm_2x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, - 
xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckSSE41); - } - static void qu8_gemm_2x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckSSE41); - } - static void qu8_gemm_3x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckSSE41); - } - static void qu8_gemm_3x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckSSE41); - } - static void qu8_gemm_4x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, - 
xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckSSE41); - } - static void qu8_gemm_4x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0, - benchmark::utils::CheckSSE41); - } - - static void qu8_gemm_2x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckSSE41); - } - static void qu8_gemm_2x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckSSE41); - } - static void qu8_gemm_3x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, - 
xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckSSE41); - } - static void qu8_gemm_3x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0, - benchmark::utils::CheckSSE41); - } - - static void qu8_gemm_2x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0); - } - static void qu8_gemm_2x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0); - } - static void qu8_gemm_3x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, - 
xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0); - } - static void qu8_gemm_3x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0); - } - static void qu8_gemm_4x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0); - } - static void qu8_gemm_4x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/0); - } - - static void qu8_gemm_2x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0); - } - static void 
qu8_gemm_2x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0); - } - static void qu8_gemm_3x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0); - } - static void qu8_gemm_3x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/3, /*log2_sr=*/0); - } - - BENCHMARK_QU8_END2END(qu8_gemm_2x8c8__avx2); - BENCHMARK_QU8_END2END(qu8_gemm_3x8c8__avx2); - - BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__avx_ld64); - BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__avx_ld128); - BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__avx_ld64); - BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__avx_ld128); - BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__avx_ld64); - BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__avx_ld128); - - BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__avx_ld64); - BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__avx_ld128); - BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__avx_ld64); - BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__avx_ld128); - - 
BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__sse41_ld64); - BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__sse41_ld128); - BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__sse41_ld64); - BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__sse41_ld128); - BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__sse41_ld64); - BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__sse41_ld128); - - BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__sse41_ld64); - BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__sse41_ld128); - BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__sse41_ld64); - BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__sse41_ld128); - - BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__sse2_ld64); - BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__sse2_ld128); - BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__sse2_ld64); - BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__sse2_ld128); - BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__sse2_ld64); - BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__sse2_ld128); - - BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__sse2_ld64); - BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__sse2_ld128); - BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__sse2_ld64); - BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__sse2_ld128); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - static void qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1); - } - static void qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128, - 
xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1); - } - static void qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1); - } - static void qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1); - } - static void qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1); - } - static void qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128, - 
xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1); - } - - static void qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2); - } - static void qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2); - } - static void qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2); - } - static void qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - 
GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2); - } - static void qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2); - } - static void qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/1, /*log2_sr=*/2); - } - - static void qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/3); - } - static void 
qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4, /*log2_kr=*/3); - } - static void qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/3); - } - static void qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4, /*log2_kr=*/3); - } - static void qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, 
/*log2_kr=*/3); - } - static void qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4, /*log2_kr=*/3); - } - - BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64) - BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128) - BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64) - BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128) - BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64) - BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128) - - BENCHMARK_QU8_END2END(qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64) - BENCHMARK_QU8_END2END(qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128) - BENCHMARK_QU8_END2END(qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64) - BENCHMARK_QU8_END2END(qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128) - BENCHMARK_QU8_END2END(qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64) - BENCHMARK_QU8_END2END(qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128) - - BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64) - BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128) - BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64) - BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128) - BENCHMARK_QU8_END2END(qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64) - BENCHMARK_QU8_END2END(qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128) -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - static void qu8_gemm_2x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - 
xnn_qu8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_2x2__wasm_fmagic, - xnn_qu8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/2); - } - static void qu8_gemm_3x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_3x2__wasm_fmagic, - xnn_qu8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/2); - } - static void qu8_gemm_4x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_4x2__wasm_fmagic, - xnn_qu8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/2); - } - static void qu8_gemm_2x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4__wasm_fmagic, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4); - } - static void qu8_gemm_3x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4__wasm_fmagic, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4); - } - 
static void qu8_gemm_4x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4); - } - - BENCHMARK_QU8_END2END(qu8_gemm_2x2__wasm_fmagic) - BENCHMARK_QU8_END2END(qu8_gemm_3x2__wasm_fmagic) - BENCHMARK_QU8_END2END(qu8_gemm_4x2__wasm_fmagic) - BENCHMARK_QU8_END2END(qu8_gemm_2x4__wasm_fmagic) - BENCHMARK_QU8_END2END(qu8_gemm_3x4__wasm_fmagic) - BENCHMARK_QU8_END2END(qu8_gemm_4x4__wasm_fmagic) -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -static void qu8_gemm_2x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_fmagic, - xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/2); -} -static void qu8_gemm_3x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_3x2__scalar_fmagic, - xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/2); -} -static void qu8_gemm_4x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_4x2__scalar_fmagic, - xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, - 
xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/2); -} -static void qu8_gemm_2x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4__scalar_fmagic, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4); -} -static void qu8_gemm_3x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_fmagic, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4); -} -static void qu8_gemm_4x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_4x4__scalar_fmagic, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4); -} - -static void qu8_gemm_2x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, - xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic, - xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic, - xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/2); -} -static void qu8_gemm_3x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - 
GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_imagic, - xnn_qu8_igemm_minmax_fp32_ukernel_3x2__scalar_imagic, - xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic, - xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/2); -} -static void qu8_gemm_4x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_imagic, - xnn_qu8_igemm_minmax_fp32_ukernel_4x2__scalar_imagic, - xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic, - xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/2); -} -static void qu8_gemm_2x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4__scalar_imagic, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4); -} -static void qu8_gemm_3x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_imagic, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4); -} -static void qu8_gemm_4x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_imagic, - xnn_qu8_igemm_minmax_fp32_ukernel_4x4__scalar_imagic, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, - 
xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4); -} - -static void qu8_gemm_2x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_lrintf, - xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_lrintf, - xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, - xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/2); -} -static void qu8_gemm_3x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf, - xnn_qu8_igemm_minmax_fp32_ukernel_3x2__scalar_lrintf, - xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, - xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/2); -} -static void qu8_gemm_4x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, - xnn_qu8_igemm_minmax_fp32_ukernel_4x2__scalar_lrintf, - xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, - xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/2); -} -static void qu8_gemm_2x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, - xnn_qu8_igemm_minmax_fp32_ukernel_2x4__scalar_lrintf, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/2, /*nr=*/4); -} -static void qu8_gemm_3x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - 
GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, - xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/3, /*nr=*/4); -} -static void qu8_gemm_4x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) { - GEMMEnd2EndBenchmark(state, model, - xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, - xnn_qu8_igemm_minmax_fp32_ukernel_4x4__scalar_lrintf, - xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, - xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf, - xnn_init_qu8_conv_minmax_fp32_scalar_params, - /*mr=*/4, /*nr=*/4); -} - -BENCHMARK_QU8_END2END(qu8_gemm_2x2__scalar_fmagic) -BENCHMARK_QU8_END2END(qu8_gemm_3x2__scalar_fmagic) -BENCHMARK_QU8_END2END(qu8_gemm_4x2__scalar_fmagic) -BENCHMARK_QU8_END2END(qu8_gemm_2x4__scalar_fmagic) -BENCHMARK_QU8_END2END(qu8_gemm_3x4__scalar_fmagic) -BENCHMARK_QU8_END2END(qu8_gemm_4x4__scalar_fmagic) - -BENCHMARK_QU8_END2END(qu8_gemm_2x2__scalar_imagic) -BENCHMARK_QU8_END2END(qu8_gemm_3x2__scalar_imagic) -BENCHMARK_QU8_END2END(qu8_gemm_4x2__scalar_imagic) -BENCHMARK_QU8_END2END(qu8_gemm_2x4__scalar_imagic) -BENCHMARK_QU8_END2END(qu8_gemm_3x4__scalar_imagic) -BENCHMARK_QU8_END2END(qu8_gemm_4x4__scalar_imagic) - -BENCHMARK_QU8_END2END(qu8_gemm_2x2__scalar_lrintf) -BENCHMARK_QU8_END2END(qu8_gemm_3x2__scalar_lrintf) -BENCHMARK_QU8_END2END(qu8_gemm_4x2__scalar_lrintf) -BENCHMARK_QU8_END2END(qu8_gemm_2x4__scalar_lrintf) -BENCHMARK_QU8_END2END(qu8_gemm_3x4__scalar_lrintf) -BENCHMARK_QU8_END2END(qu8_gemm_4x4__scalar_lrintf) - - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/models/BUILD b/models/BUILD deleted file mode 100644 index d3c773f8d84..00000000000 --- a/models/BUILD +++ /dev/null @@ -1,314 +0,0 @@ -# Copyright 2023 Google LLC -# -# This source code is licensed under the 
BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -load( - "//:build_defs.bzl", - "xnnpack_cxx_library", - "xnnpack_visibility", -) - -xnnpack_cxx_library( - name = "fp16_mobilenet_v1", - srcs = ["fp16-mobilenet-v1.cc"], - visibility = xnnpack_visibility(), - deps = [ - "@FP16", - "//:XNNPACK", - "//:cache", - "//:common", - "//:math", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "fp32_mobilenet_v1", - srcs = ["fp32-mobilenet-v1.cc"], - visibility = xnnpack_visibility(), - deps = [ - "//:XNNPACK", - "//:cache", - "//:common", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "fp16_sparse_mobilenet_v1", - srcs = ["fp16-sparse-mobilenet-v1.cc"], - visibility = xnnpack_visibility(), - deps = [ - "@FP16", - "//:XNNPACK", - "//:cache", - "//:common", - "//:math", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "fp32_sparse_mobilenet_v1", - srcs = ["fp32-sparse-mobilenet-v1.cc"], - visibility = xnnpack_visibility(), - deps = [ - "//:XNNPACK", - "//:cache", - "//:common", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "qs8_qc8w_mobilenet_v1", - srcs = ["qs8-qc8w-mobilenet-v1.cc"], - visibility = xnnpack_visibility(), - deps = [ - "//:XNNPACK", - "//:cache", - "//:common", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "qs8_qc8w_mobilenet_v2", - srcs = ["qs8-qc8w-mobilenet-v2.cc"], - visibility = xnnpack_visibility(), - deps = [ - "//:XNNPACK", - "//:cache", - "//:common", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "qs8_mobilenet_v1", - srcs = ["qs8-mobilenet-v1.cc"], - visibility = xnnpack_visibility(), - deps = [ - "//:XNNPACK", - "//:cache", - "//:common", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "qs8_mobilenet_v2", - srcs = ["qs8-mobilenet-v2.cc"], - visibility = xnnpack_visibility(), - deps = [ - "//:XNNPACK", - "//:cache", - "//:common", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "qu8_mobilenet_v1", - srcs = 
["qu8-mobilenet-v1.cc"], - visibility = xnnpack_visibility(), - deps = [ - "//:XNNPACK", - "//:cache", - "//:common", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "qu8_mobilenet_v2", - srcs = ["qu8-mobilenet-v2.cc"], - visibility = xnnpack_visibility(), - deps = [ - "//:XNNPACK", - "//:cache", - "//:common", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "qu8_mobilenet_v3_large", - srcs = ["qu8-mobilenet-v3-large.cc"], - visibility = xnnpack_visibility(), - deps = [ - "//:XNNPACK", - "//:cache", - "//:common", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "qu8_mobilenet_v3_small", - srcs = ["qu8-mobilenet-v3-small.cc"], - visibility = xnnpack_visibility(), - deps = [ - "//:XNNPACK", - "//:cache", - "//:common", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "fp16_mobilenet_v2", - srcs = ["fp16-mobilenet-v2.cc"], - visibility = xnnpack_visibility(), - deps = [ - "@FP16", - "//:XNNPACK", - "//:cache", - "//:common", - "//:math", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "fp32_mobilenet_v2", - srcs = ["fp32-mobilenet-v2.cc"], - visibility = xnnpack_visibility(), - deps = [ - "//:XNNPACK", - "//:cache", - "//:common", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "fp16_sparse_mobilenet_v2", - srcs = ["fp16-sparse-mobilenet-v2.cc"], - visibility = xnnpack_visibility(), - deps = [ - "@FP16", - "//:XNNPACK", - "//:cache", - "//:common", - "//:math", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "fp32_sparse_mobilenet_v2", - srcs = ["fp32-sparse-mobilenet-v2.cc"], - visibility = xnnpack_visibility(), - deps = [ - "//:XNNPACK", - "//:cache", - "//:common", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "fp16_mobilenet_v3_large", - srcs = ["fp16-mobilenet-v3-large.cc"], - visibility = xnnpack_visibility(), - deps = [ - "@FP16", - "//:XNNPACK", - "//:cache", - "//:common", - "//:math", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "fp32_mobilenet_v3_large", - srcs = 
["fp32-mobilenet-v3-large.cc"], - visibility = xnnpack_visibility(), - deps = [ - "//:XNNPACK", - "//:cache", - "//:common", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "fp16_sparse_mobilenet_v3_large", - srcs = ["fp16-sparse-mobilenet-v3-large.cc"], - visibility = xnnpack_visibility(), - deps = [ - "@FP16", - "//:XNNPACK", - "//:cache", - "//:common", - "//:math", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "fp32_sparse_mobilenet_v3_large", - srcs = ["fp32-sparse-mobilenet-v3-large.cc"], - visibility = xnnpack_visibility(), - deps = [ - "//:XNNPACK", - "//:cache", - "//:common", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "fp16_mobilenet_v3_small", - srcs = ["fp16-mobilenet-v3-small.cc"], - visibility = xnnpack_visibility(), - deps = [ - "@FP16", - "//:XNNPACK", - "//:cache", - "//:common", - "//:math", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "fp32_mobilenet_v3_small", - srcs = ["fp32-mobilenet-v3-small.cc"], - visibility = xnnpack_visibility(), - deps = [ - "//:XNNPACK", - "//:cache", - "//:common", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "fp16_sparse_mobilenet_v3_small", - srcs = ["fp16-sparse-mobilenet-v3-small.cc"], - visibility = xnnpack_visibility(), - deps = [ - "@FP16", - "//:XNNPACK", - "//:cache", - "//:common", - "//:math", - "//:models_h", - ], -) - -xnnpack_cxx_library( - name = "fp32_sparse_mobilenet_v3_small", - srcs = ["fp32-sparse-mobilenet-v3-small.cc"], - visibility = xnnpack_visibility(), - deps = [ - "//:XNNPACK", - "//:cache", - "//:common", - "//:models_h", - ], -) diff --git a/models/fp16-mobilenet-v1.cc b/models/fp16-mobilenet-v1.cc deleted file mode 100644 index 773bc5103a0..00000000000 --- a/models/fp16-mobilenet-v1.cc +++ /dev/null @@ -1,1543 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan FP16MobileNetV1(bool use_jit, pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static std::array w30; - alignas(16) static std::array w31; - alignas(16) static std::array w32; - alignas(16) static std::array w33; - alignas(16) static std::array w34; - alignas(16) static std::array w35; - alignas(16) static std::array w36; - alignas(16) static std::array w37; - alignas(16) static std::array w38; - alignas(16) static std::array w39; - alignas(16) static std::array w40; - alignas(16) static std::array w41; - alignas(16) static std::array w42; - alignas(16) static std::array w43; - alignas(16) static std::array w44; - alignas(16) static std::array w45; - 
alignas(16) static std::array w46; - alignas(16) static std::array w47; - alignas(16) static std::array w48; - alignas(16) static std::array w49; - alignas(16) static std::array w50; - alignas(16) static std::array w51; - alignas(16) static std::array w52; - alignas(16) static std::array w53; - alignas(16) static std::array w54; - alignas(16) static std::array w55; - alignas(16) static std::array w56; - alignas(16) static std::array w57; - alignas(16) static std::array w58; - alignas(16) static std::array w59; - alignas(16) static std::array w60; - alignas(16) static std::array w61; - alignas(16) static std::array w62; - alignas(16) static std::array w63; - alignas(16) static std::array w64; - alignas(16) static std::array w65; - alignas(16) static std::array w66; - alignas(16) static std::array w67; - alignas(16) static std::array w68; - alignas(16) static std::array w69; - alignas(16) static std::array w70; - alignas(16) static std::array w71; - alignas(16) static std::array w72; - alignas(16) static std::array w73; - alignas(16) static std::array w74; - alignas(16) static std::array w75; - alignas(16) static std::array w76; - alignas(16) static std::array w77; - alignas(16) static std::array w78; - alignas(16) static std::array w79; - alignas(16) static std::array w80; - alignas(16) static std::array w81; - alignas(16) static std::array w82; - alignas(16) static std::array w83; - alignas(16) static std::array w84; - alignas(16) static std::array w85; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); - std::generate(v0.begin(), v0.end(), f32rng); - std::generate(v1.begin(), v1.end(), f32rng); - std::generate(v2.begin(), v2.end(), f32rng); - std::generate(v3.begin(), v3.end(), f32rng); - std::generate(v4.begin(), v4.end(), f32rng); - std::generate(v5.begin(), v5.end(), f32rng); - std::generate(v6.begin(), v6.end(), f32rng); - 
std::generate(v7.begin(), v7.end(), f32rng); - std::generate(v8.begin(), v8.end(), f32rng); - std::generate(v9.begin(), v9.end(), f32rng); - std::generate(v10.begin(), v10.end(), f32rng); - std::generate(v11.begin(), v11.end(), f32rng); - std::generate(v12.begin(), v12.end(), f32rng); - std::generate(v13.begin(), v13.end(), f32rng); - std::generate(v14.begin(), v14.end(), f32rng); - std::generate(v15.begin(), v15.end(), f32rng); - std::generate(v16.begin(), v16.end(), f32rng); - std::generate(v17.begin(), v17.end(), f32rng); - std::generate(v18.begin(), v18.end(), f32rng); - std::generate(v19.begin(), v19.end(), f32rng); - std::generate(v20.begin(), v20.end(), f32rng); - std::generate(v21.begin(), v21.end(), f32rng); - std::generate(v22.begin(), v22.end(), f32rng); - std::generate(v23.begin(), v23.end(), f32rng); - std::generate(v24.begin(), v24.end(), f32rng); - std::generate(v25.begin(), v25.end(), f32rng); - std::generate(v26.begin(), v26.end(), f32rng); - std::generate(v27.begin(), v27.end(), f32rng); - std::generate(v28.begin(), v28.end(), f32rng); - std::generate(v29.begin(), v29.end(), f32rng); - std::generate(w30.begin(), w30.end(), f32rng); - std::generate(w31.begin(), w31.end(), f32rng); - std::generate(w32.begin(), w32.end(), f32rng); - std::generate(w33.begin(), w33.end(), f32rng); - std::generate(w34.begin(), w34.end(), f32rng); - std::generate(w35.begin(), w35.end(), f32rng); - std::generate(w36.begin(), w36.end(), f32rng); - std::generate(w37.begin(), w37.end(), f32rng); - std::generate(w38.begin(), w38.end(), f32rng); - std::generate(w39.begin(), w39.end(), f32rng); - std::generate(w40.begin(), w40.end(), f32rng); - std::generate(w41.begin(), w41.end(), f32rng); - std::generate(w42.begin(), w42.end(), f32rng); - std::generate(w43.begin(), w43.end(), f32rng); - std::generate(w44.begin(), w44.end(), f32rng); - std::generate(w45.begin(), w45.end(), f32rng); - std::generate(w46.begin(), w46.end(), f32rng); - std::generate(w47.begin(), w47.end(), 
f32rng); - std::generate(w48.begin(), w48.end(), f32rng); - std::generate(w49.begin(), w49.end(), f32rng); - std::generate(w50.begin(), w50.end(), f32rng); - std::generate(w51.begin(), w51.end(), f32rng); - std::generate(w52.begin(), w52.end(), f32rng); - std::generate(w53.begin(), w53.end(), f32rng); - std::generate(w54.begin(), w54.end(), f32rng); - std::generate(w55.begin(), w55.end(), f32rng); - std::generate(w56.begin(), w56.end(), f32rng); - std::generate(w57.begin(), w57.end(), f32rng); - std::generate(w58.begin(), w58.end(), f32rng); - std::generate(w59.begin(), w59.end(), f32rng); - std::generate(w60.begin(), w60.end(), f32rng); - std::generate(w61.begin(), w61.end(), f32rng); - std::generate(w62.begin(), w62.end(), f32rng); - std::generate(w63.begin(), w63.end(), f32rng); - std::generate(w64.begin(), w64.end(), f32rng); - std::generate(w65.begin(), w65.end(), f32rng); - std::generate(w66.begin(), w66.end(), f32rng); - std::generate(w67.begin(), w67.end(), f32rng); - std::generate(w68.begin(), w68.end(), f32rng); - std::generate(w69.begin(), w69.end(), f32rng); - std::generate(w70.begin(), w70.end(), f32rng); - std::generate(w71.begin(), w71.end(), f32rng); - std::generate(w72.begin(), w72.end(), f32rng); - std::generate(w73.begin(), w73.end(), f32rng); - std::generate(w74.begin(), w74.end(), f32rng); - std::generate(w75.begin(), w75.end(), f32rng); - std::generate(w76.begin(), w76.end(), f32rng); - std::generate(w77.begin(), w77.end(), f32rng); - std::generate(w78.begin(), w78.end(), f32rng); - std::generate(w79.begin(), w79.end(), f32rng); - std::generate(w80.begin(), w80.end(), f32rng); - std::generate(w81.begin(), w81.end(), f32rng); - std::generate(w82.begin(), w82.end(), f32rng); - std::generate(w83.begin(), w83.end(), f32rng); - std::generate(w84.begin(), w84.end(), f32rng); - std::generate(w85.begin(), w85.end(), f32rng); - - Operators operators; - xnn_status status; - xnn_code_cache* code_cache_ptr = nullptr; - size_t max_workspace_size = 0; - - 
xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 3 /* input channels per group */, - 32 /* output_channels_per_group */, - 3 /* input pixel stride */, - 32 /* output pixel stride */, - w30.data(), w31.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 32 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 32 /* input pixel stride */, - 32 /* output pixel stride */, - w32.data(), w33.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 64 /* 
output_channels_per_group */, - 32 /* input pixel stride */, - 64 /* output pixel stride */, - w34.data(), w35.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 64 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 64 /* input pixel stride */, - 64 /* output pixel stride */, - w36.data(), w37.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 128 /* output_channels_per_group */, - 64 /* input pixel stride */, - 128 /* output pixel stride */, - w38.data(), w39.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - 
xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 128 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 128 /* input pixel stride */, - 128 /* output pixel stride */, - w40.data(), w41.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 128 /* input channels per group */, - 128 /* output_channels_per_group */, - 128 /* input pixel stride */, - 128 /* output pixel stride */, - w42.data(), w43.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 128 /* groups */, - 1 /* input channels per group */, - 1 /* 
output_channels_per_group */, - 128 /* input pixel stride */, - 128 /* output pixel stride */, - w44.data(), w45.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 128 /* input channels per group */, - 256 /* output_channels_per_group */, - 128 /* input pixel stride */, - 256 /* output pixel stride */, - w46.data(), w47.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 256 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 256 /* input pixel stride */, - 256 /* output pixel stride */, - w48.data(), w49.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, 
xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 256 /* input channels per group */, - 256 /* output_channels_per_group */, - 256 /* input pixel stride */, - 256 /* output pixel stride */, - w50.data(), w51.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 256 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 256 /* input pixel stride */, - 256 /* output pixel stride */, - w52.data(), w53.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 256 /* input channels 
per group */, - 512 /* output_channels_per_group */, - 256 /* input pixel stride */, - 512 /* output pixel stride */, - w54.data(), w55.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w56.data(), w57.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w58.data(), w59.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - 
operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w60.data(), w61.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w62.data(), w63.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* 
groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w64.data(), w65.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w66.data(), w67.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w68.data(), w69.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return 
ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w70.data(), w71.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w72.data(), w73.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* 
dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w74.data(), w75.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w76.data(), w77.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 1024 /* output_channels_per_group */, - 512 /* input pixel stride */, - 1024 /* output pixel stride */, - w78.data(), w79.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation 
#24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1024 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 1024 /* input pixel stride */, - 1024 /* output pixel stride */, - w80.data(), w81.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 1024 /* input channels per group */, - 1024 /* output_channels_per_group */, - 1024 /* input pixel stride */, - 1024 /* output pixel stride */, - w82.data(), w83.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - 
return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 1024 /* input channels per group */, - 1001 /* output_channels_per_group */, - 1024 /* input pixel stride */, - 1001 /* output pixel stride */, - w84.data(), w85.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - size_t op0_workspace_size = 0; - size_t op0_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - &op0_workspace_size, &op0_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op0_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - size_t op1_workspace_size = 0; - size_t op1_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op1, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op1_workspace_size, &op1_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op1_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return 
ExecutionPlan(); - } - - size_t op2_workspace_size = 0; - size_t op2_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op2_workspace_size, &op2_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op2_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #2" << std::endl; - return ExecutionPlan(); - } - - size_t op3_workspace_size = 0; - size_t op3_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op3, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op3_workspace_size, &op3_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op3_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - size_t op4_workspace_size = 0; - size_t op4_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op4, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op4_workspace_size, &op4_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op4_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - size_t op5_workspace_size = 0; - size_t op5_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op5, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op5_workspace_size, &op5_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = 
std::max(max_workspace_size, op5_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - size_t op6_workspace_size = 0; - size_t op6_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op6, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op6_workspace_size, &op6_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op6_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - size_t op7_workspace_size = 0; - size_t op7_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op7_workspace_size, &op7_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op7_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - size_t op8_workspace_size = 0; - size_t op8_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op8, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op8_workspace_size, &op8_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op8_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - size_t op9_workspace_size = 0; - size_t op9_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op9, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op9_workspace_size, 
&op9_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op9_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - size_t op10_workspace_size = 0; - size_t op10_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op10, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op10_workspace_size, &op10_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op10_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - size_t op11_workspace_size = 0; - size_t op11_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op11, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op11_workspace_size, &op11_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op11_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - size_t op12_workspace_size = 0; - size_t op12_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op12, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op12_workspace_size, &op12_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op12_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - size_t op13_workspace_size = 0; - size_t 
op13_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op13, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op13_workspace_size, &op13_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op13_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - size_t op14_workspace_size = 0; - size_t op14_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op14, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op14_workspace_size, &op14_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op14_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - size_t op15_workspace_size = 0; - size_t op15_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op15, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op15_workspace_size, &op15_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op15_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - size_t op16_workspace_size = 0; - size_t op16_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op16, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op16_workspace_size, &op16_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op16_workspace_size); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - size_t op17_workspace_size = 0; - size_t op17_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op17, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op17_workspace_size, &op17_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op17_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - size_t op18_workspace_size = 0; - size_t op18_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op18, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op18_workspace_size, &op18_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op18_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - size_t op19_workspace_size = 0; - size_t op19_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op19, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op19_workspace_size, &op19_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op19_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - size_t op20_workspace_size = 0; - size_t op20_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op20, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op20_workspace_size, &op20_workspace_alignment, - 
/*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op20_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - size_t op21_workspace_size = 0; - size_t op21_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op21, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op21_workspace_size, &op21_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op21_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - size_t op22_workspace_size = 0; - size_t op22_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op22, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op22_workspace_size, &op22_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op22_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - size_t op23_workspace_size = 0; - size_t op23_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op23, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op23_workspace_size, &op23_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op23_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - size_t op24_workspace_size = 0; - size_t op24_workspace_alignment = 0; - status 
= xnn_reshape_convolution2d_nhwc_f16( - op24, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op24_workspace_size, &op24_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op24_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - size_t op25_workspace_size = 0; - size_t op25_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op25, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op25_workspace_size, &op25_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op25_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - size_t op26_workspace_size = 0; - size_t op26_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op26, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op26_workspace_size, &op26_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op26_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - size_t op27_workspace_size = 0; - size_t op27_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op27, - /*batch_size=*/1, 49 /* width */, - 1024 /* channels */, 1024 /* input stride */, 1024 /* output stride */, - &op27_workspace_size, &op27_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op27_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to 
reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - size_t op28_workspace_size = 0; - size_t op28_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op28, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op28_workspace_size, &op28_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op28_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nhwc_f16( - op0, - workspace.data(), /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op1, - workspace.data(), /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op2, - workspace.data(), /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op3, - workspace.data(), /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op4, - workspace.data(), /*input=*/v4.data(), /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op5, - workspace.data(), /*input=*/v5.data(), /*output=*/v6.data()); - 
if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op6, - workspace.data(), /*input=*/v6.data(), /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op7, - workspace.data(), /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op8, - workspace.data(), /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op9, - workspace.data(), /*input=*/v9.data(), /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op10, - workspace.data(), /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op11, - workspace.data(), /*input=*/v11.data(), /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op12, - workspace.data(), /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op13, - workspace.data(), /*input=*/v13.data(), /*output=*/v14.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op14, - workspace.data(), /*input=*/v14.data(), /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op15, - workspace.data(), /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op16, - workspace.data(), /*input=*/v16.data(), /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op17, - workspace.data(), /*input=*/v17.data(), /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op18, - workspace.data(), /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op19, - workspace.data(), /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op20, - workspace.data(), /*input=*/v20.data(), /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op21, - workspace.data(), /*input=*/v21.data(), /*output=*/v22.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op22, - workspace.data(), /*input=*/v22.data(), /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op23, - workspace.data(), /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op24, - workspace.data(), /*input=*/v24.data(), /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op25, - workspace.data(), /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op26, - workspace.data(), /*input=*/v26.data(), /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op27, - workspace.data(), - /*input=*/v27.data(), /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op28, - workspace.data(), /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return 
ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -ExecutionPlan FP16MobileNetV1(pthreadpool_t threadpool) { - return FP16MobileNetV1(/*use_jit=*/false, threadpool); -} - -ExecutionPlan FP16MobileNetV1Jit(pthreadpool_t threadpool) { - return FP16MobileNetV1(/*use_jit=*/true, threadpool); -} - -} // namespace models diff --git a/models/fp16-mobilenet-v2.cc b/models/fp16-mobilenet-v2.cc deleted file mode 100644 index 3138b40da1a..00000000000 --- a/models/fp16-mobilenet-v2.cc +++ /dev/null @@ -1,3183 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan FP16MobileNetV2(bool use_jit, pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array 
v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static std::array v30; - alignas(16) static std::array v31; - alignas(16) static std::array v32; - alignas(16) static std::array v33; - alignas(16) static std::array v34; - alignas(16) static std::array v35; - alignas(16) static std::array v36; - alignas(16) static std::array v37; - alignas(16) static std::array v38; - alignas(16) static std::array v39; - alignas(16) static std::array v40; - alignas(16) static std::array v41; - alignas(16) static std::array v42; - alignas(16) static std::array v43; - alignas(16) static std::array v44; - alignas(16) static std::array v45; - alignas(16) static std::array v46; - alignas(16) static std::array v47; - alignas(16) static std::array v48; - alignas(16) static std::array v49; - alignas(16) static std::array v50; - alignas(16) static std::array v51; - alignas(16) static std::array v52; - alignas(16) static std::array v53; - alignas(16) static std::array v54; - alignas(16) static std::array v55; - alignas(16) static std::array v56; - alignas(16) static std::array v57; - alignas(16) static std::array v58; - alignas(16) static std::array v59; - alignas(16) static std::array v60; - alignas(16) static std::array v61; - alignas(16) static std::array v62; - alignas(16) static std::array v63; - alignas(16) static std::array v64; - alignas(16) static std::array w65; - alignas(16) static std::array w66; - alignas(16) static std::array w67; - alignas(16) static std::array w68; - alignas(16) static std::array w69; - alignas(16) static std::array w70; - alignas(16) static std::array w71; - alignas(16) static std::array w72; - alignas(16) static std::array w73; - alignas(16) static std::array w74; - alignas(16) static std::array w75; - alignas(16) static std::array w76; - alignas(16) static std::array w77; - alignas(16) static std::array w78; - alignas(16) static std::array w79; - alignas(16) static std::array 
w80; - alignas(16) static std::array w81; - alignas(16) static std::array w82; - alignas(16) static std::array w83; - alignas(16) static std::array w84; - alignas(16) static std::array w85; - alignas(16) static std::array w86; - alignas(16) static std::array w87; - alignas(16) static std::array w88; - alignas(16) static std::array w89; - alignas(16) static std::array w90; - alignas(16) static std::array w91; - alignas(16) static std::array w92; - alignas(16) static std::array w93; - alignas(16) static std::array w94; - alignas(16) static std::array w95; - alignas(16) static std::array w96; - alignas(16) static std::array w97; - alignas(16) static std::array w98; - alignas(16) static std::array w99; - alignas(16) static std::array w100; - alignas(16) static std::array w101; - alignas(16) static std::array w102; - alignas(16) static std::array w103; - alignas(16) static std::array w104; - alignas(16) static std::array w105; - alignas(16) static std::array w106; - alignas(16) static std::array w107; - alignas(16) static std::array w108; - alignas(16) static std::array w109; - alignas(16) static std::array w110; - alignas(16) static std::array w111; - alignas(16) static std::array w112; - alignas(16) static std::array w113; - alignas(16) static std::array w114; - alignas(16) static std::array w115; - alignas(16) static std::array w116; - alignas(16) static std::array w117; - alignas(16) static std::array w118; - alignas(16) static std::array w119; - alignas(16) static std::array w120; - alignas(16) static std::array w121; - alignas(16) static std::array w122; - alignas(16) static std::array w123; - alignas(16) static std::array w124; - alignas(16) static std::array w125; - alignas(16) static std::array w126; - alignas(16) static std::array w127; - alignas(16) static std::array w128; - alignas(16) static std::array w129; - alignas(16) static std::array w130; - alignas(16) static std::array w131; - alignas(16) static std::array w132; - alignas(16) static std::array w133; 
- alignas(16) static std::array w134; - alignas(16) static std::array w135; - alignas(16) static std::array w136; - alignas(16) static std::array w137; - alignas(16) static std::array w138; - alignas(16) static std::array w139; - alignas(16) static std::array w140; - alignas(16) static std::array w141; - alignas(16) static std::array w142; - alignas(16) static std::array w143; - alignas(16) static std::array w144; - alignas(16) static std::array w145; - alignas(16) static std::array w146; - alignas(16) static std::array w147; - alignas(16) static std::array w148; - alignas(16) static std::array w149; - alignas(16) static std::array w150; - alignas(16) static std::array w151; - alignas(16) static std::array w152; - alignas(16) static std::array w153; - alignas(16) static std::array w154; - alignas(16) static std::array w155; - alignas(16) static std::array w156; - alignas(16) static std::array w157; - alignas(16) static std::array w158; - alignas(16) static std::array w159; - alignas(16) static std::array w160; - alignas(16) static std::array w161; - alignas(16) static std::array w162; - alignas(16) static std::array w163; - alignas(16) static std::array w164; - alignas(16) static std::array w165; - alignas(16) static std::array w166; - alignas(16) static std::array w167; - alignas(16) static std::array w168; - alignas(16) static std::array w169; - alignas(16) static std::array w170; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); - std::generate(v0.begin(), v0.end(), f32rng); - std::generate(v1.begin(), v1.end(), f32rng); - std::generate(v2.begin(), v2.end(), f32rng); - std::generate(v3.begin(), v3.end(), f32rng); - std::generate(v4.begin(), v4.end(), f32rng); - std::generate(v5.begin(), v5.end(), f32rng); - std::generate(v6.begin(), v6.end(), f32rng); - std::generate(v7.begin(), v7.end(), f32rng); - std::generate(v8.begin(), v8.end(), f32rng); 
- std::generate(v9.begin(), v9.end(), f32rng); - std::generate(v10.begin(), v10.end(), f32rng); - std::generate(v11.begin(), v11.end(), f32rng); - std::generate(v12.begin(), v12.end(), f32rng); - std::generate(v13.begin(), v13.end(), f32rng); - std::generate(v14.begin(), v14.end(), f32rng); - std::generate(v15.begin(), v15.end(), f32rng); - std::generate(v16.begin(), v16.end(), f32rng); - std::generate(v17.begin(), v17.end(), f32rng); - std::generate(v18.begin(), v18.end(), f32rng); - std::generate(v19.begin(), v19.end(), f32rng); - std::generate(v20.begin(), v20.end(), f32rng); - std::generate(v21.begin(), v21.end(), f32rng); - std::generate(v22.begin(), v22.end(), f32rng); - std::generate(v23.begin(), v23.end(), f32rng); - std::generate(v24.begin(), v24.end(), f32rng); - std::generate(v25.begin(), v25.end(), f32rng); - std::generate(v26.begin(), v26.end(), f32rng); - std::generate(v27.begin(), v27.end(), f32rng); - std::generate(v28.begin(), v28.end(), f32rng); - std::generate(v29.begin(), v29.end(), f32rng); - std::generate(v30.begin(), v30.end(), f32rng); - std::generate(v31.begin(), v31.end(), f32rng); - std::generate(v32.begin(), v32.end(), f32rng); - std::generate(v33.begin(), v33.end(), f32rng); - std::generate(v34.begin(), v34.end(), f32rng); - std::generate(v35.begin(), v35.end(), f32rng); - std::generate(v36.begin(), v36.end(), f32rng); - std::generate(v37.begin(), v37.end(), f32rng); - std::generate(v38.begin(), v38.end(), f32rng); - std::generate(v39.begin(), v39.end(), f32rng); - std::generate(v40.begin(), v40.end(), f32rng); - std::generate(v41.begin(), v41.end(), f32rng); - std::generate(v42.begin(), v42.end(), f32rng); - std::generate(v43.begin(), v43.end(), f32rng); - std::generate(v44.begin(), v44.end(), f32rng); - std::generate(v45.begin(), v45.end(), f32rng); - std::generate(v46.begin(), v46.end(), f32rng); - std::generate(v47.begin(), v47.end(), f32rng); - std::generate(v48.begin(), v48.end(), f32rng); - std::generate(v49.begin(), v49.end(), 
f32rng); - std::generate(v50.begin(), v50.end(), f32rng); - std::generate(v51.begin(), v51.end(), f32rng); - std::generate(v52.begin(), v52.end(), f32rng); - std::generate(v53.begin(), v53.end(), f32rng); - std::generate(v54.begin(), v54.end(), f32rng); - std::generate(v55.begin(), v55.end(), f32rng); - std::generate(v56.begin(), v56.end(), f32rng); - std::generate(v57.begin(), v57.end(), f32rng); - std::generate(v58.begin(), v58.end(), f32rng); - std::generate(v59.begin(), v59.end(), f32rng); - std::generate(v60.begin(), v60.end(), f32rng); - std::generate(v61.begin(), v61.end(), f32rng); - std::generate(v62.begin(), v62.end(), f32rng); - std::generate(v63.begin(), v63.end(), f32rng); - std::generate(v64.begin(), v64.end(), f32rng); - std::generate(w65.begin(), w65.end(), f32rng); - std::generate(w66.begin(), w66.end(), f32rng); - std::generate(w67.begin(), w67.end(), f32rng); - std::generate(w68.begin(), w68.end(), f32rng); - std::generate(w69.begin(), w69.end(), f32rng); - std::generate(w70.begin(), w70.end(), f32rng); - std::generate(w71.begin(), w71.end(), f32rng); - std::generate(w72.begin(), w72.end(), f32rng); - std::generate(w73.begin(), w73.end(), f32rng); - std::generate(w74.begin(), w74.end(), f32rng); - std::generate(w75.begin(), w75.end(), f32rng); - std::generate(w76.begin(), w76.end(), f32rng); - std::generate(w77.begin(), w77.end(), f32rng); - std::generate(w78.begin(), w78.end(), f32rng); - std::generate(w79.begin(), w79.end(), f32rng); - std::generate(w80.begin(), w80.end(), f32rng); - std::generate(w81.begin(), w81.end(), f32rng); - std::generate(w82.begin(), w82.end(), f32rng); - std::generate(w83.begin(), w83.end(), f32rng); - std::generate(w84.begin(), w84.end(), f32rng); - std::generate(w85.begin(), w85.end(), f32rng); - std::generate(w86.begin(), w86.end(), f32rng); - std::generate(w87.begin(), w87.end(), f32rng); - std::generate(w88.begin(), w88.end(), f32rng); - std::generate(w89.begin(), w89.end(), f32rng); - std::generate(w90.begin(), 
w90.end(), f32rng); - std::generate(w91.begin(), w91.end(), f32rng); - std::generate(w92.begin(), w92.end(), f32rng); - std::generate(w93.begin(), w93.end(), f32rng); - std::generate(w94.begin(), w94.end(), f32rng); - std::generate(w95.begin(), w95.end(), f32rng); - std::generate(w96.begin(), w96.end(), f32rng); - std::generate(w97.begin(), w97.end(), f32rng); - std::generate(w98.begin(), w98.end(), f32rng); - std::generate(w99.begin(), w99.end(), f32rng); - std::generate(w100.begin(), w100.end(), f32rng); - std::generate(w101.begin(), w101.end(), f32rng); - std::generate(w102.begin(), w102.end(), f32rng); - std::generate(w103.begin(), w103.end(), f32rng); - std::generate(w104.begin(), w104.end(), f32rng); - std::generate(w105.begin(), w105.end(), f32rng); - std::generate(w106.begin(), w106.end(), f32rng); - std::generate(w107.begin(), w107.end(), f32rng); - std::generate(w108.begin(), w108.end(), f32rng); - std::generate(w109.begin(), w109.end(), f32rng); - std::generate(w110.begin(), w110.end(), f32rng); - std::generate(w111.begin(), w111.end(), f32rng); - std::generate(w112.begin(), w112.end(), f32rng); - std::generate(w113.begin(), w113.end(), f32rng); - std::generate(w114.begin(), w114.end(), f32rng); - std::generate(w115.begin(), w115.end(), f32rng); - std::generate(w116.begin(), w116.end(), f32rng); - std::generate(w117.begin(), w117.end(), f32rng); - std::generate(w118.begin(), w118.end(), f32rng); - std::generate(w119.begin(), w119.end(), f32rng); - std::generate(w120.begin(), w120.end(), f32rng); - std::generate(w121.begin(), w121.end(), f32rng); - std::generate(w122.begin(), w122.end(), f32rng); - std::generate(w123.begin(), w123.end(), f32rng); - std::generate(w124.begin(), w124.end(), f32rng); - std::generate(w125.begin(), w125.end(), f32rng); - std::generate(w126.begin(), w126.end(), f32rng); - std::generate(w127.begin(), w127.end(), f32rng); - std::generate(w128.begin(), w128.end(), f32rng); - std::generate(w129.begin(), w129.end(), f32rng); - 
std::generate(w130.begin(), w130.end(), f32rng); - std::generate(w131.begin(), w131.end(), f32rng); - std::generate(w132.begin(), w132.end(), f32rng); - std::generate(w133.begin(), w133.end(), f32rng); - std::generate(w134.begin(), w134.end(), f32rng); - std::generate(w135.begin(), w135.end(), f32rng); - std::generate(w136.begin(), w136.end(), f32rng); - std::generate(w137.begin(), w137.end(), f32rng); - std::generate(w138.begin(), w138.end(), f32rng); - std::generate(w139.begin(), w139.end(), f32rng); - std::generate(w140.begin(), w140.end(), f32rng); - std::generate(w141.begin(), w141.end(), f32rng); - std::generate(w142.begin(), w142.end(), f32rng); - std::generate(w143.begin(), w143.end(), f32rng); - std::generate(w144.begin(), w144.end(), f32rng); - std::generate(w145.begin(), w145.end(), f32rng); - std::generate(w146.begin(), w146.end(), f32rng); - std::generate(w147.begin(), w147.end(), f32rng); - std::generate(w148.begin(), w148.end(), f32rng); - std::generate(w149.begin(), w149.end(), f32rng); - std::generate(w150.begin(), w150.end(), f32rng); - std::generate(w151.begin(), w151.end(), f32rng); - std::generate(w152.begin(), w152.end(), f32rng); - std::generate(w153.begin(), w153.end(), f32rng); - std::generate(w154.begin(), w154.end(), f32rng); - std::generate(w155.begin(), w155.end(), f32rng); - std::generate(w156.begin(), w156.end(), f32rng); - std::generate(w157.begin(), w157.end(), f32rng); - std::generate(w158.begin(), w158.end(), f32rng); - std::generate(w159.begin(), w159.end(), f32rng); - std::generate(w160.begin(), w160.end(), f32rng); - std::generate(w161.begin(), w161.end(), f32rng); - std::generate(w162.begin(), w162.end(), f32rng); - std::generate(w163.begin(), w163.end(), f32rng); - std::generate(w164.begin(), w164.end(), f32rng); - std::generate(w165.begin(), w165.end(), f32rng); - std::generate(w166.begin(), w166.end(), f32rng); - std::generate(w167.begin(), w167.end(), f32rng); - std::generate(w168.begin(), w168.end(), f32rng); - 
std::generate(w169.begin(), w169.end(), f32rng); - std::generate(w170.begin(), w170.end(), f32rng); - - Operators operators; - xnn_status status; - xnn_code_cache* code_cache_ptr = nullptr; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 3 /* input channels per group */, - 32 /* output_channels_per_group */, - 3 /* input pixel stride */, - 32 /* output pixel stride */, - w65.data(), w66.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 32 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 32 /* input pixel stride */, - 32 /* output pixel stride */, - w67.data(), w68.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel 
height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 16 /* output_channels_per_group */, - 32 /* input pixel stride */, - 16 /* output pixel stride */, - w69.data(), w70.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 16 /* input channels per group */, - 96 /* output_channels_per_group */, - 16 /* input pixel stride */, - 96 /* output pixel stride */, - w71.data(), w72.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 96 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 96 /* input pixel stride */, - 96 /* output pixel stride */, - w73.data(), w74.data(), - 0.0f /* output min */, 6.0f /* 
output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 24 /* output_channels_per_group */, - 96 /* input pixel stride */, - 24 /* output pixel stride */, - w75.data(), w76.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 144 /* output_channels_per_group */, - 24 /* input pixel stride */, - 144 /* output pixel stride */, - w77.data(), w78.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding 
*/, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 144 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 144 /* input pixel stride */, - 144 /* output pixel stride */, - w79.data(), w80.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 24 /* output_channels_per_group */, - 144 /* input pixel stride */, - 24 /* output pixel stride */, - w81.data(), w82.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = 
xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 144 /* output_channels_per_group */, - 24 /* input pixel stride */, - 144 /* output pixel stride */, - w83.data(), w84.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 144 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 144 /* input pixel stride */, - 144 /* output pixel stride */, - w85.data(), w86.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 32 /* output_channels_per_group */, - 144 /* input 
pixel stride */, - 32 /* output pixel stride */, - w87.data(), w88.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 192 /* output_channels_per_group */, - 32 /* input pixel stride */, - 192 /* output pixel stride */, - w89.data(), w90.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 192 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 192 /* input pixel stride */, - 192 /* output pixel stride */, - w91.data(), w92.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, 
xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 192 /* input channels per group */, - 32 /* output_channels_per_group */, - 192 /* input pixel stride */, - 32 /* output pixel stride */, - w93.data(), w94.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 192 /* output_channels_per_group */, - 32 /* input pixel stride */, - 192 /* output pixel stride */, - w95.data(), w96.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return 
ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 192 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 192 /* input pixel stride */, - 192 /* output pixel stride */, - w97.data(), w98.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 192 /* input channels per group */, - 32 /* output_channels_per_group */, - 192 /* input pixel stride */, - 32 /* output pixel stride */, - w99.data(), w100.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to 
create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 192 /* output_channels_per_group */, - 32 /* input pixel stride */, - 192 /* output pixel stride */, - w101.data(), w102.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 192 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 192 /* input pixel stride */, - 192 /* output pixel stride */, - w103.data(), w104.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width 
*/, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 192 /* input channels per group */, - 64 /* output_channels_per_group */, - 192 /* input pixel stride */, - 64 /* output pixel stride */, - w105.data(), w106.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 384 /* output_channels_per_group */, - 64 /* input pixel stride */, - 384 /* output pixel stride */, - w107.data(), w108.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 384 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 384 /* input pixel stride */, - 384 /* output pixel stride */, - w109.data(), w110.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - 
&op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 384 /* input channels per group */, - 64 /* output_channels_per_group */, - 384 /* input pixel stride */, - 64 /* output pixel stride */, - w111.data(), w112.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 384 /* output_channels_per_group */, - 64 /* input pixel stride */, - 384 /* output pixel stride */, - w113.data(), w114.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 
/* flags */, - code_cache_ptr, - nullptr, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - xnn_operator_t op29 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 384 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 384 /* input pixel stride */, - 384 /* output pixel stride */, - w115.data(), w116.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op29); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #29" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op29, xnn_delete_operator); - - xnn_operator_t op30 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 384 /* input channels per group */, - 64 /* output_channels_per_group */, - 384 /* input pixel stride */, - 64 /* output pixel stride */, - w117.data(), w118.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op30); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #30" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op30, xnn_delete_operator); - - xnn_operator_t op31 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() 
/* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op31); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #31" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op31, xnn_delete_operator); - - xnn_operator_t op32 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 384 /* output_channels_per_group */, - 64 /* input pixel stride */, - 384 /* output pixel stride */, - w119.data(), w120.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op32); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #32" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op32, xnn_delete_operator); - - xnn_operator_t op33 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 384 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 384 /* input pixel stride */, - 384 /* output pixel stride */, - w121.data(), w122.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op33); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #33" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op33, xnn_delete_operator); - - xnn_operator_t op34 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right 
padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 384 /* input channels per group */, - 64 /* output_channels_per_group */, - 384 /* input pixel stride */, - 64 /* output pixel stride */, - w123.data(), w124.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op34); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #34" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op34, xnn_delete_operator); - - xnn_operator_t op35 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op35); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #35" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op35, xnn_delete_operator); - - xnn_operator_t op36 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 384 /* output_channels_per_group */, - 64 /* input pixel stride */, - 384 /* output pixel stride */, - w125.data(), w126.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op36); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #36" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op36, xnn_delete_operator); - - xnn_operator_t op37 = nullptr; - status = 
xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 384 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 384 /* input pixel stride */, - 384 /* output pixel stride */, - w127.data(), w128.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op37); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #37" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op37, xnn_delete_operator); - - xnn_operator_t op38 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 384 /* input channels per group */, - 96 /* output_channels_per_group */, - 384 /* input pixel stride */, - 96 /* output pixel stride */, - w129.data(), w130.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op38); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #38" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op38, xnn_delete_operator); - - xnn_operator_t op39 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, 
- 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - w131.data(), w132.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op39); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #39" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op39, xnn_delete_operator); - - xnn_operator_t op40 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - w133.data(), w134.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op40); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #40" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op40, xnn_delete_operator); - - xnn_operator_t op41 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 96 /* output_channels_per_group */, - 576 /* input pixel stride */, - 96 /* output pixel stride */, - w135.data(), w136.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op41); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #41" << std::endl; - 
return ExecutionPlan(); - } - operators.emplace_back(op41, xnn_delete_operator); - - xnn_operator_t op42 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op42); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #42" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op42, xnn_delete_operator); - - xnn_operator_t op43 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - w137.data(), w138.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op43); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #43" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op43, xnn_delete_operator); - - xnn_operator_t op44 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - w139.data(), w140.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op44); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #44" << std::endl; - return 
ExecutionPlan(); - } - operators.emplace_back(op44, xnn_delete_operator); - - xnn_operator_t op45 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 96 /* output_channels_per_group */, - 576 /* input pixel stride */, - 96 /* output pixel stride */, - w141.data(), w142.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op45); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #45" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op45, xnn_delete_operator); - - xnn_operator_t op46 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op46); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #46" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op46, xnn_delete_operator); - - xnn_operator_t op47 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - w143.data(), w144.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op47); - if (status != xnn_status_success) { - std::cerr << "failed to 
create operation #47" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op47, xnn_delete_operator); - - xnn_operator_t op48 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - w145.data(), w146.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op48); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #48" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op48, xnn_delete_operator); - - xnn_operator_t op49 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 160 /* output_channels_per_group */, - 576 /* input pixel stride */, - 160 /* output pixel stride */, - w147.data(), w148.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op49); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #49" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op49, xnn_delete_operator); - - xnn_operator_t op50 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width 
*/, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - w149.data(), w150.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op50); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #50" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op50, xnn_delete_operator); - - xnn_operator_t op51 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, - 960 /* output pixel stride */, - w151.data(), w152.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op51); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #51" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op51, xnn_delete_operator); - - xnn_operator_t op52 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 160 /* output_channels_per_group */, - 960 /* input pixel stride */, - 160 /* output pixel stride */, - w153.data(), w154.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* 
output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op52); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #52" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op52, xnn_delete_operator); - - xnn_operator_t op53 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op53); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #53" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op53, xnn_delete_operator); - - xnn_operator_t op54 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - w155.data(), w156.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op54); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #54" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op54, xnn_delete_operator); - - xnn_operator_t op55 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, - 960 /* output pixel stride */, - w157.data(), w158.data(), - 0.0f /* output min */, 6.0f /* output 
max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op55); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #55" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op55, xnn_delete_operator); - - xnn_operator_t op56 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 160 /* output_channels_per_group */, - 960 /* input pixel stride */, - 160 /* output pixel stride */, - w159.data(), w160.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op56); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #56" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op56, xnn_delete_operator); - - xnn_operator_t op57 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op57); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #57" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op57, xnn_delete_operator); - - xnn_operator_t op58 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - w161.data(), 
w162.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op58); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #58" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op58, xnn_delete_operator); - - xnn_operator_t op59 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, - 960 /* output pixel stride */, - w163.data(), w164.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op59); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #59" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op59, xnn_delete_operator); - - xnn_operator_t op60 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 320 /* output_channels_per_group */, - 960 /* input pixel stride */, - 320 /* output pixel stride */, - w165.data(), w166.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op60); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #60" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op60, xnn_delete_operator); - - xnn_operator_t op61 = nullptr; - 
status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 320 /* input channels per group */, - 1280 /* output_channels_per_group */, - 320 /* input pixel stride */, - 1280 /* output pixel stride */, - w167.data(), w168.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op61); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #61" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op61, xnn_delete_operator); - - xnn_operator_t op62 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op62); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #62" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op62, xnn_delete_operator); - - xnn_operator_t op63 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 1280 /* input channels per group */, - 1001 /* output_channels_per_group */, - 1280 /* input pixel stride */, - 1001 /* output pixel stride */, - w169.data(), w170.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op63); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #63" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op63, 
xnn_delete_operator); - - size_t op0_workspace_size = 0; - size_t op0_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - &op0_workspace_size, &op0_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op0_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - size_t op1_workspace_size = 0; - size_t op1_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op1, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op1_workspace_size, &op1_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op1_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - size_t op2_workspace_size = 0; - size_t op2_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op2_workspace_size, &op2_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op2_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #2" << std::endl; - return ExecutionPlan(); - } - - size_t op3_workspace_size = 0; - size_t op3_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op3, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op3_workspace_size, &op3_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = 
std::max(max_workspace_size, op3_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - size_t op4_workspace_size = 0; - size_t op4_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op4, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op4_workspace_size, &op4_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op4_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - size_t op5_workspace_size = 0; - size_t op5_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op5, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op5_workspace_size, &op5_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op5_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - size_t op6_workspace_size = 0; - size_t op6_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op6, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op6_workspace_size, &op6_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op6_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - size_t op7_workspace_size = 0; - size_t op7_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op7_workspace_size, 
&op7_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op7_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - size_t op8_workspace_size = 0; - size_t op8_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op8, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op8_workspace_size, &op8_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op8_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 56, 56, 24 }; - const size_t b_shape[] = { 1, 56, 56, 24 }; - status = xnn_reshape_add_nd_f16( - op9, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - size_t op10_workspace_size = 0; - size_t op10_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op10, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op10_workspace_size, &op10_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op10_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - size_t op11_workspace_size = 0; - size_t op11_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op11, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op11_workspace_size, &op11_workspace_alignment, - /*output_height_out=*/nullptr, 
/*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op11_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - size_t op12_workspace_size = 0; - size_t op12_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op12, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op12_workspace_size, &op12_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op12_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - size_t op13_workspace_size = 0; - size_t op13_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op13, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op13_workspace_size, &op13_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op13_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - size_t op14_workspace_size = 0; - size_t op14_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op14, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op14_workspace_size, &op14_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op14_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - size_t op15_workspace_size = 0; - size_t op15_workspace_alignment = 0; - status = 
xnn_reshape_convolution2d_nhwc_f16( - op15, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op15_workspace_size, &op15_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op15_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 32 }; - const size_t b_shape[] = { 1, 28, 28, 32 }; - status = xnn_reshape_add_nd_f16( - op16, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - size_t op17_workspace_size = 0; - size_t op17_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op17, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op17_workspace_size, &op17_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op17_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - size_t op18_workspace_size = 0; - size_t op18_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op18, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op18_workspace_size, &op18_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op18_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - size_t op19_workspace_size = 0; - size_t op19_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op19, - 
/*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op19_workspace_size, &op19_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op19_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 32 }; - const size_t b_shape[] = { 1, 28, 28, 32 }; - status = xnn_reshape_add_nd_f16( - op20, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - size_t op21_workspace_size = 0; - size_t op21_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op21, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op21_workspace_size, &op21_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op21_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - size_t op22_workspace_size = 0; - size_t op22_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op22, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op22_workspace_size, &op22_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op22_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - size_t op23_workspace_size = 0; - size_t op23_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op23, - /*batch_size=*/1, /*input_height=*/14, 
/*input_width=*/14, - &op23_workspace_size, &op23_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op23_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - size_t op24_workspace_size = 0; - size_t op24_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op24, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op24_workspace_size, &op24_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op24_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - size_t op25_workspace_size = 0; - size_t op25_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op25, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op25_workspace_size, &op25_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op25_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - size_t op26_workspace_size = 0; - size_t op26_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op26, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op26_workspace_size, &op26_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op26_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - { - 
const size_t a_shape[] = { 1, 14, 14, 64 }; - const size_t b_shape[] = { 1, 14, 14, 64 }; - status = xnn_reshape_add_nd_f16( - op27, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - size_t op28_workspace_size = 0; - size_t op28_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op28, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op28_workspace_size, &op28_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op28_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - size_t op29_workspace_size = 0; - size_t op29_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op29, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op29_workspace_size, &op29_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op29_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #29" << std::endl; - return ExecutionPlan(); - } - - size_t op30_workspace_size = 0; - size_t op30_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op30, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op30_workspace_size, &op30_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op30_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #30" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 64 }; - 
const size_t b_shape[] = { 1, 14, 14, 64 }; - status = xnn_reshape_add_nd_f16( - op31, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #31" << std::endl; - return ExecutionPlan(); - } - - size_t op32_workspace_size = 0; - size_t op32_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op32, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op32_workspace_size, &op32_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op32_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #32" << std::endl; - return ExecutionPlan(); - } - - size_t op33_workspace_size = 0; - size_t op33_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op33, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op33_workspace_size, &op33_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op33_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #33" << std::endl; - return ExecutionPlan(); - } - - size_t op34_workspace_size = 0; - size_t op34_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op34, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op34_workspace_size, &op34_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op34_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #34" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 64 }; - const size_t b_shape[] = { 1, 14, 14, 64 }; - 
status = xnn_reshape_add_nd_f16( - op35, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #35" << std::endl; - return ExecutionPlan(); - } - - size_t op36_workspace_size = 0; - size_t op36_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op36, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op36_workspace_size, &op36_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op36_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #36" << std::endl; - return ExecutionPlan(); - } - - size_t op37_workspace_size = 0; - size_t op37_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op37, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op37_workspace_size, &op37_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op37_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #37" << std::endl; - return ExecutionPlan(); - } - - size_t op38_workspace_size = 0; - size_t op38_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op38, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op38_workspace_size, &op38_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op38_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #38" << std::endl; - return ExecutionPlan(); - } - - size_t op39_workspace_size = 0; - size_t op39_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op39, - 
/*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op39_workspace_size, &op39_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op39_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #39" << std::endl; - return ExecutionPlan(); - } - - size_t op40_workspace_size = 0; - size_t op40_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op40, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op40_workspace_size, &op40_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op40_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #40" << std::endl; - return ExecutionPlan(); - } - - size_t op41_workspace_size = 0; - size_t op41_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op41, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op41_workspace_size, &op41_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op41_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #41" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 96 }; - const size_t b_shape[] = { 1, 14, 14, 96 }; - status = xnn_reshape_add_nd_f16( - op42, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #42" << std::endl; - return ExecutionPlan(); - } - - size_t op43_workspace_size = 0; - size_t op43_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op43, - /*batch_size=*/1, /*input_height=*/14, 
/*input_width=*/14, - &op43_workspace_size, &op43_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op43_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #43" << std::endl; - return ExecutionPlan(); - } - - size_t op44_workspace_size = 0; - size_t op44_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op44, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op44_workspace_size, &op44_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op44_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #44" << std::endl; - return ExecutionPlan(); - } - - size_t op45_workspace_size = 0; - size_t op45_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op45, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op45_workspace_size, &op45_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op45_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #45" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 96 }; - const size_t b_shape[] = { 1, 14, 14, 96 }; - status = xnn_reshape_add_nd_f16( - op46, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #46" << std::endl; - return ExecutionPlan(); - } - - size_t op47_workspace_size = 0; - size_t op47_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op47, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op47_workspace_size, 
&op47_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op47_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #47" << std::endl; - return ExecutionPlan(); - } - - size_t op48_workspace_size = 0; - size_t op48_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op48, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op48_workspace_size, &op48_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op48_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #48" << std::endl; - return ExecutionPlan(); - } - - size_t op49_workspace_size = 0; - size_t op49_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op49, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op49_workspace_size, &op49_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op49_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #49" << std::endl; - return ExecutionPlan(); - } - - size_t op50_workspace_size = 0; - size_t op50_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op50, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op50_workspace_size, &op50_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op50_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #50" << std::endl; - return ExecutionPlan(); - } - - size_t op51_workspace_size = 0; - size_t 
op51_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op51, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op51_workspace_size, &op51_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op51_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #51" << std::endl; - return ExecutionPlan(); - } - - size_t op52_workspace_size = 0; - size_t op52_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op52, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op52_workspace_size, &op52_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op52_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #52" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 160 }; - const size_t b_shape[] = { 1, 7, 7, 160 }; - status = xnn_reshape_add_nd_f16( - op53, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #53" << std::endl; - return ExecutionPlan(); - } - - size_t op54_workspace_size = 0; - size_t op54_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op54, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op54_workspace_size, &op54_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op54_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #54" << std::endl; - return ExecutionPlan(); - } - - size_t op55_workspace_size = 0; - size_t op55_workspace_alignment = 0; - status = 
xnn_reshape_convolution2d_nhwc_f16( - op55, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op55_workspace_size, &op55_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op55_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #55" << std::endl; - return ExecutionPlan(); - } - - size_t op56_workspace_size = 0; - size_t op56_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op56, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op56_workspace_size, &op56_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op56_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #56" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 160 }; - const size_t b_shape[] = { 1, 7, 7, 160 }; - status = xnn_reshape_add_nd_f16( - op57, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #57" << std::endl; - return ExecutionPlan(); - } - - size_t op58_workspace_size = 0; - size_t op58_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op58, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op58_workspace_size, &op58_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op58_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #58" << std::endl; - return ExecutionPlan(); - } - - size_t op59_workspace_size = 0; - size_t op59_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op59, - 
/*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op59_workspace_size, &op59_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op59_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #59" << std::endl; - return ExecutionPlan(); - } - - size_t op60_workspace_size = 0; - size_t op60_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op60, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op60_workspace_size, &op60_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op60_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #60" << std::endl; - return ExecutionPlan(); - } - - size_t op61_workspace_size = 0; - size_t op61_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op61, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op61_workspace_size, &op61_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op61_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #61" << std::endl; - return ExecutionPlan(); - } - - size_t op62_workspace_size = 0; - size_t op62_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op62, - /*batch_size=*/1, 49 /* width */, - 1280 /* channels */, 1280 /* input stride */, 1280 /* output stride */, - &op62_workspace_size, &op62_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op62_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #62" << std::endl; - return 
ExecutionPlan(); - } - - size_t op63_workspace_size = 0; - size_t op63_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op63, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op63_workspace_size, &op63_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op63_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #63" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nhwc_f16( - op0, - workspace.data(), /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op1, - workspace.data(), /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op2, - workspace.data(), /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op3, - workspace.data(), /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op4, - workspace.data(), /*input=*/v4.data(), /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op5, - workspace.data(), /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - 
std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op6, - workspace.data(), /*input=*/v6.data(), /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op7, - workspace.data(), /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op8, - workspace.data(), /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op9, - v9.data() /* a */, v6.data() /* b */, /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op10, - workspace.data(), /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op11, - workspace.data(), /*input=*/v11.data(), /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op12, - workspace.data(), /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op13, - workspace.data(), /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to 
setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op14, - workspace.data(), /*input=*/v14.data(), /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op15, - workspace.data(), /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op16, - v16.data() /* a */, v13.data() /* b */, /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op17, - workspace.data(), /*input=*/v17.data(), /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op18, - workspace.data(), /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op19, - workspace.data(), /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op20, - v20.data() /* a */, v17.data() /* b */, /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op21, - workspace.data(), /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << 
std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op22, - workspace.data(), /*input=*/v22.data(), /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op23, - workspace.data(), /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op24, - workspace.data(), /*input=*/v24.data(), /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op25, - workspace.data(), /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op26, - workspace.data(), /*input=*/v26.data(), /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op27, - v27.data() /* a */, v24.data() /* b */, /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op28, - workspace.data(), /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op29, - workspace.data(), /*input=*/v29.data(), /*output=*/v30.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #29" << std::endl; 
- return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op30, - workspace.data(), /*input=*/v30.data(), /*output=*/v31.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #30" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op31, - v31.data() /* a */, v28.data() /* b */, /*output=*/v32.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op32, - workspace.data(), /*input=*/v32.data(), /*output=*/v33.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #32" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op33, - workspace.data(), /*input=*/v33.data(), /*output=*/v34.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #33" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op34, - workspace.data(), /*input=*/v34.data(), /*output=*/v35.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op35, - v35.data() /* a */, v32.data() /* b */, /*output=*/v36.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op36, - workspace.data(), /*input=*/v36.data(), /*output=*/v37.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op37, - workspace.data(), /*input=*/v37.data(), /*output=*/v38.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #37" << std::endl; - return 
ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op38, - workspace.data(), /*input=*/v38.data(), /*output=*/v39.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op39, - workspace.data(), /*input=*/v39.data(), /*output=*/v40.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op40, - workspace.data(), /*input=*/v40.data(), /*output=*/v41.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op41, - workspace.data(), /*input=*/v41.data(), /*output=*/v42.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #41" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op42, - v42.data() /* a */, v39.data() /* b */, /*output=*/v43.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op43, - workspace.data(), /*input=*/v43.data(), /*output=*/v44.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op44, - workspace.data(), /*input=*/v44.data(), /*output=*/v45.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op45, - workspace.data(), /*input=*/v45.data(), /*output=*/v46.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #45" << std::endl; - return 
ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op46, - v46.data() /* a */, v43.data() /* b */, /*output=*/v47.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op47, - workspace.data(), /*input=*/v47.data(), /*output=*/v48.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op48, - workspace.data(), /*input=*/v48.data(), /*output=*/v49.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op49, - workspace.data(), /*input=*/v49.data(), /*output=*/v50.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #49" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op50, - workspace.data(), /*input=*/v50.data(), /*output=*/v51.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #50" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op51, - workspace.data(), /*input=*/v51.data(), /*output=*/v52.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op52, - workspace.data(), /*input=*/v52.data(), /*output=*/v53.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op53, - v53.data() /* a */, v50.data() /* b */, /*output=*/v54.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #53" << std::endl; - return ExecutionPlan(); - } - 
- status = xnn_setup_convolution2d_nhwc_f16( - op54, - workspace.data(), /*input=*/v54.data(), /*output=*/v55.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op55, - workspace.data(), /*input=*/v55.data(), /*output=*/v56.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op56, - workspace.data(), /*input=*/v56.data(), /*output=*/v57.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #56" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op57, - v57.data() /* a */, v54.data() /* b */, /*output=*/v58.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op58, - workspace.data(), /*input=*/v58.data(), /*output=*/v59.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op59, - workspace.data(), /*input=*/v59.data(), /*output=*/v60.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #59" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op60, - workspace.data(), /*input=*/v60.data(), /*output=*/v61.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op61, - workspace.data(), /*input=*/v61.data(), /*output=*/v62.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #61" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_global_average_pooling_nwc_f16( - op62, - workspace.data(), - /*input=*/v62.data(), /*output=*/v63.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #62" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op63, - workspace.data(), /*input=*/v63.data(), /*output=*/v64.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #63" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -ExecutionPlan FP16MobileNetV2(pthreadpool_t threadpool) { - return FP16MobileNetV2(/*use_jit=*/false, threadpool); -} - -ExecutionPlan FP16MobileNetV2Jit(pthreadpool_t threadpool) { - return FP16MobileNetV2(/*use_jit=*/true, threadpool); -} - -} // namespace models diff --git a/models/fp16-mobilenet-v3-large.cc b/models/fp16-mobilenet-v3-large.cc deleted file mode 100644 index de5c49fc8c3..00000000000 --- a/models/fp16-mobilenet-v3-large.cc +++ /dev/null @@ -1,5023 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan FP16MobileNetV3Large(bool use_jit, pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static std::array v30; - alignas(16) static std::array v31; - alignas(16) static std::array v32; - alignas(16) static std::array v33; - alignas(16) static std::array v34; - alignas(16) static std::array v35; - alignas(16) static std::array v36; - alignas(16) static std::array v37; - alignas(16) static std::array v38; - alignas(16) static std::array v39; - alignas(16) static std::array v40; - alignas(16) static std::array v41; - alignas(16) static std::array v42; - alignas(16) static std::array v43; - alignas(16) static std::array v44; - alignas(16) static std::array v45; - 
alignas(16) static std::array v46; - alignas(16) static std::array v47; - alignas(16) static std::array v48; - alignas(16) static std::array v49; - alignas(16) static std::array v50; - alignas(16) static std::array v51; - alignas(16) static std::array v52; - alignas(16) static std::array v53; - alignas(16) static std::array v54; - alignas(16) static std::array v55; - alignas(16) static std::array v56; - alignas(16) static std::array v57; - alignas(16) static std::array v58; - alignas(16) static std::array v59; - alignas(16) static std::array v60; - alignas(16) static std::array v61; - alignas(16) static std::array v62; - alignas(16) static std::array v63; - alignas(16) static std::array v64; - alignas(16) static std::array v65; - alignas(16) static std::array v66; - alignas(16) static std::array v67; - alignas(16) static std::array v68; - alignas(16) static std::array v69; - alignas(16) static std::array v70; - alignas(16) static std::array v71; - alignas(16) static std::array v72; - alignas(16) static std::array v73; - alignas(16) static std::array v74; - alignas(16) static std::array v75; - alignas(16) static std::array v76; - alignas(16) static std::array v77; - alignas(16) static std::array v78; - alignas(16) static std::array v79; - alignas(16) static std::array v80; - alignas(16) static std::array v81; - alignas(16) static std::array v82; - alignas(16) static std::array v83; - alignas(16) static std::array v84; - alignas(16) static std::array v85; - alignas(16) static std::array v86; - alignas(16) static std::array v87; - alignas(16) static std::array v88; - alignas(16) static std::array v89; - alignas(16) static std::array v90; - alignas(16) static std::array v91; - alignas(16) static std::array v92; - alignas(16) static std::array v93; - alignas(16) static std::array v94; - alignas(16) static std::array v95; - alignas(16) static std::array v96; - alignas(16) static std::array v97; - alignas(16) static std::array v98; - alignas(16) static std::array v99; - 
alignas(16) static std::array v100; - alignas(16) static std::array v101; - alignas(16) static std::array v102; - alignas(16) static std::array v103; - alignas(16) static std::array v104; - alignas(16) static std::array v105; - alignas(16) static std::array v106; - alignas(16) static std::array v107; - alignas(16) static std::array v108; - alignas(16) static std::array v109; - alignas(16) static std::array v110; - alignas(16) static std::array v111; - alignas(16) static std::array v112; - alignas(16) static std::array v113; - alignas(16) static std::array w114; - alignas(16) static std::array w115; - alignas(16) static std::array w116; - alignas(16) static std::array w117; - alignas(16) static std::array w118; - alignas(16) static std::array w119; - alignas(16) static std::array w120; - alignas(16) static std::array w121; - alignas(16) static std::array w122; - alignas(16) static std::array w123; - alignas(16) static std::array w124; - alignas(16) static std::array w125; - alignas(16) static std::array w126; - alignas(16) static std::array w127; - alignas(16) static std::array w128; - alignas(16) static std::array w129; - alignas(16) static std::array w130; - alignas(16) static std::array w131; - alignas(16) static std::array w132; - alignas(16) static std::array w133; - alignas(16) static std::array w134; - alignas(16) static std::array w135; - alignas(16) static std::array w136; - alignas(16) static std::array w137; - alignas(16) static std::array w138; - alignas(16) static std::array w139; - alignas(16) static std::array w140; - alignas(16) static std::array w141; - alignas(16) static std::array w142; - alignas(16) static std::array w143; - alignas(16) static std::array w144; - alignas(16) static std::array w145; - alignas(16) static std::array w146; - alignas(16) static std::array w147; - alignas(16) static std::array w148; - alignas(16) static std::array w149; - alignas(16) static std::array w150; - alignas(16) static std::array w151; - alignas(16) static 
std::array w152; - alignas(16) static std::array w153; - alignas(16) static std::array w154; - alignas(16) static std::array w155; - alignas(16) static std::array w156; - alignas(16) static std::array w157; - alignas(16) static std::array w158; - alignas(16) static std::array w159; - alignas(16) static std::array w160; - alignas(16) static std::array w161; - alignas(16) static std::array w162; - alignas(16) static std::array w163; - alignas(16) static std::array w164; - alignas(16) static std::array w165; - alignas(16) static std::array w166; - alignas(16) static std::array w167; - alignas(16) static std::array w168; - alignas(16) static std::array w169; - alignas(16) static std::array w170; - alignas(16) static std::array w171; - alignas(16) static std::array w172; - alignas(16) static std::array w173; - alignas(16) static std::array w174; - alignas(16) static std::array w175; - alignas(16) static std::array w176; - alignas(16) static std::array w177; - alignas(16) static std::array w178; - alignas(16) static std::array w179; - alignas(16) static std::array w180; - alignas(16) static std::array w181; - alignas(16) static std::array w182; - alignas(16) static std::array w183; - alignas(16) static std::array w184; - alignas(16) static std::array w185; - alignas(16) static std::array w186; - alignas(16) static std::array w187; - alignas(16) static std::array w188; - alignas(16) static std::array w189; - alignas(16) static std::array w190; - alignas(16) static std::array w191; - alignas(16) static std::array w192; - alignas(16) static std::array w193; - alignas(16) static std::array w194; - alignas(16) static std::array w195; - alignas(16) static std::array w196; - alignas(16) static std::array w197; - alignas(16) static std::array w198; - alignas(16) static std::array w199; - alignas(16) static std::array w200; - alignas(16) static std::array w201; - alignas(16) static std::array w202; - alignas(16) static std::array w203; - alignas(16) static std::array w204; - 
alignas(16) static std::array w205; - alignas(16) static std::array w206; - alignas(16) static std::array w207; - alignas(16) static std::array w208; - alignas(16) static std::array w209; - alignas(16) static std::array w210; - alignas(16) static std::array w211; - alignas(16) static std::array w212; - alignas(16) static std::array w213; - alignas(16) static std::array w214; - alignas(16) static std::array w215; - alignas(16) static std::array w216; - alignas(16) static std::array w217; - alignas(16) static std::array w218; - alignas(16) static std::array w219; - alignas(16) static std::array w220; - alignas(16) static std::array w221; - alignas(16) static std::array w222; - alignas(16) static std::array w223; - alignas(16) static std::array w224; - alignas(16) static std::array w225; - alignas(16) static std::array w226; - alignas(16) static std::array w227; - alignas(16) static std::array w228; - alignas(16) static std::array w229; - alignas(16) static std::array w230; - alignas(16) static std::array w231; - alignas(16) static std::array w232; - alignas(16) static std::array w233; - alignas(16) static std::array w234; - alignas(16) static std::array w235; - alignas(16) static std::array w236; - alignas(16) static std::array w237; - alignas(16) static std::array w238; - alignas(16) static std::array w239; - alignas(16) static std::array w240; - alignas(16) static std::array w241; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); - std::generate(v0.begin(), v0.end(), f32rng); - std::generate(v1.begin(), v1.end(), f32rng); - std::generate(v2.begin(), v2.end(), f32rng); - std::generate(v3.begin(), v3.end(), f32rng); - std::generate(v4.begin(), v4.end(), f32rng); - std::generate(v5.begin(), v5.end(), f32rng); - std::generate(v6.begin(), v6.end(), f32rng); - std::generate(v7.begin(), v7.end(), f32rng); - std::generate(v8.begin(), v8.end(), f32rng); - 
std::generate(v9.begin(), v9.end(), f32rng); - std::generate(v10.begin(), v10.end(), f32rng); - std::generate(v11.begin(), v11.end(), f32rng); - std::generate(v12.begin(), v12.end(), f32rng); - std::generate(v13.begin(), v13.end(), f32rng); - std::generate(v14.begin(), v14.end(), f32rng); - std::generate(v15.begin(), v15.end(), f32rng); - std::generate(v16.begin(), v16.end(), f32rng); - std::generate(v17.begin(), v17.end(), f32rng); - std::generate(v18.begin(), v18.end(), f32rng); - std::generate(v19.begin(), v19.end(), f32rng); - std::generate(v20.begin(), v20.end(), f32rng); - std::generate(v21.begin(), v21.end(), f32rng); - std::generate(v22.begin(), v22.end(), f32rng); - std::generate(v23.begin(), v23.end(), f32rng); - std::generate(v24.begin(), v24.end(), f32rng); - std::generate(v25.begin(), v25.end(), f32rng); - std::generate(v26.begin(), v26.end(), f32rng); - std::generate(v27.begin(), v27.end(), f32rng); - std::generate(v28.begin(), v28.end(), f32rng); - std::generate(v29.begin(), v29.end(), f32rng); - std::generate(v30.begin(), v30.end(), f32rng); - std::generate(v31.begin(), v31.end(), f32rng); - std::generate(v32.begin(), v32.end(), f32rng); - std::generate(v33.begin(), v33.end(), f32rng); - std::generate(v34.begin(), v34.end(), f32rng); - std::generate(v35.begin(), v35.end(), f32rng); - std::generate(v36.begin(), v36.end(), f32rng); - std::generate(v37.begin(), v37.end(), f32rng); - std::generate(v38.begin(), v38.end(), f32rng); - std::generate(v39.begin(), v39.end(), f32rng); - std::generate(v40.begin(), v40.end(), f32rng); - std::generate(v41.begin(), v41.end(), f32rng); - std::generate(v42.begin(), v42.end(), f32rng); - std::generate(v43.begin(), v43.end(), f32rng); - std::generate(v44.begin(), v44.end(), f32rng); - std::generate(v45.begin(), v45.end(), f32rng); - std::generate(v46.begin(), v46.end(), f32rng); - std::generate(v47.begin(), v47.end(), f32rng); - std::generate(v48.begin(), v48.end(), f32rng); - std::generate(v49.begin(), v49.end(), 
f32rng); - std::generate(v50.begin(), v50.end(), f32rng); - std::generate(v51.begin(), v51.end(), f32rng); - std::generate(v52.begin(), v52.end(), f32rng); - std::generate(v53.begin(), v53.end(), f32rng); - std::generate(v54.begin(), v54.end(), f32rng); - std::generate(v55.begin(), v55.end(), f32rng); - std::generate(v56.begin(), v56.end(), f32rng); - std::generate(v57.begin(), v57.end(), f32rng); - std::generate(v58.begin(), v58.end(), f32rng); - std::generate(v59.begin(), v59.end(), f32rng); - std::generate(v60.begin(), v60.end(), f32rng); - std::generate(v61.begin(), v61.end(), f32rng); - std::generate(v62.begin(), v62.end(), f32rng); - std::generate(v63.begin(), v63.end(), f32rng); - std::generate(v64.begin(), v64.end(), f32rng); - std::generate(v65.begin(), v65.end(), f32rng); - std::generate(v66.begin(), v66.end(), f32rng); - std::generate(v67.begin(), v67.end(), f32rng); - std::generate(v68.begin(), v68.end(), f32rng); - std::generate(v69.begin(), v69.end(), f32rng); - std::generate(v70.begin(), v70.end(), f32rng); - std::generate(v71.begin(), v71.end(), f32rng); - std::generate(v72.begin(), v72.end(), f32rng); - std::generate(v73.begin(), v73.end(), f32rng); - std::generate(v74.begin(), v74.end(), f32rng); - std::generate(v75.begin(), v75.end(), f32rng); - std::generate(v76.begin(), v76.end(), f32rng); - std::generate(v77.begin(), v77.end(), f32rng); - std::generate(v78.begin(), v78.end(), f32rng); - std::generate(v79.begin(), v79.end(), f32rng); - std::generate(v80.begin(), v80.end(), f32rng); - std::generate(v81.begin(), v81.end(), f32rng); - std::generate(v82.begin(), v82.end(), f32rng); - std::generate(v83.begin(), v83.end(), f32rng); - std::generate(v84.begin(), v84.end(), f32rng); - std::generate(v85.begin(), v85.end(), f32rng); - std::generate(v86.begin(), v86.end(), f32rng); - std::generate(v87.begin(), v87.end(), f32rng); - std::generate(v88.begin(), v88.end(), f32rng); - std::generate(v89.begin(), v89.end(), f32rng); - std::generate(v90.begin(), 
v90.end(), f32rng); - std::generate(v91.begin(), v91.end(), f32rng); - std::generate(v92.begin(), v92.end(), f32rng); - std::generate(v93.begin(), v93.end(), f32rng); - std::generate(v94.begin(), v94.end(), f32rng); - std::generate(v95.begin(), v95.end(), f32rng); - std::generate(v96.begin(), v96.end(), f32rng); - std::generate(v97.begin(), v97.end(), f32rng); - std::generate(v98.begin(), v98.end(), f32rng); - std::generate(v99.begin(), v99.end(), f32rng); - std::generate(v100.begin(), v100.end(), f32rng); - std::generate(v101.begin(), v101.end(), f32rng); - std::generate(v102.begin(), v102.end(), f32rng); - std::generate(v103.begin(), v103.end(), f32rng); - std::generate(v104.begin(), v104.end(), f32rng); - std::generate(v105.begin(), v105.end(), f32rng); - std::generate(v106.begin(), v106.end(), f32rng); - std::generate(v107.begin(), v107.end(), f32rng); - std::generate(v108.begin(), v108.end(), f32rng); - std::generate(v109.begin(), v109.end(), f32rng); - std::generate(v110.begin(), v110.end(), f32rng); - std::generate(v111.begin(), v111.end(), f32rng); - std::generate(v112.begin(), v112.end(), f32rng); - std::generate(v113.begin(), v113.end(), f32rng); - std::generate(w114.begin(), w114.end(), f32rng); - std::generate(w115.begin(), w115.end(), f32rng); - std::generate(w116.begin(), w116.end(), f32rng); - std::generate(w117.begin(), w117.end(), f32rng); - std::generate(w118.begin(), w118.end(), f32rng); - std::generate(w119.begin(), w119.end(), f32rng); - std::generate(w120.begin(), w120.end(), f32rng); - std::generate(w121.begin(), w121.end(), f32rng); - std::generate(w122.begin(), w122.end(), f32rng); - std::generate(w123.begin(), w123.end(), f32rng); - std::generate(w124.begin(), w124.end(), f32rng); - std::generate(w125.begin(), w125.end(), f32rng); - std::generate(w126.begin(), w126.end(), f32rng); - std::generate(w127.begin(), w127.end(), f32rng); - std::generate(w128.begin(), w128.end(), f32rng); - std::generate(w129.begin(), w129.end(), f32rng); - 
std::generate(w130.begin(), w130.end(), f32rng); - std::generate(w131.begin(), w131.end(), f32rng); - std::generate(w132.begin(), w132.end(), f32rng); - std::generate(w133.begin(), w133.end(), f32rng); - std::generate(w134.begin(), w134.end(), f32rng); - std::generate(w135.begin(), w135.end(), f32rng); - std::generate(w136.begin(), w136.end(), f32rng); - std::generate(w137.begin(), w137.end(), f32rng); - std::generate(w138.begin(), w138.end(), f32rng); - std::generate(w139.begin(), w139.end(), f32rng); - std::generate(w140.begin(), w140.end(), f32rng); - std::generate(w141.begin(), w141.end(), f32rng); - std::generate(w142.begin(), w142.end(), f32rng); - std::generate(w143.begin(), w143.end(), f32rng); - std::generate(w144.begin(), w144.end(), f32rng); - std::generate(w145.begin(), w145.end(), f32rng); - std::generate(w146.begin(), w146.end(), f32rng); - std::generate(w147.begin(), w147.end(), f32rng); - std::generate(w148.begin(), w148.end(), f32rng); - std::generate(w149.begin(), w149.end(), f32rng); - std::generate(w150.begin(), w150.end(), f32rng); - std::generate(w151.begin(), w151.end(), f32rng); - std::generate(w152.begin(), w152.end(), f32rng); - std::generate(w153.begin(), w153.end(), f32rng); - std::generate(w154.begin(), w154.end(), f32rng); - std::generate(w155.begin(), w155.end(), f32rng); - std::generate(w156.begin(), w156.end(), f32rng); - std::generate(w157.begin(), w157.end(), f32rng); - std::generate(w158.begin(), w158.end(), f32rng); - std::generate(w159.begin(), w159.end(), f32rng); - std::generate(w160.begin(), w160.end(), f32rng); - std::generate(w161.begin(), w161.end(), f32rng); - std::generate(w162.begin(), w162.end(), f32rng); - std::generate(w163.begin(), w163.end(), f32rng); - std::generate(w164.begin(), w164.end(), f32rng); - std::generate(w165.begin(), w165.end(), f32rng); - std::generate(w166.begin(), w166.end(), f32rng); - std::generate(w167.begin(), w167.end(), f32rng); - std::generate(w168.begin(), w168.end(), f32rng); - 
std::generate(w169.begin(), w169.end(), f32rng); - std::generate(w170.begin(), w170.end(), f32rng); - std::generate(w171.begin(), w171.end(), f32rng); - std::generate(w172.begin(), w172.end(), f32rng); - std::generate(w173.begin(), w173.end(), f32rng); - std::generate(w174.begin(), w174.end(), f32rng); - std::generate(w175.begin(), w175.end(), f32rng); - std::generate(w176.begin(), w176.end(), f32rng); - std::generate(w177.begin(), w177.end(), f32rng); - std::generate(w178.begin(), w178.end(), f32rng); - std::generate(w179.begin(), w179.end(), f32rng); - std::generate(w180.begin(), w180.end(), f32rng); - std::generate(w181.begin(), w181.end(), f32rng); - std::generate(w182.begin(), w182.end(), f32rng); - std::generate(w183.begin(), w183.end(), f32rng); - std::generate(w184.begin(), w184.end(), f32rng); - std::generate(w185.begin(), w185.end(), f32rng); - std::generate(w186.begin(), w186.end(), f32rng); - std::generate(w187.begin(), w187.end(), f32rng); - std::generate(w188.begin(), w188.end(), f32rng); - std::generate(w189.begin(), w189.end(), f32rng); - std::generate(w190.begin(), w190.end(), f32rng); - std::generate(w191.begin(), w191.end(), f32rng); - std::generate(w192.begin(), w192.end(), f32rng); - std::generate(w193.begin(), w193.end(), f32rng); - std::generate(w194.begin(), w194.end(), f32rng); - std::generate(w195.begin(), w195.end(), f32rng); - std::generate(w196.begin(), w196.end(), f32rng); - std::generate(w197.begin(), w197.end(), f32rng); - std::generate(w198.begin(), w198.end(), f32rng); - std::generate(w199.begin(), w199.end(), f32rng); - std::generate(w200.begin(), w200.end(), f32rng); - std::generate(w201.begin(), w201.end(), f32rng); - std::generate(w202.begin(), w202.end(), f32rng); - std::generate(w203.begin(), w203.end(), f32rng); - std::generate(w204.begin(), w204.end(), f32rng); - std::generate(w205.begin(), w205.end(), f32rng); - std::generate(w206.begin(), w206.end(), f32rng); - std::generate(w207.begin(), w207.end(), f32rng); - 
std::generate(w208.begin(), w208.end(), f32rng); - std::generate(w209.begin(), w209.end(), f32rng); - std::generate(w210.begin(), w210.end(), f32rng); - std::generate(w211.begin(), w211.end(), f32rng); - std::generate(w212.begin(), w212.end(), f32rng); - std::generate(w213.begin(), w213.end(), f32rng); - std::generate(w214.begin(), w214.end(), f32rng); - std::generate(w215.begin(), w215.end(), f32rng); - std::generate(w216.begin(), w216.end(), f32rng); - std::generate(w217.begin(), w217.end(), f32rng); - std::generate(w218.begin(), w218.end(), f32rng); - std::generate(w219.begin(), w219.end(), f32rng); - std::generate(w220.begin(), w220.end(), f32rng); - std::generate(w221.begin(), w221.end(), f32rng); - std::generate(w222.begin(), w222.end(), f32rng); - std::generate(w223.begin(), w223.end(), f32rng); - std::generate(w224.begin(), w224.end(), f32rng); - std::generate(w225.begin(), w225.end(), f32rng); - std::generate(w226.begin(), w226.end(), f32rng); - std::generate(w227.begin(), w227.end(), f32rng); - std::generate(w228.begin(), w228.end(), f32rng); - std::generate(w229.begin(), w229.end(), f32rng); - std::generate(w230.begin(), w230.end(), f32rng); - std::generate(w231.begin(), w231.end(), f32rng); - std::generate(w232.begin(), w232.end(), f32rng); - std::generate(w233.begin(), w233.end(), f32rng); - std::generate(w234.begin(), w234.end(), f32rng); - std::generate(w235.begin(), w235.end(), f32rng); - std::generate(w236.begin(), w236.end(), f32rng); - std::generate(w237.begin(), w237.end(), f32rng); - std::generate(w238.begin(), w238.end(), f32rng); - std::generate(w239.begin(), w239.end(), f32rng); - std::generate(w240.begin(), w240.end(), f32rng); - std::generate(w241.begin(), w241.end(), f32rng); - - Operators operators; - xnn_status status; - xnn_code_cache* code_cache_ptr = nullptr; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom 
padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 3 /* input channels per group */, - 16 /* output_channels_per_group */, - 3 /* input pixel stride */, - 16 /* output pixel stride */, - w114.data(), w115.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 16 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 16 /* input pixel stride */, - 16 /* output pixel stride */, - w116.data(), w117.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* 
kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 16 /* input channels per group */, - 16 /* output_channels_per_group */, - 16 /* input pixel stride */, - 16 /* output pixel stride */, - w118.data(), w119.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 16 /* input channels per group */, - 64 /* output_channels_per_group */, - 16 /* input pixel stride */, - 64 /* output pixel stride */, - w120.data(), w121.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 1 /* right padding */, 
- 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 64 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 64 /* input pixel stride */, - 64 /* output pixel stride */, - w122.data(), w123.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 24 /* output_channels_per_group */, - 64 /* input pixel stride */, - 24 /* output pixel stride */, - w124.data(), w125.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 72 /* output_channels_per_group */, - 24 /* input pixel stride 
*/, - 72 /* output pixel stride */, - w126.data(), w127.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 72 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 72 /* input pixel stride */, - 72 /* output pixel stride */, - w128.data(), w129.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 72 /* input channels per group */, - 24 /* output_channels_per_group */, - 72 /* input pixel stride */, - 24 /* output pixel stride */, - w130.data(), w131.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - 
} - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 72 /* output_channels_per_group */, - 24 /* input pixel stride */, - 72 /* output pixel stride */, - w132.data(), w133.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 1 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 72 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 72 /* input pixel stride */, - 72 /* output pixel stride */, - w134.data(), w135.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << 
std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 72 /* input channels per group */, - 24 /* output_channels_per_group */, - 72 /* input pixel stride */, - 24 /* output pixel stride */, - w136.data(), w137.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 72 /* output_channels_per_group */, - 24 /* input pixel stride */, - 72 /* output pixel stride */, - w138.data(), w139.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation 
#16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 72 /* input channels per group */, - 40 /* output_channels_per_group */, - 72 /* input pixel stride */, - 40 /* output pixel stride */, - w140.data(), w141.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 120 /* output_channels_per_group */, - 40 /* input pixel stride */, - 120 /* output pixel stride */, - w142.data(), w143.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op19); - if 
(status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 120 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 120 /* input pixel stride */, - 120 /* output pixel stride */, - w144.data(), w145.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 32 /* output_channels_per_group */, - 120 /* input pixel stride */, - 32 /* output pixel stride */, - w146.data(), w147.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - 
code_cache_ptr, - nullptr, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 120 /* output_channels_per_group */, - 32 /* input pixel stride */, - 120 /* output pixel stride */, - w148.data(), w149.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 40 /* output_channels_per_group */, - 120 /* input pixel stride */, - 40 /* output pixel stride */, - w150.data(), w151.data(), - -std::numeric_limits::infinity() /* output min */, 
std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 120 /* output_channels_per_group */, - 40 /* input pixel stride */, - 120 /* output pixel stride */, - w152.data(), w153.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 120 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 120 /* input pixel stride */, - 120 /* output pixel stride */, - 
w154.data(), w155.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - xnn_operator_t op29 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op29); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #29" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op29, xnn_delete_operator); - - xnn_operator_t op30 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 32 /* output_channels_per_group */, - 120 /* input pixel stride */, - 32 /* output pixel stride */, - w156.data(), w157.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op30); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #30" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op30, xnn_delete_operator); - - xnn_operator_t op31 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 120 /* output_channels_per_group */, - 32 /* input pixel stride */, - 120 
/* output pixel stride */, - w158.data(), w159.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op31); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #31" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op31, xnn_delete_operator); - - xnn_operator_t op32 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op32); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #32" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op32, xnn_delete_operator); - - xnn_operator_t op33 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 40 /* output_channels_per_group */, - 120 /* input pixel stride */, - 40 /* output pixel stride */, - w160.data(), w161.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op33); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #33" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op33, xnn_delete_operator); - - xnn_operator_t op34 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op34); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #34" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op34, xnn_delete_operator); - - xnn_operator_t 
op35 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 240 /* output_channels_per_group */, - 40 /* input pixel stride */, - 240 /* output pixel stride */, - w162.data(), w163.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op35); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #35" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op35, xnn_delete_operator); - - xnn_operator_t op36 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op36); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #36" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op36, xnn_delete_operator); - - xnn_operator_t op37 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 240 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 240 /* input pixel stride */, - 240 /* output pixel stride */, - w164.data(), w165.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op37); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #37" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op37, xnn_delete_operator); - - 
xnn_operator_t op38 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op38); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #38" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op38, xnn_delete_operator); - - xnn_operator_t op39 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 240 /* input channels per group */, - 80 /* output_channels_per_group */, - 240 /* input pixel stride */, - 80 /* output pixel stride */, - w166.data(), w167.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op39); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #39" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op39, xnn_delete_operator); - - xnn_operator_t op40 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 80 /* input channels per group */, - 200 /* output_channels_per_group */, - 80 /* input pixel stride */, - 200 /* output pixel stride */, - w168.data(), w169.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op40); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #40" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op40, xnn_delete_operator); - 
- xnn_operator_t op41 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op41); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #41" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op41, xnn_delete_operator); - - xnn_operator_t op42 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 200 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 200 /* input pixel stride */, - 200 /* output pixel stride */, - w170.data(), w171.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op42); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #42" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op42, xnn_delete_operator); - - xnn_operator_t op43 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op43); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #43" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op43, xnn_delete_operator); - - xnn_operator_t op44 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 200 /* input channels per group */, - 80 /* output_channels_per_group */, - 200 /* input pixel stride */, - 80 /* output pixel stride */, - w172.data(), w173.data(), - -std::numeric_limits::infinity() /* output min */, 
std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op44); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #44" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op44, xnn_delete_operator); - - xnn_operator_t op45 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op45); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #45" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op45, xnn_delete_operator); - - xnn_operator_t op46 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 80 /* input channels per group */, - 184 /* output_channels_per_group */, - 80 /* input pixel stride */, - 184 /* output pixel stride */, - w174.data(), w175.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op46); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #46" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op46, xnn_delete_operator); - - xnn_operator_t op47 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op47); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #47" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op47, xnn_delete_operator); - - xnn_operator_t op48 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* 
kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 184 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 184 /* input pixel stride */, - 184 /* output pixel stride */, - w176.data(), w177.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op48); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #48" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op48, xnn_delete_operator); - - xnn_operator_t op49 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op49); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #49" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op49, xnn_delete_operator); - - xnn_operator_t op50 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 184 /* input channels per group */, - 80 /* output_channels_per_group */, - 184 /* input pixel stride */, - 80 /* output pixel stride */, - w178.data(), w179.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op50); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #50" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op50, xnn_delete_operator); - - xnn_operator_t op51 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* 
flags */, - &op51); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #51" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op51, xnn_delete_operator); - - xnn_operator_t op52 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 80 /* input channels per group */, - 184 /* output_channels_per_group */, - 80 /* input pixel stride */, - 184 /* output pixel stride */, - w180.data(), w181.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op52); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #52" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op52, xnn_delete_operator); - - xnn_operator_t op53 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op53); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #53" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op53, xnn_delete_operator); - - xnn_operator_t op54 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 184 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 184 /* input pixel stride */, - 184 /* output pixel stride */, - w182.data(), w183.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, 
- nullptr, - &op54); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #54" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op54, xnn_delete_operator); - - xnn_operator_t op55 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op55); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #55" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op55, xnn_delete_operator); - - xnn_operator_t op56 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 184 /* input channels per group */, - 80 /* output_channels_per_group */, - 184 /* input pixel stride */, - 80 /* output pixel stride */, - w184.data(), w185.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op56); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #56" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op56, xnn_delete_operator); - - xnn_operator_t op57 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op57); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #57" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op57, xnn_delete_operator); - - xnn_operator_t op58 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling 
width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 80 /* input channels per group */, - 480 /* output_channels_per_group */, - 80 /* input pixel stride */, - 480 /* output pixel stride */, - w186.data(), w187.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op58); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #58" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op58, xnn_delete_operator); - - xnn_operator_t op59 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op59); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #59" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op59, xnn_delete_operator); - - xnn_operator_t op60 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 480 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 480 /* input pixel stride */, - 480 /* output pixel stride */, - w188.data(), w189.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op60); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #60" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op60, xnn_delete_operator); - - xnn_operator_t op61 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op61); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #61" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op61, 
xnn_delete_operator); - - xnn_operator_t op62 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op62); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #62" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op62, xnn_delete_operator); - - xnn_operator_t op63 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 480 /* input channels per group */, - 120 /* output_channels_per_group */, - 480 /* input pixel stride */, - 120 /* output pixel stride */, - w190.data(), w191.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op63); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #63" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op63, xnn_delete_operator); - - xnn_operator_t op64 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 480 /* output_channels_per_group */, - 120 /* input pixel stride */, - 480 /* output pixel stride */, - w192.data(), w193.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op64); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #64" << std::endl; - return ExecutionPlan(); - } - 
operators.emplace_back(op64, xnn_delete_operator); - - xnn_operator_t op65 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op65); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #65" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op65, xnn_delete_operator); - - xnn_operator_t op66 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 480 /* input channels per group */, - 112 /* output_channels_per_group */, - 480 /* input pixel stride */, - 112 /* output pixel stride */, - w194.data(), w195.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op66); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #66" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op66, xnn_delete_operator); - - xnn_operator_t op67 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 112 /* input channels per group */, - 672 /* output_channels_per_group */, - 112 /* input pixel stride */, - 672 /* output pixel stride */, - w196.data(), w197.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op67); - if (status != 
xnn_status_success) { - std::cerr << "failed to create operation #67" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op67, xnn_delete_operator); - - xnn_operator_t op68 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op68); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #68" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op68, xnn_delete_operator); - - xnn_operator_t op69 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 672 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 672 /* input pixel stride */, - 672 /* output pixel stride */, - w198.data(), w199.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op69); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #69" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op69, xnn_delete_operator); - - xnn_operator_t op70 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op70); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #70" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op70, xnn_delete_operator); - - xnn_operator_t op71 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op71); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #71" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op71, xnn_delete_operator); - - xnn_operator_t op72 = nullptr; - 
status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 672 /* input channels per group */, - 168 /* output_channels_per_group */, - 672 /* input pixel stride */, - 168 /* output pixel stride */, - w200.data(), w201.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op72); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #72" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op72, xnn_delete_operator); - - xnn_operator_t op73 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 168 /* input channels per group */, - 672 /* output_channels_per_group */, - 168 /* input pixel stride */, - 672 /* output pixel stride */, - w202.data(), w203.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op73); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #73" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op73, xnn_delete_operator); - - xnn_operator_t op74 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op74); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #74" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op74, xnn_delete_operator); - 
- xnn_operator_t op75 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 672 /* input channels per group */, - 112 /* output_channels_per_group */, - 672 /* input pixel stride */, - 112 /* output pixel stride */, - w204.data(), w205.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op75); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #75" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op75, xnn_delete_operator); - - xnn_operator_t op76 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op76); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #76" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op76, xnn_delete_operator); - - xnn_operator_t op77 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 112 /* input channels per group */, - 672 /* output_channels_per_group */, - 112 /* input pixel stride */, - 672 /* output pixel stride */, - w206.data(), w207.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op77); - if (status != xnn_status_success) { - std::cerr << "failed to create operation 
#77" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op77, xnn_delete_operator); - - xnn_operator_t op78 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op78); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #78" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op78, xnn_delete_operator); - - xnn_operator_t op79 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 1 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 672 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 672 /* input pixel stride */, - 672 /* output pixel stride */, - w208.data(), w209.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op79); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #79" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op79, xnn_delete_operator); - - xnn_operator_t op80 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op80); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #80" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op80, xnn_delete_operator); - - xnn_operator_t op81 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op81); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #81" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op81, xnn_delete_operator); - - xnn_operator_t op82 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding 
*/, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 672 /* input channels per group */, - 168 /* output_channels_per_group */, - 672 /* input pixel stride */, - 168 /* output pixel stride */, - w210.data(), w211.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op82); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #82" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op82, xnn_delete_operator); - - xnn_operator_t op83 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 168 /* input channels per group */, - 672 /* output_channels_per_group */, - 168 /* input pixel stride */, - 672 /* output pixel stride */, - w212.data(), w213.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op83); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #83" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op83, xnn_delete_operator); - - xnn_operator_t op84 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op84); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #84" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op84, xnn_delete_operator); - - xnn_operator_t op85 = nullptr; - status = 
xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 672 /* input channels per group */, - 160 /* output_channels_per_group */, - 672 /* input pixel stride */, - 160 /* output pixel stride */, - w214.data(), w215.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op85); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #85" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op85, xnn_delete_operator); - - xnn_operator_t op86 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - w216.data(), w217.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op86); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #86" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op86, xnn_delete_operator); - - xnn_operator_t op87 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op87); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #87" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op87, xnn_delete_operator); - - xnn_operator_t op88 = nullptr; - 
status = xnn_create_convolution2d_nhwc_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, - 960 /* output pixel stride */, - w218.data(), w219.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op88); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #88" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op88, xnn_delete_operator); - - xnn_operator_t op89 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op89); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #89" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op89, xnn_delete_operator); - - xnn_operator_t op90 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op90); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #90" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op90, xnn_delete_operator); - - xnn_operator_t op91 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 240 /* output_channels_per_group */, - 960 /* input pixel stride */, - 240 /* output pixel stride */, - w220.data(), w221.data(), - 0.0f 
/* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op91); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #91" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op91, xnn_delete_operator); - - xnn_operator_t op92 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 240 /* input channels per group */, - 960 /* output_channels_per_group */, - 240 /* input pixel stride */, - 960 /* output pixel stride */, - w222.data(), w223.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op92); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #92" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op92, xnn_delete_operator); - - xnn_operator_t op93 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op93); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #93" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op93, xnn_delete_operator); - - xnn_operator_t op94 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 160 /* output_channels_per_group */, - 960 /* input pixel stride */, - 160 /* output pixel stride */, 
- w224.data(), w225.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op94); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #94" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op94, xnn_delete_operator); - - xnn_operator_t op95 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op95); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #95" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op95, xnn_delete_operator); - - xnn_operator_t op96 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - w226.data(), w227.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op96); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #96" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op96, xnn_delete_operator); - - xnn_operator_t op97 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op97); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #97" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op97, xnn_delete_operator); - - xnn_operator_t op98 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 2 /* top padding 
*/, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, - 960 /* output pixel stride */, - w228.data(), w229.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op98); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #98" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op98, xnn_delete_operator); - - xnn_operator_t op99 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op99); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #99" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op99, xnn_delete_operator); - - xnn_operator_t op100 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op100); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #100" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op100, xnn_delete_operator); - - xnn_operator_t op101 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 240 /* output_channels_per_group */, - 960 /* input pixel stride */, - 240 /* output pixel stride */, - w230.data(), w231.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* 
output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op101); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #101" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op101, xnn_delete_operator); - - xnn_operator_t op102 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 240 /* input channels per group */, - 960 /* output_channels_per_group */, - 240 /* input pixel stride */, - 960 /* output pixel stride */, - w232.data(), w233.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op102); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #102" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op102, xnn_delete_operator); - - xnn_operator_t op103 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op103); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #103" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op103, xnn_delete_operator); - - xnn_operator_t op104 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 160 /* output_channels_per_group */, - 960 /* input pixel stride */, - 160 /* output pixel stride */, - w234.data(), w235.data(), - 
-std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op104); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #104" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op104, xnn_delete_operator); - - xnn_operator_t op105 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op105); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #105" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op105, xnn_delete_operator); - - xnn_operator_t op106 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - w236.data(), w237.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op106); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #106" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op106, xnn_delete_operator); - - xnn_operator_t op107 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op107); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #107" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op107, xnn_delete_operator); - - xnn_operator_t op108 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - 
-std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op108); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #108" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op108, xnn_delete_operator); - - xnn_operator_t op109 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 1280 /* output_channels_per_group */, - 960 /* input pixel stride */, - 1280 /* output pixel stride */, - w238.data(), w239.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op109); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #109" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op109, xnn_delete_operator); - - xnn_operator_t op110 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op110); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #110" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op110, xnn_delete_operator); - - xnn_operator_t op111 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op111); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #111" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op111, xnn_delete_operator); - - xnn_operator_t op112 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel 
height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 1280 /* input channels per group */, - 1001 /* output_channels_per_group */, - 1280 /* input pixel stride */, - 1001 /* output pixel stride */, - w240.data(), w241.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op112); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #112" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op112, xnn_delete_operator); - - size_t op0_workspace_size = 0; - size_t op0_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - &op0_workspace_size, &op0_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op0_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op1, - /*batch_size=*/12544, - 16 /* channels */, - 16 /* input stride */, - 16 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - size_t op2_workspace_size = 0; - size_t op2_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op2_workspace_size, &op2_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op2_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape 
operation #2" << std::endl; - return ExecutionPlan(); - } - - size_t op3_workspace_size = 0; - size_t op3_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op3, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op3_workspace_size, &op3_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op3_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 112, 112, 16 }; - const size_t b_shape[] = { 1, 112, 112, 16 }; - status = xnn_reshape_add_nd_f16( - op4, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - size_t op5_workspace_size = 0; - size_t op5_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op5, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op5_workspace_size, &op5_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op5_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - size_t op6_workspace_size = 0; - size_t op6_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op6, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op6_workspace_size, &op6_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op6_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - 
- size_t op7_workspace_size = 0; - size_t op7_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op7_workspace_size, &op7_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op7_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - size_t op8_workspace_size = 0; - size_t op8_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op8, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op8_workspace_size, &op8_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op8_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - size_t op9_workspace_size = 0; - size_t op9_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op9, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op9_workspace_size, &op9_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op9_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - size_t op10_workspace_size = 0; - size_t op10_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op10, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op10_workspace_size, &op10_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, 
op10_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 56, 56, 24 }; - const size_t b_shape[] = { 1, 56, 56, 24 }; - status = xnn_reshape_add_nd_f16( - op11, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - size_t op12_workspace_size = 0; - size_t op12_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op12, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op12_workspace_size, &op12_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op12_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - size_t op13_workspace_size = 0; - size_t op13_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op13, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op13_workspace_size, &op13_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op13_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - size_t op14_workspace_size = 0; - size_t op14_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op14, - /*batch_size=*/1, 784 /* width */, - 72 /* channels */, 72 /* input stride */, 72 /* output stride */, - &op14_workspace_size, &op14_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op14_workspace_size); - if (status != xnn_status_success) 
{ - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - size_t op15_workspace_size = 0; - size_t op15_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op15, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op15_workspace_size, &op15_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op15_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - size_t op16_workspace_size = 0; - size_t op16_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op16, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op16_workspace_size, &op16_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op16_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 72 }; - const size_t b_shape[] = { 1, 1, 1, 72 }; - status = xnn_reshape_multiply_nd_f16( - op17, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - size_t op18_workspace_size = 0; - size_t op18_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op18, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op18_workspace_size, &op18_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op18_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation 
#18" << std::endl; - return ExecutionPlan(); - } - - size_t op19_workspace_size = 0; - size_t op19_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op19, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op19_workspace_size, &op19_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op19_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - size_t op20_workspace_size = 0; - size_t op20_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op20, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op20_workspace_size, &op20_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op20_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - size_t op21_workspace_size = 0; - size_t op21_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op21, - /*batch_size=*/1, 784 /* width */, - 120 /* channels */, 120 /* input stride */, 120 /* output stride */, - &op21_workspace_size, &op21_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op21_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - size_t op22_workspace_size = 0; - size_t op22_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op22, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op22_workspace_size, &op22_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - 
max_workspace_size = std::max(max_workspace_size, op22_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - size_t op23_workspace_size = 0; - size_t op23_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op23, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op23_workspace_size, &op23_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op23_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 120 }; - const size_t b_shape[] = { 1, 1, 1, 120 }; - status = xnn_reshape_multiply_nd_f16( - op24, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - size_t op25_workspace_size = 0; - size_t op25_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op25, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op25_workspace_size, &op25_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op25_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 40 }; - const size_t b_shape[] = { 1, 28, 28, 40 }; - status = xnn_reshape_add_nd_f16( - op26, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - size_t op27_workspace_size = 0; - size_t 
op27_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op27, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op27_workspace_size, &op27_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op27_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - size_t op28_workspace_size = 0; - size_t op28_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op28, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op28_workspace_size, &op28_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op28_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - size_t op29_workspace_size = 0; - size_t op29_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op29, - /*batch_size=*/1, 784 /* width */, - 120 /* channels */, 120 /* input stride */, 120 /* output stride */, - &op29_workspace_size, &op29_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op29_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #29" << std::endl; - return ExecutionPlan(); - } - - size_t op30_workspace_size = 0; - size_t op30_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op30, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op30_workspace_size, &op30_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op30_workspace_size); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #30" << std::endl; - return ExecutionPlan(); - } - - size_t op31_workspace_size = 0; - size_t op31_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op31, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op31_workspace_size, &op31_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op31_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #31" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 120 }; - const size_t b_shape[] = { 1, 1, 1, 120 }; - status = xnn_reshape_multiply_nd_f16( - op32, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #32" << std::endl; - return ExecutionPlan(); - } - - size_t op33_workspace_size = 0; - size_t op33_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op33, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op33_workspace_size, &op33_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op33_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #33" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 40 }; - const size_t b_shape[] = { 1, 28, 28, 40 }; - status = xnn_reshape_add_nd_f16( - op34, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #34" << std::endl; - return ExecutionPlan(); - } - - size_t op35_workspace_size = 0; - size_t op35_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op35, - 
/*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op35_workspace_size, &op35_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op35_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op36, - /*batch_size=*/784, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #36" << std::endl; - return ExecutionPlan(); - } - - size_t op37_workspace_size = 0; - size_t op37_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op37, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op37_workspace_size, &op37_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op37_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op38, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #38" << std::endl; - return ExecutionPlan(); - } - - size_t op39_workspace_size = 0; - size_t op39_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op39, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op39_workspace_size, &op39_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op39_workspace_size); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #39" << std::endl; - return ExecutionPlan(); - } - - size_t op40_workspace_size = 0; - size_t op40_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op40, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op40_workspace_size, &op40_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op40_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op41, - /*batch_size=*/196, - 200 /* channels */, - 200 /* input stride */, - 200 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #41" << std::endl; - return ExecutionPlan(); - } - - size_t op42_workspace_size = 0; - size_t op42_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op42, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op42_workspace_size, &op42_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op42_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op43, - /*batch_size=*/196, - 200 /* channels */, - 200 /* input stride */, - 200 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #43" << std::endl; - return ExecutionPlan(); - } - - size_t op44_workspace_size = 0; - size_t op44_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op44, - /*batch_size=*/1, /*input_height=*/14, 
/*input_width=*/14, - &op44_workspace_size, &op44_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op44_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #44" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 80 }; - const size_t b_shape[] = { 1, 14, 14, 80 }; - status = xnn_reshape_add_nd_f16( - op45, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #45" << std::endl; - return ExecutionPlan(); - } - - size_t op46_workspace_size = 0; - size_t op46_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op46, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op46_workspace_size, &op46_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op46_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op47, - /*batch_size=*/196, - 184 /* channels */, - 184 /* input stride */, - 184 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #47" << std::endl; - return ExecutionPlan(); - } - - size_t op48_workspace_size = 0; - size_t op48_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op48, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op48_workspace_size, &op48_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op48_workspace_size); - if (status != xnn_status_success) { - 
std::cerr << "failed to reshape operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op49, - /*batch_size=*/196, - 184 /* channels */, - 184 /* input stride */, - 184 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #49" << std::endl; - return ExecutionPlan(); - } - - size_t op50_workspace_size = 0; - size_t op50_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op50, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op50_workspace_size, &op50_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op50_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #50" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 80 }; - const size_t b_shape[] = { 1, 14, 14, 80 }; - status = xnn_reshape_add_nd_f16( - op51, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #51" << std::endl; - return ExecutionPlan(); - } - - size_t op52_workspace_size = 0; - size_t op52_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op52, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op52_workspace_size, &op52_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op52_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op53, - /*batch_size=*/196, - 184 /* channels */, - 184 /* input stride */, - 184 /* output stride */, - /*threadpool=*/threadpool); - if 
(status != xnn_status_success) { - std::cerr << "failed to reshape operation #53" << std::endl; - return ExecutionPlan(); - } - - size_t op54_workspace_size = 0; - size_t op54_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op54, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op54_workspace_size, &op54_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op54_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op55, - /*batch_size=*/196, - 184 /* channels */, - 184 /* input stride */, - 184 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #55" << std::endl; - return ExecutionPlan(); - } - - size_t op56_workspace_size = 0; - size_t op56_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op56, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op56_workspace_size, &op56_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op56_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #56" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 80 }; - const size_t b_shape[] = { 1, 14, 14, 80 }; - status = xnn_reshape_add_nd_f16( - op57, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #57" << std::endl; - return ExecutionPlan(); - } - - size_t op58_workspace_size = 0; - size_t op58_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op58, - /*batch_size=*/1, 
/*input_height=*/14, /*input_width=*/14, - &op58_workspace_size, &op58_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op58_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op59, - /*batch_size=*/196, - 480 /* channels */, - 480 /* input stride */, - 480 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #59" << std::endl; - return ExecutionPlan(); - } - - size_t op60_workspace_size = 0; - size_t op60_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op60, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op60_workspace_size, &op60_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op60_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op61, - /*batch_size=*/196, - 480 /* channels */, - 480 /* input stride */, - 480 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #61" << std::endl; - return ExecutionPlan(); - } - - size_t op62_workspace_size = 0; - size_t op62_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op62, - /*batch_size=*/1, 196 /* width */, - 480 /* channels */, 480 /* input stride */, 480 /* output stride */, - &op62_workspace_size, &op62_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op62_workspace_size); - if (status != xnn_status_success) { - std::cerr 
<< "failed to reshape operation #62" << std::endl; - return ExecutionPlan(); - } - - size_t op63_workspace_size = 0; - size_t op63_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op63, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op63_workspace_size, &op63_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op63_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #63" << std::endl; - return ExecutionPlan(); - } - - size_t op64_workspace_size = 0; - size_t op64_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op64, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op64_workspace_size, &op64_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op64_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #64" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 480 }; - const size_t b_shape[] = { 1, 1, 1, 480 }; - status = xnn_reshape_multiply_nd_f16( - op65, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #65" << std::endl; - return ExecutionPlan(); - } - - size_t op66_workspace_size = 0; - size_t op66_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op66, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op66_workspace_size, &op66_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op66_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #66" << 
std::endl; - return ExecutionPlan(); - } - - size_t op67_workspace_size = 0; - size_t op67_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op67, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op67_workspace_size, &op67_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op67_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #67" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op68, - /*batch_size=*/196, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #68" << std::endl; - return ExecutionPlan(); - } - - size_t op69_workspace_size = 0; - size_t op69_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op69, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op69_workspace_size, &op69_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op69_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #69" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op70, - /*batch_size=*/196, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #70" << std::endl; - return ExecutionPlan(); - } - - size_t op71_workspace_size = 0; - size_t op71_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op71, - /*batch_size=*/1, 196 /* width */, - 672 /* channels */, 672 /* input stride */, 672 /* output stride */, - 
&op71_workspace_size, &op71_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op71_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #71" << std::endl; - return ExecutionPlan(); - } - - size_t op72_workspace_size = 0; - size_t op72_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op72, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op72_workspace_size, &op72_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op72_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #72" << std::endl; - return ExecutionPlan(); - } - - size_t op73_workspace_size = 0; - size_t op73_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op73, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op73_workspace_size, &op73_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op73_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #73" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 672 }; - const size_t b_shape[] = { 1, 1, 1, 672 }; - status = xnn_reshape_multiply_nd_f16( - op74, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #74" << std::endl; - return ExecutionPlan(); - } - - size_t op75_workspace_size = 0; - size_t op75_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op75, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op75_workspace_size, &op75_workspace_alignment, - /*output_height_out=*/nullptr, 
/*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op75_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #75" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 112 }; - const size_t b_shape[] = { 1, 14, 14, 112 }; - status = xnn_reshape_add_nd_f16( - op76, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #76" << std::endl; - return ExecutionPlan(); - } - - size_t op77_workspace_size = 0; - size_t op77_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op77, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op77_workspace_size, &op77_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op77_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #77" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op78, - /*batch_size=*/196, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #78" << std::endl; - return ExecutionPlan(); - } - - size_t op79_workspace_size = 0; - size_t op79_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op79, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op79_workspace_size, &op79_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op79_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #79" << std::endl; - return ExecutionPlan(); - } - - status 
= xnn_reshape_hardswish_nc_f16( - op80, - /*batch_size=*/49, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #80" << std::endl; - return ExecutionPlan(); - } - - size_t op81_workspace_size = 0; - size_t op81_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op81, - /*batch_size=*/1, 49 /* width */, - 672 /* channels */, 672 /* input stride */, 672 /* output stride */, - &op81_workspace_size, &op81_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op81_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #81" << std::endl; - return ExecutionPlan(); - } - - size_t op82_workspace_size = 0; - size_t op82_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op82, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op82_workspace_size, &op82_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op82_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #82" << std::endl; - return ExecutionPlan(); - } - - size_t op83_workspace_size = 0; - size_t op83_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op83, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op83_workspace_size, &op83_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op83_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #83" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 672 }; - const size_t b_shape[] = { 1, 1, 1, 672 }; 
- status = xnn_reshape_multiply_nd_f16( - op84, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #84" << std::endl; - return ExecutionPlan(); - } - - size_t op85_workspace_size = 0; - size_t op85_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op85, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op85_workspace_size, &op85_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op85_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #85" << std::endl; - return ExecutionPlan(); - } - - size_t op86_workspace_size = 0; - size_t op86_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op86, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op86_workspace_size, &op86_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op86_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #86" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op87, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #87" << std::endl; - return ExecutionPlan(); - } - - size_t op88_workspace_size = 0; - size_t op88_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op88, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op88_workspace_size, &op88_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = 
std::max(max_workspace_size, op88_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #88" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op89, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #89" << std::endl; - return ExecutionPlan(); - } - - size_t op90_workspace_size = 0; - size_t op90_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op90, - /*batch_size=*/1, 49 /* width */, - 960 /* channels */, 960 /* input stride */, 960 /* output stride */, - &op90_workspace_size, &op90_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op90_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #90" << std::endl; - return ExecutionPlan(); - } - - size_t op91_workspace_size = 0; - size_t op91_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op91, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op91_workspace_size, &op91_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op91_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #91" << std::endl; - return ExecutionPlan(); - } - - size_t op92_workspace_size = 0; - size_t op92_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op92, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op92_workspace_size, &op92_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op92_workspace_size); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #92" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 960 }; - const size_t b_shape[] = { 1, 1, 1, 960 }; - status = xnn_reshape_multiply_nd_f16( - op93, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #93" << std::endl; - return ExecutionPlan(); - } - - size_t op94_workspace_size = 0; - size_t op94_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op94, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op94_workspace_size, &op94_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op94_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #94" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 160 }; - const size_t b_shape[] = { 1, 7, 7, 160 }; - status = xnn_reshape_add_nd_f16( - op95, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #95" << std::endl; - return ExecutionPlan(); - } - - size_t op96_workspace_size = 0; - size_t op96_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op96, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op96_workspace_size, &op96_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op96_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #96" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op97, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride 
*/, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #97" << std::endl; - return ExecutionPlan(); - } - - size_t op98_workspace_size = 0; - size_t op98_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op98, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op98_workspace_size, &op98_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op98_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #98" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op99, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #99" << std::endl; - return ExecutionPlan(); - } - - size_t op100_workspace_size = 0; - size_t op100_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op100, - /*batch_size=*/1, 49 /* width */, - 960 /* channels */, 960 /* input stride */, 960 /* output stride */, - &op100_workspace_size, &op100_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op100_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #100" << std::endl; - return ExecutionPlan(); - } - - size_t op101_workspace_size = 0; - size_t op101_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op101, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op101_workspace_size, &op101_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op101_workspace_size); - if (status != xnn_status_success) { 
- std::cerr << "failed to reshape operation #101" << std::endl; - return ExecutionPlan(); - } - - size_t op102_workspace_size = 0; - size_t op102_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op102, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op102_workspace_size, &op102_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op102_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #102" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 960 }; - const size_t b_shape[] = { 1, 1, 1, 960 }; - status = xnn_reshape_multiply_nd_f16( - op103, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #103" << std::endl; - return ExecutionPlan(); - } - - size_t op104_workspace_size = 0; - size_t op104_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op104, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op104_workspace_size, &op104_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op104_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #104" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 160 }; - const size_t b_shape[] = { 1, 7, 7, 160 }; - status = xnn_reshape_add_nd_f16( - op105, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #105" << std::endl; - return ExecutionPlan(); - } - - size_t op106_workspace_size = 0; - size_t op106_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op106, - 
/*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op106_workspace_size, &op106_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op106_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #106" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op107, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #107" << std::endl; - return ExecutionPlan(); - } - - size_t op108_workspace_size = 0; - size_t op108_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op108, - /*batch_size=*/1, 49 /* width */, - 960 /* channels */, 960 /* input stride */, 960 /* output stride */, - &op108_workspace_size, &op108_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op108_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #108" << std::endl; - return ExecutionPlan(); - } - - size_t op109_workspace_size = 0; - size_t op109_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op109, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op109_workspace_size, &op109_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op109_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #109" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op110, - /*batch_size=*/1, - 1280 /* channels */, - 1280 /* input stride */, - 1280 /* output stride */, - /*threadpool=*/threadpool); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #110" << std::endl; - return ExecutionPlan(); - } - - size_t op111_workspace_size = 0; - size_t op111_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op111, - /*batch_size=*/1, 1 /* width */, - 1280 /* channels */, 1280 /* input stride */, 1280 /* output stride */, - &op111_workspace_size, &op111_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op111_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #111" << std::endl; - return ExecutionPlan(); - } - - size_t op112_workspace_size = 0; - size_t op112_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op112, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op112_workspace_size, &op112_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op112_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #112" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nhwc_f16( - op0, - workspace.data(), /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op1, - /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op2, - workspace.data(), /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nhwc_f16( - op3, - workspace.data(), /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op4, - v4.data() /* a */, v2.data() /* b */, /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op5, - workspace.data(), /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op6, - workspace.data(), /*input=*/v6.data(), /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op7, - workspace.data(), /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op8, - workspace.data(), /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op9, - workspace.data(), /*input=*/v9.data(), /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op10, - workspace.data(), /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op11, - 
v11.data() /* a */, v8.data() /* b */, /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op12, - workspace.data(), /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op13, - workspace.data(), /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op14, - workspace.data(), - /*input=*/v14.data(), /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op15, - workspace.data(), /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op16, - workspace.data(), /*input=*/v16.data(), /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op17, - v14.data() /* a */, v17.data() /* b */, /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op18, - workspace.data(), /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op19, 
- workspace.data(), /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op20, - workspace.data(), /*input=*/v20.data(), /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op21, - workspace.data(), - /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op22, - workspace.data(), /*input=*/v22.data(), /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op23, - workspace.data(), /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op24, - v21.data() /* a */, v24.data() /* b */, /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op25, - workspace.data(), /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op26, - v26.data() /* a */, v19.data() /* b */, /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op27, - 
workspace.data(), /*input=*/v27.data(), /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op28, - workspace.data(), /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op29, - workspace.data(), - /*input=*/v29.data(), /*output=*/v30.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op30, - workspace.data(), /*input=*/v30.data(), /*output=*/v31.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #30" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op31, - workspace.data(), /*input=*/v31.data(), /*output=*/v32.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op32, - v29.data() /* a */, v32.data() /* b */, /*output=*/v33.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #32" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op33, - workspace.data(), /*input=*/v33.data(), /*output=*/v34.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #33" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op34, - v34.data() /* a */, v27.data() /* b */, /*output=*/v35.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op35, - 
workspace.data(), /*input=*/v35.data(), /*output=*/v36.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op36, - /*input=*/v36.data(), /*output=*/v37.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op37, - workspace.data(), /*input=*/v37.data(), /*output=*/v38.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op38, - /*input=*/v38.data(), /*output=*/v39.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op39, - workspace.data(), /*input=*/v39.data(), /*output=*/v40.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op40, - workspace.data(), /*input=*/v40.data(), /*output=*/v41.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op41, - /*input=*/v41.data(), /*output=*/v42.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #41" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op42, - workspace.data(), /*input=*/v42.data(), /*output=*/v43.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op43, - /*input=*/v43.data(), /*output=*/v44.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op44, - workspace.data(), /*input=*/v44.data(), /*output=*/v45.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op45, - v45.data() /* a */, v40.data() /* b */, /*output=*/v46.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #45" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op46, - workspace.data(), /*input=*/v46.data(), /*output=*/v47.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op47, - /*input=*/v47.data(), /*output=*/v48.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op48, - workspace.data(), /*input=*/v48.data(), /*output=*/v49.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op49, - /*input=*/v49.data(), /*output=*/v50.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #49" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op50, - workspace.data(), /*input=*/v50.data(), /*output=*/v51.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #50" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op51, - v51.data() /* a */, v46.data() /* b */, /*output=*/v52.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #51" << 
std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op52, - workspace.data(), /*input=*/v52.data(), /*output=*/v53.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op53, - /*input=*/v53.data(), /*output=*/v54.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op54, - workspace.data(), /*input=*/v54.data(), /*output=*/v55.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op55, - /*input=*/v55.data(), /*output=*/v56.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op56, - workspace.data(), /*input=*/v56.data(), /*output=*/v57.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #56" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op57, - v57.data() /* a */, v52.data() /* b */, /*output=*/v58.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op58, - workspace.data(), /*input=*/v58.data(), /*output=*/v59.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op59, - /*input=*/v59.data(), /*output=*/v60.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #59" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nhwc_f16( - op60, - workspace.data(), /*input=*/v60.data(), /*output=*/v61.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op61, - /*input=*/v61.data(), /*output=*/v62.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #61" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op62, - workspace.data(), - /*input=*/v62.data(), /*output=*/v63.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #62" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op63, - workspace.data(), /*input=*/v63.data(), /*output=*/v64.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #63" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op64, - workspace.data(), /*input=*/v64.data(), /*output=*/v65.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #64" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op65, - v62.data() /* a */, v65.data() /* b */, /*output=*/v66.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #65" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op66, - workspace.data(), /*input=*/v66.data(), /*output=*/v67.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #66" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op67, - workspace.data(), /*input=*/v67.data(), /*output=*/v68.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #67" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_hardswish_nc_f16( - op68, - /*input=*/v68.data(), /*output=*/v69.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #68" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op69, - workspace.data(), /*input=*/v69.data(), /*output=*/v70.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #69" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op70, - /*input=*/v70.data(), /*output=*/v71.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #70" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op71, - workspace.data(), - /*input=*/v71.data(), /*output=*/v72.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #71" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op72, - workspace.data(), /*input=*/v72.data(), /*output=*/v73.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #72" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op73, - workspace.data(), /*input=*/v73.data(), /*output=*/v74.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #73" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op74, - v71.data() /* a */, v74.data() /* b */, /*output=*/v75.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #74" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op75, - workspace.data(), /*input=*/v75.data(), /*output=*/v76.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #75" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op76, - v76.data() 
/* a */, v67.data() /* b */, /*output=*/v77.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #76" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op77, - workspace.data(), /*input=*/v77.data(), /*output=*/v78.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #77" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op78, - /*input=*/v78.data(), /*output=*/v79.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #78" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op79, - workspace.data(), /*input=*/v79.data(), /*output=*/v80.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #79" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op80, - /*input=*/v80.data(), /*output=*/v81.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #80" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op81, - workspace.data(), - /*input=*/v81.data(), /*output=*/v82.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #81" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op82, - workspace.data(), /*input=*/v82.data(), /*output=*/v83.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #82" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op83, - workspace.data(), /*input=*/v83.data(), /*output=*/v84.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #83" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op84, - v81.data() /* a */, v84.data() /* b */, 
/*output=*/v85.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #84" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op85, - workspace.data(), /*input=*/v85.data(), /*output=*/v86.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #85" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op86, - workspace.data(), /*input=*/v86.data(), /*output=*/v87.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #86" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op87, - /*input=*/v87.data(), /*output=*/v88.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #87" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op88, - workspace.data(), /*input=*/v88.data(), /*output=*/v89.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #88" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op89, - /*input=*/v89.data(), /*output=*/v90.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #89" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op90, - workspace.data(), - /*input=*/v90.data(), /*output=*/v91.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #90" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op91, - workspace.data(), /*input=*/v91.data(), /*output=*/v92.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #91" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op92, - workspace.data(), /*input=*/v92.data(), /*output=*/v93.data()); - if (status 
!= xnn_status_success) { - std::cerr << "failed to setup operation #92" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op93, - v90.data() /* a */, v93.data() /* b */, /*output=*/v94.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #93" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op94, - workspace.data(), /*input=*/v94.data(), /*output=*/v95.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #94" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op95, - v95.data() /* a */, v86.data() /* b */, /*output=*/v96.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #95" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op96, - workspace.data(), /*input=*/v96.data(), /*output=*/v97.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #96" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op97, - /*input=*/v97.data(), /*output=*/v98.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #97" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op98, - workspace.data(), /*input=*/v98.data(), /*output=*/v99.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #98" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op99, - /*input=*/v99.data(), /*output=*/v100.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #99" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op100, - workspace.data(), - /*input=*/v100.data(), /*output=*/v101.data()); - if (status != xnn_status_success) { - std::cerr << "failed to 
setup operation #100" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op101, - workspace.data(), /*input=*/v101.data(), /*output=*/v102.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #101" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op102, - workspace.data(), /*input=*/v102.data(), /*output=*/v103.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #102" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op103, - v100.data() /* a */, v103.data() /* b */, /*output=*/v104.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #103" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op104, - workspace.data(), /*input=*/v104.data(), /*output=*/v105.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #104" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op105, - v105.data() /* a */, v96.data() /* b */, /*output=*/v106.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #105" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op106, - workspace.data(), /*input=*/v106.data(), /*output=*/v107.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #106" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op107, - /*input=*/v107.data(), /*output=*/v108.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #107" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op108, - workspace.data(), - /*input=*/v108.data(), /*output=*/v109.data()); - if (status != xnn_status_success) { - std::cerr << "failed to 
setup operation #108" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op109, - workspace.data(), /*input=*/v109.data(), /*output=*/v110.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #109" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op110, - /*input=*/v110.data(), /*output=*/v111.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #110" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op111, - workspace.data(), - /*input=*/v111.data(), /*output=*/v112.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #111" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op112, - workspace.data(), /*input=*/v112.data(), /*output=*/v113.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #112" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -ExecutionPlan FP16MobileNetV3Large(pthreadpool_t threadpool) { - return FP16MobileNetV3Large(/*use_jit=*/false, threadpool); -} - -ExecutionPlan FP16MobileNetV3LargeJit(pthreadpool_t threadpool) { - return FP16MobileNetV3Large(/*use_jit=*/true, threadpool); -} - -} // namespace models diff --git a/models/fp16-mobilenet-v3-small.cc b/models/fp16-mobilenet-v3-small.cc deleted file mode 100644 index cab8b1073bf..00000000000 --- a/models/fp16-mobilenet-v3-small.cc +++ /dev/null @@ -1,4373 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan FP16MobileNetV3Small(bool use_jit, pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static std::array v30; - alignas(16) static std::array v31; - alignas(16) static std::array v32; - alignas(16) static std::array v33; - alignas(16) static std::array v34; - alignas(16) static std::array v35; - alignas(16) static std::array v36; - alignas(16) static std::array v37; - alignas(16) static std::array v38; - alignas(16) static std::array v39; - alignas(16) static std::array v40; - alignas(16) static std::array v41; - alignas(16) static std::array v42; - alignas(16) static std::array v43; - alignas(16) static std::array v44; - alignas(16) static std::array v45; - 
alignas(16) static std::array v46; - alignas(16) static std::array v47; - alignas(16) static std::array v48; - alignas(16) static std::array v49; - alignas(16) static std::array v50; - alignas(16) static std::array v51; - alignas(16) static std::array v52; - alignas(16) static std::array v53; - alignas(16) static std::array v54; - alignas(16) static std::array v55; - alignas(16) static std::array v56; - alignas(16) static std::array v57; - alignas(16) static std::array v58; - alignas(16) static std::array v59; - alignas(16) static std::array v60; - alignas(16) static std::array v61; - alignas(16) static std::array v62; - alignas(16) static std::array v63; - alignas(16) static std::array v64; - alignas(16) static std::array v65; - alignas(16) static std::array v66; - alignas(16) static std::array v67; - alignas(16) static std::array v68; - alignas(16) static std::array v69; - alignas(16) static std::array v70; - alignas(16) static std::array v71; - alignas(16) static std::array v72; - alignas(16) static std::array v73; - alignas(16) static std::array v74; - alignas(16) static std::array v75; - alignas(16) static std::array v76; - alignas(16) static std::array v77; - alignas(16) static std::array v78; - alignas(16) static std::array v79; - alignas(16) static std::array v80; - alignas(16) static std::array v81; - alignas(16) static std::array v82; - alignas(16) static std::array v83; - alignas(16) static std::array v84; - alignas(16) static std::array v85; - alignas(16) static std::array v86; - alignas(16) static std::array v87; - alignas(16) static std::array v88; - alignas(16) static std::array v89; - alignas(16) static std::array v90; - alignas(16) static std::array v91; - alignas(16) static std::array v92; - alignas(16) static std::array v93; - alignas(16) static std::array v94; - alignas(16) static std::array v95; - alignas(16) static std::array v96; - alignas(16) static std::array v97; - alignas(16) static std::array v98; - alignas(16) static std::array v99; - 
alignas(16) static std::array w100; - alignas(16) static std::array w101; - alignas(16) static std::array w102; - alignas(16) static std::array w103; - alignas(16) static std::array w104; - alignas(16) static std::array w105; - alignas(16) static std::array w106; - alignas(16) static std::array w107; - alignas(16) static std::array w108; - alignas(16) static std::array w109; - alignas(16) static std::array w110; - alignas(16) static std::array w111; - alignas(16) static std::array w112; - alignas(16) static std::array w113; - alignas(16) static std::array w114; - alignas(16) static std::array w115; - alignas(16) static std::array w116; - alignas(16) static std::array w117; - alignas(16) static std::array w118; - alignas(16) static std::array w119; - alignas(16) static std::array w120; - alignas(16) static std::array w121; - alignas(16) static std::array w122; - alignas(16) static std::array w123; - alignas(16) static std::array w124; - alignas(16) static std::array w125; - alignas(16) static std::array w126; - alignas(16) static std::array w127; - alignas(16) static std::array w128; - alignas(16) static std::array w129; - alignas(16) static std::array w130; - alignas(16) static std::array w131; - alignas(16) static std::array w132; - alignas(16) static std::array w133; - alignas(16) static std::array w134; - alignas(16) static std::array w135; - alignas(16) static std::array w136; - alignas(16) static std::array w137; - alignas(16) static std::array w138; - alignas(16) static std::array w139; - alignas(16) static std::array w140; - alignas(16) static std::array w141; - alignas(16) static std::array w142; - alignas(16) static std::array w143; - alignas(16) static std::array w144; - alignas(16) static std::array w145; - alignas(16) static std::array w146; - alignas(16) static std::array w147; - alignas(16) static std::array w148; - alignas(16) static std::array w149; - alignas(16) static std::array w150; - alignas(16) static std::array w151; - alignas(16) static 
std::array w152; - alignas(16) static std::array w153; - alignas(16) static std::array w154; - alignas(16) static std::array w155; - alignas(16) static std::array w156; - alignas(16) static std::array w157; - alignas(16) static std::array w158; - alignas(16) static std::array w159; - alignas(16) static std::array w160; - alignas(16) static std::array w161; - alignas(16) static std::array w162; - alignas(16) static std::array w163; - alignas(16) static std::array w164; - alignas(16) static std::array w165; - alignas(16) static std::array w166; - alignas(16) static std::array w167; - alignas(16) static std::array w168; - alignas(16) static std::array w169; - alignas(16) static std::array w170; - alignas(16) static std::array w171; - alignas(16) static std::array w172; - alignas(16) static std::array w173; - alignas(16) static std::array w174; - alignas(16) static std::array w175; - alignas(16) static std::array w176; - alignas(16) static std::array w177; - alignas(16) static std::array w178; - alignas(16) static std::array w179; - alignas(16) static std::array w180; - alignas(16) static std::array w181; - alignas(16) static std::array w182; - alignas(16) static std::array w183; - alignas(16) static std::array w184; - alignas(16) static std::array w185; - alignas(16) static std::array w186; - alignas(16) static std::array w187; - alignas(16) static std::array w188; - alignas(16) static std::array w189; - alignas(16) static std::array w190; - alignas(16) static std::array w191; - alignas(16) static std::array w192; - alignas(16) static std::array w193; - alignas(16) static std::array w194; - alignas(16) static std::array w195; - alignas(16) static std::array w196; - alignas(16) static std::array w197; - alignas(16) static std::array w198; - alignas(16) static std::array w199; - alignas(16) static std::array w200; - alignas(16) static std::array w201; - alignas(16) static std::array w202; - alignas(16) static std::array w203; - alignas(16) static std::array w204; - 
alignas(16) static std::array w205; - alignas(16) static std::array w206; - alignas(16) static std::array w207; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); - std::generate(v0.begin(), v0.end(), f32rng); - std::generate(v1.begin(), v1.end(), f32rng); - std::generate(v2.begin(), v2.end(), f32rng); - std::generate(v3.begin(), v3.end(), f32rng); - std::generate(v4.begin(), v4.end(), f32rng); - std::generate(v5.begin(), v5.end(), f32rng); - std::generate(v6.begin(), v6.end(), f32rng); - std::generate(v7.begin(), v7.end(), f32rng); - std::generate(v8.begin(), v8.end(), f32rng); - std::generate(v9.begin(), v9.end(), f32rng); - std::generate(v10.begin(), v10.end(), f32rng); - std::generate(v11.begin(), v11.end(), f32rng); - std::generate(v12.begin(), v12.end(), f32rng); - std::generate(v13.begin(), v13.end(), f32rng); - std::generate(v14.begin(), v14.end(), f32rng); - std::generate(v15.begin(), v15.end(), f32rng); - std::generate(v16.begin(), v16.end(), f32rng); - std::generate(v17.begin(), v17.end(), f32rng); - std::generate(v18.begin(), v18.end(), f32rng); - std::generate(v19.begin(), v19.end(), f32rng); - std::generate(v20.begin(), v20.end(), f32rng); - std::generate(v21.begin(), v21.end(), f32rng); - std::generate(v22.begin(), v22.end(), f32rng); - std::generate(v23.begin(), v23.end(), f32rng); - std::generate(v24.begin(), v24.end(), f32rng); - std::generate(v25.begin(), v25.end(), f32rng); - std::generate(v26.begin(), v26.end(), f32rng); - std::generate(v27.begin(), v27.end(), f32rng); - std::generate(v28.begin(), v28.end(), f32rng); - std::generate(v29.begin(), v29.end(), f32rng); - std::generate(v30.begin(), v30.end(), f32rng); - std::generate(v31.begin(), v31.end(), f32rng); - std::generate(v32.begin(), v32.end(), f32rng); - std::generate(v33.begin(), v33.end(), f32rng); - std::generate(v34.begin(), v34.end(), f32rng); - 
std::generate(v35.begin(), v35.end(), f32rng); - std::generate(v36.begin(), v36.end(), f32rng); - std::generate(v37.begin(), v37.end(), f32rng); - std::generate(v38.begin(), v38.end(), f32rng); - std::generate(v39.begin(), v39.end(), f32rng); - std::generate(v40.begin(), v40.end(), f32rng); - std::generate(v41.begin(), v41.end(), f32rng); - std::generate(v42.begin(), v42.end(), f32rng); - std::generate(v43.begin(), v43.end(), f32rng); - std::generate(v44.begin(), v44.end(), f32rng); - std::generate(v45.begin(), v45.end(), f32rng); - std::generate(v46.begin(), v46.end(), f32rng); - std::generate(v47.begin(), v47.end(), f32rng); - std::generate(v48.begin(), v48.end(), f32rng); - std::generate(v49.begin(), v49.end(), f32rng); - std::generate(v50.begin(), v50.end(), f32rng); - std::generate(v51.begin(), v51.end(), f32rng); - std::generate(v52.begin(), v52.end(), f32rng); - std::generate(v53.begin(), v53.end(), f32rng); - std::generate(v54.begin(), v54.end(), f32rng); - std::generate(v55.begin(), v55.end(), f32rng); - std::generate(v56.begin(), v56.end(), f32rng); - std::generate(v57.begin(), v57.end(), f32rng); - std::generate(v58.begin(), v58.end(), f32rng); - std::generate(v59.begin(), v59.end(), f32rng); - std::generate(v60.begin(), v60.end(), f32rng); - std::generate(v61.begin(), v61.end(), f32rng); - std::generate(v62.begin(), v62.end(), f32rng); - std::generate(v63.begin(), v63.end(), f32rng); - std::generate(v64.begin(), v64.end(), f32rng); - std::generate(v65.begin(), v65.end(), f32rng); - std::generate(v66.begin(), v66.end(), f32rng); - std::generate(v67.begin(), v67.end(), f32rng); - std::generate(v68.begin(), v68.end(), f32rng); - std::generate(v69.begin(), v69.end(), f32rng); - std::generate(v70.begin(), v70.end(), f32rng); - std::generate(v71.begin(), v71.end(), f32rng); - std::generate(v72.begin(), v72.end(), f32rng); - std::generate(v73.begin(), v73.end(), f32rng); - std::generate(v74.begin(), v74.end(), f32rng); - std::generate(v75.begin(), v75.end(), 
f32rng); - std::generate(v76.begin(), v76.end(), f32rng); - std::generate(v77.begin(), v77.end(), f32rng); - std::generate(v78.begin(), v78.end(), f32rng); - std::generate(v79.begin(), v79.end(), f32rng); - std::generate(v80.begin(), v80.end(), f32rng); - std::generate(v81.begin(), v81.end(), f32rng); - std::generate(v82.begin(), v82.end(), f32rng); - std::generate(v83.begin(), v83.end(), f32rng); - std::generate(v84.begin(), v84.end(), f32rng); - std::generate(v85.begin(), v85.end(), f32rng); - std::generate(v86.begin(), v86.end(), f32rng); - std::generate(v87.begin(), v87.end(), f32rng); - std::generate(v88.begin(), v88.end(), f32rng); - std::generate(v89.begin(), v89.end(), f32rng); - std::generate(v90.begin(), v90.end(), f32rng); - std::generate(v91.begin(), v91.end(), f32rng); - std::generate(v92.begin(), v92.end(), f32rng); - std::generate(v93.begin(), v93.end(), f32rng); - std::generate(v94.begin(), v94.end(), f32rng); - std::generate(v95.begin(), v95.end(), f32rng); - std::generate(v96.begin(), v96.end(), f32rng); - std::generate(v97.begin(), v97.end(), f32rng); - std::generate(v98.begin(), v98.end(), f32rng); - std::generate(v99.begin(), v99.end(), f32rng); - std::generate(w100.begin(), w100.end(), f32rng); - std::generate(w101.begin(), w101.end(), f32rng); - std::generate(w102.begin(), w102.end(), f32rng); - std::generate(w103.begin(), w103.end(), f32rng); - std::generate(w104.begin(), w104.end(), f32rng); - std::generate(w105.begin(), w105.end(), f32rng); - std::generate(w106.begin(), w106.end(), f32rng); - std::generate(w107.begin(), w107.end(), f32rng); - std::generate(w108.begin(), w108.end(), f32rng); - std::generate(w109.begin(), w109.end(), f32rng); - std::generate(w110.begin(), w110.end(), f32rng); - std::generate(w111.begin(), w111.end(), f32rng); - std::generate(w112.begin(), w112.end(), f32rng); - std::generate(w113.begin(), w113.end(), f32rng); - std::generate(w114.begin(), w114.end(), f32rng); - std::generate(w115.begin(), w115.end(), 
f32rng); - std::generate(w116.begin(), w116.end(), f32rng); - std::generate(w117.begin(), w117.end(), f32rng); - std::generate(w118.begin(), w118.end(), f32rng); - std::generate(w119.begin(), w119.end(), f32rng); - std::generate(w120.begin(), w120.end(), f32rng); - std::generate(w121.begin(), w121.end(), f32rng); - std::generate(w122.begin(), w122.end(), f32rng); - std::generate(w123.begin(), w123.end(), f32rng); - std::generate(w124.begin(), w124.end(), f32rng); - std::generate(w125.begin(), w125.end(), f32rng); - std::generate(w126.begin(), w126.end(), f32rng); - std::generate(w127.begin(), w127.end(), f32rng); - std::generate(w128.begin(), w128.end(), f32rng); - std::generate(w129.begin(), w129.end(), f32rng); - std::generate(w130.begin(), w130.end(), f32rng); - std::generate(w131.begin(), w131.end(), f32rng); - std::generate(w132.begin(), w132.end(), f32rng); - std::generate(w133.begin(), w133.end(), f32rng); - std::generate(w134.begin(), w134.end(), f32rng); - std::generate(w135.begin(), w135.end(), f32rng); - std::generate(w136.begin(), w136.end(), f32rng); - std::generate(w137.begin(), w137.end(), f32rng); - std::generate(w138.begin(), w138.end(), f32rng); - std::generate(w139.begin(), w139.end(), f32rng); - std::generate(w140.begin(), w140.end(), f32rng); - std::generate(w141.begin(), w141.end(), f32rng); - std::generate(w142.begin(), w142.end(), f32rng); - std::generate(w143.begin(), w143.end(), f32rng); - std::generate(w144.begin(), w144.end(), f32rng); - std::generate(w145.begin(), w145.end(), f32rng); - std::generate(w146.begin(), w146.end(), f32rng); - std::generate(w147.begin(), w147.end(), f32rng); - std::generate(w148.begin(), w148.end(), f32rng); - std::generate(w149.begin(), w149.end(), f32rng); - std::generate(w150.begin(), w150.end(), f32rng); - std::generate(w151.begin(), w151.end(), f32rng); - std::generate(w152.begin(), w152.end(), f32rng); - std::generate(w153.begin(), w153.end(), f32rng); - std::generate(w154.begin(), w154.end(), f32rng); - 
std::generate(w155.begin(), w155.end(), f32rng); - std::generate(w156.begin(), w156.end(), f32rng); - std::generate(w157.begin(), w157.end(), f32rng); - std::generate(w158.begin(), w158.end(), f32rng); - std::generate(w159.begin(), w159.end(), f32rng); - std::generate(w160.begin(), w160.end(), f32rng); - std::generate(w161.begin(), w161.end(), f32rng); - std::generate(w162.begin(), w162.end(), f32rng); - std::generate(w163.begin(), w163.end(), f32rng); - std::generate(w164.begin(), w164.end(), f32rng); - std::generate(w165.begin(), w165.end(), f32rng); - std::generate(w166.begin(), w166.end(), f32rng); - std::generate(w167.begin(), w167.end(), f32rng); - std::generate(w168.begin(), w168.end(), f32rng); - std::generate(w169.begin(), w169.end(), f32rng); - std::generate(w170.begin(), w170.end(), f32rng); - std::generate(w171.begin(), w171.end(), f32rng); - std::generate(w172.begin(), w172.end(), f32rng); - std::generate(w173.begin(), w173.end(), f32rng); - std::generate(w174.begin(), w174.end(), f32rng); - std::generate(w175.begin(), w175.end(), f32rng); - std::generate(w176.begin(), w176.end(), f32rng); - std::generate(w177.begin(), w177.end(), f32rng); - std::generate(w178.begin(), w178.end(), f32rng); - std::generate(w179.begin(), w179.end(), f32rng); - std::generate(w180.begin(), w180.end(), f32rng); - std::generate(w181.begin(), w181.end(), f32rng); - std::generate(w182.begin(), w182.end(), f32rng); - std::generate(w183.begin(), w183.end(), f32rng); - std::generate(w184.begin(), w184.end(), f32rng); - std::generate(w185.begin(), w185.end(), f32rng); - std::generate(w186.begin(), w186.end(), f32rng); - std::generate(w187.begin(), w187.end(), f32rng); - std::generate(w188.begin(), w188.end(), f32rng); - std::generate(w189.begin(), w189.end(), f32rng); - std::generate(w190.begin(), w190.end(), f32rng); - std::generate(w191.begin(), w191.end(), f32rng); - std::generate(w192.begin(), w192.end(), f32rng); - std::generate(w193.begin(), w193.end(), f32rng); - 
std::generate(w194.begin(), w194.end(), f32rng); - std::generate(w195.begin(), w195.end(), f32rng); - std::generate(w196.begin(), w196.end(), f32rng); - std::generate(w197.begin(), w197.end(), f32rng); - std::generate(w198.begin(), w198.end(), f32rng); - std::generate(w199.begin(), w199.end(), f32rng); - std::generate(w200.begin(), w200.end(), f32rng); - std::generate(w201.begin(), w201.end(), f32rng); - std::generate(w202.begin(), w202.end(), f32rng); - std::generate(w203.begin(), w203.end(), f32rng); - std::generate(w204.begin(), w204.end(), f32rng); - std::generate(w205.begin(), w205.end(), f32rng); - std::generate(w206.begin(), w206.end(), f32rng); - std::generate(w207.begin(), w207.end(), f32rng); - - Operators operators; - xnn_status status; - xnn_code_cache* code_cache_ptr = nullptr; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 3 /* input channels per group */, - 16 /* output_channels_per_group */, - 3 /* input pixel stride */, - 16 /* output pixel stride */, - w100.data(), w101.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; 
- status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 16 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 16 /* input pixel stride */, - 16 /* output pixel stride */, - w102.data(), w103.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 16 /* input channels per group */, - 8 /* output_channels_per_group */, - 16 /* input pixel stride */, - 8 /* output pixel stride */, - w104.data(), w105.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = 
nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 8 /* input channels per group */, - 16 /* output_channels_per_group */, - 8 /* input pixel stride */, - 16 /* output pixel stride */, - w106.data(), w107.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 16 /* input channels per group */, - 16 /* output_channels_per_group */, - 16 /* input pixel stride */, - 16 /* output pixel stride */, - w108.data(), w109.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, 
xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 16 /* input channels per group */, - 72 /* output_channels_per_group */, - 16 /* input pixel stride */, - 72 /* output pixel stride */, - w110.data(), w111.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 72 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 72 /* input pixel stride */, - 72 /* output pixel stride */, - w112.data(), w113.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, 
- 1 /* groups */, - 72 /* input channels per group */, - 24 /* output_channels_per_group */, - 72 /* input pixel stride */, - 24 /* output pixel stride */, - w114.data(), w115.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 88 /* output_channels_per_group */, - 24 /* input pixel stride */, - 88 /* output pixel stride */, - w116.data(), w117.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 88 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 88 /* input pixel stride */, - 88 /* output pixel stride */, - w118.data(), w119.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op12); - 
if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 88 /* input channels per group */, - 24 /* output_channels_per_group */, - 88 /* input pixel stride */, - 24 /* output pixel stride */, - w120.data(), w121.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 96 /* output_channels_per_group */, - 24 /* input pixel stride */, - 96 /* output pixel stride */, - w122.data(), w123.data(), - -std::numeric_limits::infinity() /* output min */, 
std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 1 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 96 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 96 /* input pixel stride */, - 96 /* output pixel stride */, - w124.data(), w125.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; 
- return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 24 /* output_channels_per_group */, - 96 /* input pixel stride */, - 24 /* output pixel stride */, - w126.data(), w127.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 96 /* output_channels_per_group */, - 24 /* input pixel stride */, - 96 /* output pixel stride */, - w128.data(), w129.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create 
operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 40 /* output_channels_per_group */, - 96 /* input pixel stride */, - 40 /* output pixel stride */, - w130.data(), w131.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 240 /* output_channels_per_group */, - 40 /* input pixel stride */, - 240 /* output pixel stride */, - w132.data(), w133.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create 
operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 240 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 240 /* input pixel stride */, - 240 /* output pixel stride */, - w134.data(), w135.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - xnn_operator_t op29 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 240 /* input channels per 
group */, - 64 /* output_channels_per_group */, - 240 /* input pixel stride */, - 64 /* output pixel stride */, - w136.data(), w137.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op29); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #29" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op29, xnn_delete_operator); - - xnn_operator_t op30 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 240 /* output_channels_per_group */, - 64 /* input pixel stride */, - 240 /* output pixel stride */, - w138.data(), w139.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op30); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #30" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op30, xnn_delete_operator); - - xnn_operator_t op31 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op31); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #31" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op31, xnn_delete_operator); - - xnn_operator_t op32 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups 
*/, - 240 /* input channels per group */, - 40 /* output_channels_per_group */, - 240 /* input pixel stride */, - 40 /* output pixel stride */, - w140.data(), w141.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op32); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #32" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op32, xnn_delete_operator); - - xnn_operator_t op33 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op33); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #33" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op33, xnn_delete_operator); - - xnn_operator_t op34 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 240 /* output_channels_per_group */, - 40 /* input pixel stride */, - 240 /* output pixel stride */, - w142.data(), w143.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op34); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #34" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op34, xnn_delete_operator); - - xnn_operator_t op35 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op35); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #35" << std::endl; - return ExecutionPlan(); - } - 
operators.emplace_back(op35, xnn_delete_operator); - - xnn_operator_t op36 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 240 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 240 /* input pixel stride */, - 240 /* output pixel stride */, - w144.data(), w145.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op36); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #36" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op36, xnn_delete_operator); - - xnn_operator_t op37 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op37); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #37" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op37, xnn_delete_operator); - - xnn_operator_t op38 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op38); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #38" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op38, xnn_delete_operator); - - xnn_operator_t op39 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 240 /* input channels per group */, - 64 /* output_channels_per_group */, - 240 /* 
input pixel stride */, - 64 /* output pixel stride */, - w146.data(), w147.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op39); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #39" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op39, xnn_delete_operator); - - xnn_operator_t op40 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 240 /* output_channels_per_group */, - 64 /* input pixel stride */, - 240 /* output pixel stride */, - w148.data(), w149.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op40); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #40" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op40, xnn_delete_operator); - - xnn_operator_t op41 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op41); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #41" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op41, xnn_delete_operator); - - xnn_operator_t op42 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 240 /* input channels per group */, - 40 /* 
output_channels_per_group */, - 240 /* input pixel stride */, - 40 /* output pixel stride */, - w150.data(), w151.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op42); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #42" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op42, xnn_delete_operator); - - xnn_operator_t op43 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op43); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #43" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op43, xnn_delete_operator); - - xnn_operator_t op44 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 120 /* output_channels_per_group */, - 40 /* input pixel stride */, - 120 /* output pixel stride */, - w152.data(), w153.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op44); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #44" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op44, xnn_delete_operator); - - xnn_operator_t op45 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op45); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #45" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op45, xnn_delete_operator); - - 
xnn_operator_t op46 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 120 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 120 /* input pixel stride */, - 120 /* output pixel stride */, - w154.data(), w155.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op46); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #46" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op46, xnn_delete_operator); - - xnn_operator_t op47 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op47); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #47" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op47, xnn_delete_operator); - - xnn_operator_t op48 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op48); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #48" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op48, xnn_delete_operator); - - xnn_operator_t op49 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 32 /* output_channels_per_group */, - 120 /* input pixel stride */, - 32 /* output pixel stride */, - 
w156.data(), w157.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op49); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #49" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op49, xnn_delete_operator); - - xnn_operator_t op50 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 120 /* output_channels_per_group */, - 32 /* input pixel stride */, - 120 /* output pixel stride */, - w158.data(), w159.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op50); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #50" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op50, xnn_delete_operator); - - xnn_operator_t op51 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op51); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #51" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op51, xnn_delete_operator); - - xnn_operator_t op52 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 48 /* output_channels_per_group */, - 120 /* input pixel stride */, - 
48 /* output pixel stride */, - w160.data(), w161.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op52); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #52" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op52, xnn_delete_operator); - - xnn_operator_t op53 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 48 /* input channels per group */, - 144 /* output_channels_per_group */, - 48 /* input pixel stride */, - 144 /* output pixel stride */, - w162.data(), w163.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op53); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #53" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op53, xnn_delete_operator); - - xnn_operator_t op54 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op54); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #54" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op54, xnn_delete_operator); - - xnn_operator_t op55 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 144 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 144 /* input pixel stride */, - 
144 /* output pixel stride */, - w164.data(), w165.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op55); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #55" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op55, xnn_delete_operator); - - xnn_operator_t op56 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op56); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #56" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op56, xnn_delete_operator); - - xnn_operator_t op57 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op57); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #57" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op57, xnn_delete_operator); - - xnn_operator_t op58 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 40 /* output_channels_per_group */, - 144 /* input pixel stride */, - 40 /* output pixel stride */, - w166.data(), w167.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op58); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #58" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op58, xnn_delete_operator); - - xnn_operator_t op59 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right 
padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 144 /* output_channels_per_group */, - 40 /* input pixel stride */, - 144 /* output pixel stride */, - w168.data(), w169.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op59); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #59" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op59, xnn_delete_operator); - - xnn_operator_t op60 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op60); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #60" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op60, xnn_delete_operator); - - xnn_operator_t op61 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 48 /* output_channels_per_group */, - 144 /* input pixel stride */, - 48 /* output pixel stride */, - w170.data(), w171.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op61); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #61" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op61, xnn_delete_operator); - - xnn_operator_t op62 = nullptr; - status = 
xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op62); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #62" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op62, xnn_delete_operator); - - xnn_operator_t op63 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 48 /* input channels per group */, - 288 /* output_channels_per_group */, - 48 /* input pixel stride */, - 288 /* output pixel stride */, - w172.data(), w173.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op63); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #63" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op63, xnn_delete_operator); - - xnn_operator_t op64 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op64); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #64" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op64, xnn_delete_operator); - - xnn_operator_t op65 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 1 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 1 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 288 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 288 /* input pixel stride */, - 288 /* output pixel stride */, - w174.data(), w175.data(), - 
-std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op65); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #65" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op65, xnn_delete_operator); - - xnn_operator_t op66 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op66); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #66" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op66, xnn_delete_operator); - - xnn_operator_t op67 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op67); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #67" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op67, xnn_delete_operator); - - xnn_operator_t op68 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 288 /* input channels per group */, - 72 /* output_channels_per_group */, - 288 /* input pixel stride */, - 72 /* output pixel stride */, - w176.data(), w177.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op68); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #68" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op68, xnn_delete_operator); - - xnn_operator_t op69 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 
1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 72 /* input channels per group */, - 288 /* output_channels_per_group */, - 72 /* input pixel stride */, - 288 /* output pixel stride */, - w178.data(), w179.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op69); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #69" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op69, xnn_delete_operator); - - xnn_operator_t op70 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op70); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #70" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op70, xnn_delete_operator); - - xnn_operator_t op71 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 288 /* input channels per group */, - 96 /* output_channels_per_group */, - 288 /* input pixel stride */, - 96 /* output pixel stride */, - w180.data(), w181.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op71); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #71" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op71, xnn_delete_operator); - - xnn_operator_t op72 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right 
padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - w182.data(), w183.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op72); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #72" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op72, xnn_delete_operator); - - xnn_operator_t op73 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op73); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #73" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op73, xnn_delete_operator); - - xnn_operator_t op74 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - w184.data(), w185.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op74); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #74" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op74, xnn_delete_operator); - - xnn_operator_t op75 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op75); - if 
(status != xnn_status_success) { - std::cerr << "failed to create operation #75" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op75, xnn_delete_operator); - - xnn_operator_t op76 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op76); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #76" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op76, xnn_delete_operator); - - xnn_operator_t op77 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 144 /* output_channels_per_group */, - 576 /* input pixel stride */, - 144 /* output pixel stride */, - w186.data(), w187.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op77); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #77" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op77, xnn_delete_operator); - - xnn_operator_t op78 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 576 /* output_channels_per_group */, - 144 /* input pixel stride */, - 576 /* output pixel stride */, - w188.data(), w189.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, 
- &op78); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #78" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op78, xnn_delete_operator); - - xnn_operator_t op79 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op79); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #79" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op79, xnn_delete_operator); - - xnn_operator_t op80 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 96 /* output_channels_per_group */, - 576 /* input pixel stride */, - 96 /* output pixel stride */, - w190.data(), w191.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op80); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #80" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op80, xnn_delete_operator); - - xnn_operator_t op81 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op81); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #81" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op81, xnn_delete_operator); - - xnn_operator_t op82 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 
/* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - w192.data(), w193.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op82); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #82" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op82, xnn_delete_operator); - - xnn_operator_t op83 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op83); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #83" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op83, xnn_delete_operator); - - xnn_operator_t op84 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - w194.data(), w195.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op84); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #84" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op84, xnn_delete_operator); - - xnn_operator_t op85 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op85); - if (status != xnn_status_success) { - std::cerr << "failed to create 
operation #85" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op85, xnn_delete_operator); - - xnn_operator_t op86 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op86); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #86" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op86, xnn_delete_operator); - - xnn_operator_t op87 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 144 /* output_channels_per_group */, - 576 /* input pixel stride */, - 144 /* output pixel stride */, - w196.data(), w197.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op87); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #87" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op87, xnn_delete_operator); - - xnn_operator_t op88 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 576 /* output_channels_per_group */, - 144 /* input pixel stride */, - 576 /* output pixel stride */, - w198.data(), w199.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op88); - if (status != xnn_status_success) { - std::cerr << 
"failed to create operation #88" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op88, xnn_delete_operator); - - xnn_operator_t op89 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op89); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #89" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op89, xnn_delete_operator); - - xnn_operator_t op90 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 96 /* output_channels_per_group */, - 576 /* input pixel stride */, - 96 /* output pixel stride */, - w200.data(), w201.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op90); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #90" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op90, xnn_delete_operator); - - xnn_operator_t op91 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op91); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #91" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op91, xnn_delete_operator); - - xnn_operator_t op92 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling 
height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - w202.data(), w203.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op92); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #92" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op92, xnn_delete_operator); - - xnn_operator_t op93 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op93); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #93" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op93, xnn_delete_operator); - - xnn_operator_t op94 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op94); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #94" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op94, xnn_delete_operator); - - xnn_operator_t op95 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 1024 /* output_channels_per_group */, - 576 /* input pixel stride */, - 1024 /* output pixel stride */, - w204.data(), w205.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op95); - if (status != xnn_status_success) { - std::cerr 
<< "failed to create operation #95" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op95, xnn_delete_operator); - - xnn_operator_t op96 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op96); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #96" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op96, xnn_delete_operator); - - xnn_operator_t op97 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op97); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #97" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op97, xnn_delete_operator); - - xnn_operator_t op98 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 1024 /* input channels per group */, - 1001 /* output_channels_per_group */, - 1024 /* input pixel stride */, - 1001 /* output pixel stride */, - w206.data(), w207.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - code_cache_ptr, - nullptr, - &op98); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #98" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op98, xnn_delete_operator); - - size_t op0_workspace_size = 0; - size_t op0_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - &op0_workspace_size, &op0_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - 
max_workspace_size = std::max(max_workspace_size, op0_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op1, - /*batch_size=*/12544, - 16 /* channels */, - 16 /* input stride */, - 16 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - size_t op2_workspace_size = 0; - size_t op2_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op2_workspace_size, &op2_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op2_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #2" << std::endl; - return ExecutionPlan(); - } - - size_t op3_workspace_size = 0; - size_t op3_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op3, - /*batch_size=*/1, 3136 /* width */, - 16 /* channels */, 16 /* input stride */, 16 /* output stride */, - &op3_workspace_size, &op3_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op3_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - size_t op4_workspace_size = 0; - size_t op4_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op4, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op4_workspace_size, &op4_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op4_workspace_size); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - size_t op5_workspace_size = 0; - size_t op5_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op5, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op5_workspace_size, &op5_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op5_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 56, 56, 16 }; - const size_t b_shape[] = { 1, 1, 1, 16 }; - status = xnn_reshape_multiply_nd_f16( - op6, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - size_t op7_workspace_size = 0; - size_t op7_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op7_workspace_size, &op7_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op7_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - size_t op8_workspace_size = 0; - size_t op8_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op8, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op8_workspace_size, &op8_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op8_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #8" 
<< std::endl; - return ExecutionPlan(); - } - - size_t op9_workspace_size = 0; - size_t op9_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op9, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op9_workspace_size, &op9_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op9_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - size_t op10_workspace_size = 0; - size_t op10_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op10, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op10_workspace_size, &op10_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op10_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - size_t op11_workspace_size = 0; - size_t op11_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op11, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op11_workspace_size, &op11_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op11_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - size_t op12_workspace_size = 0; - size_t op12_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op12, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op12_workspace_size, &op12_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - 
max_workspace_size = std::max(max_workspace_size, op12_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - size_t op13_workspace_size = 0; - size_t op13_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op13, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op13_workspace_size, &op13_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op13_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 24 }; - const size_t b_shape[] = { 1, 28, 28, 24 }; - status = xnn_reshape_add_nd_f16( - op14, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - size_t op15_workspace_size = 0; - size_t op15_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op15, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op15_workspace_size, &op15_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op15_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op16, - /*batch_size=*/784, - 96 /* channels */, - 96 /* input stride */, - 96 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - size_t op17_workspace_size = 0; - size_t op17_workspace_alignment = 0; - 
status = xnn_reshape_convolution2d_nhwc_f16( - op17, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op17_workspace_size, &op17_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op17_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op18, - /*batch_size=*/196, - 96 /* channels */, - 96 /* input stride */, - 96 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - size_t op19_workspace_size = 0; - size_t op19_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op19, - /*batch_size=*/1, 196 /* width */, - 96 /* channels */, 96 /* input stride */, 96 /* output stride */, - &op19_workspace_size, &op19_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op19_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - size_t op20_workspace_size = 0; - size_t op20_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op20, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op20_workspace_size, &op20_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op20_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - size_t op21_workspace_size = 0; - size_t op21_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op21, - /*batch_size=*/1, 
/*input_height=*/1, /*input_width=*/1, - &op21_workspace_size, &op21_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op21_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 96 }; - const size_t b_shape[] = { 1, 1, 1, 96 }; - status = xnn_reshape_multiply_nd_f16( - op22, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - size_t op23_workspace_size = 0; - size_t op23_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op23, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op23_workspace_size, &op23_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op23_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - size_t op24_workspace_size = 0; - size_t op24_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op24, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op24_workspace_size, &op24_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op24_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op25, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - size_t op26_workspace_size = 0; - size_t op26_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op26, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op26_workspace_size, &op26_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op26_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op27, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - size_t op28_workspace_size = 0; - size_t op28_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op28, - /*batch_size=*/1, 196 /* width */, - 240 /* channels */, 240 /* input stride */, 240 /* output stride */, - &op28_workspace_size, &op28_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op28_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - size_t op29_workspace_size = 0; - size_t op29_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op29, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op29_workspace_size, &op29_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op29_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #29" << 
std::endl; - return ExecutionPlan(); - } - - size_t op30_workspace_size = 0; - size_t op30_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op30, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op30_workspace_size, &op30_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op30_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #30" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 240 }; - const size_t b_shape[] = { 1, 1, 1, 240 }; - status = xnn_reshape_multiply_nd_f16( - op31, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #31" << std::endl; - return ExecutionPlan(); - } - - size_t op32_workspace_size = 0; - size_t op32_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op32, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op32_workspace_size, &op32_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op32_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #32" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 40 }; - const size_t b_shape[] = { 1, 14, 14, 40 }; - status = xnn_reshape_add_nd_f16( - op33, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #33" << std::endl; - return ExecutionPlan(); - } - - size_t op34_workspace_size = 0; - size_t op34_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op34, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - 
&op34_workspace_size, &op34_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op34_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op35, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #35" << std::endl; - return ExecutionPlan(); - } - - size_t op36_workspace_size = 0; - size_t op36_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op36, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op36_workspace_size, &op36_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op36_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op37, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #37" << std::endl; - return ExecutionPlan(); - } - - size_t op38_workspace_size = 0; - size_t op38_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op38, - /*batch_size=*/1, 196 /* width */, - 240 /* channels */, 240 /* input stride */, 240 /* output stride */, - &op38_workspace_size, &op38_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op38_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #38" << 
std::endl; - return ExecutionPlan(); - } - - size_t op39_workspace_size = 0; - size_t op39_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op39, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op39_workspace_size, &op39_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op39_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #39" << std::endl; - return ExecutionPlan(); - } - - size_t op40_workspace_size = 0; - size_t op40_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op40, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op40_workspace_size, &op40_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op40_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #40" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 240 }; - const size_t b_shape[] = { 1, 1, 1, 240 }; - status = xnn_reshape_multiply_nd_f16( - op41, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #41" << std::endl; - return ExecutionPlan(); - } - - size_t op42_workspace_size = 0; - size_t op42_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op42, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op42_workspace_size, &op42_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op42_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #42" << std::endl; - return ExecutionPlan(); - } - - { 
- const size_t a_shape[] = { 1, 14, 14, 40 }; - const size_t b_shape[] = { 1, 14, 14, 40 }; - status = xnn_reshape_add_nd_f16( - op43, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #43" << std::endl; - return ExecutionPlan(); - } - - size_t op44_workspace_size = 0; - size_t op44_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op44, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op44_workspace_size, &op44_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op44_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op45, - /*batch_size=*/196, - 120 /* channels */, - 120 /* input stride */, - 120 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #45" << std::endl; - return ExecutionPlan(); - } - - size_t op46_workspace_size = 0; - size_t op46_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op46, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op46_workspace_size, &op46_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op46_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op47, - /*batch_size=*/196, - 120 /* channels */, - 120 /* input stride */, - 120 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #47" << std::endl; - 
return ExecutionPlan(); - } - - size_t op48_workspace_size = 0; - size_t op48_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op48, - /*batch_size=*/1, 196 /* width */, - 120 /* channels */, 120 /* input stride */, 120 /* output stride */, - &op48_workspace_size, &op48_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op48_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #48" << std::endl; - return ExecutionPlan(); - } - - size_t op49_workspace_size = 0; - size_t op49_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op49, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op49_workspace_size, &op49_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op49_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #49" << std::endl; - return ExecutionPlan(); - } - - size_t op50_workspace_size = 0; - size_t op50_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op50, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op50_workspace_size, &op50_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op50_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #50" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 120 }; - const size_t b_shape[] = { 1, 1, 1, 120 }; - status = xnn_reshape_multiply_nd_f16( - op51, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #51" << std::endl; - return ExecutionPlan(); - } - - size_t 
op52_workspace_size = 0; - size_t op52_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op52, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op52_workspace_size, &op52_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op52_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #52" << std::endl; - return ExecutionPlan(); - } - - size_t op53_workspace_size = 0; - size_t op53_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op53, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op53_workspace_size, &op53_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op53_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op54, - /*batch_size=*/196, - 144 /* channels */, - 144 /* input stride */, - 144 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #54" << std::endl; - return ExecutionPlan(); - } - - size_t op55_workspace_size = 0; - size_t op55_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op55, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op55_workspace_size, &op55_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op55_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op56, - /*batch_size=*/196, - 144 /* 
channels */, - 144 /* input stride */, - 144 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #56" << std::endl; - return ExecutionPlan(); - } - - size_t op57_workspace_size = 0; - size_t op57_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op57, - /*batch_size=*/1, 196 /* width */, - 144 /* channels */, 144 /* input stride */, 144 /* output stride */, - &op57_workspace_size, &op57_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op57_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #57" << std::endl; - return ExecutionPlan(); - } - - size_t op58_workspace_size = 0; - size_t op58_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op58, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op58_workspace_size, &op58_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op58_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #58" << std::endl; - return ExecutionPlan(); - } - - size_t op59_workspace_size = 0; - size_t op59_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op59, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op59_workspace_size, &op59_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op59_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #59" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 144 }; - const size_t b_shape[] = { 1, 1, 1, 144 }; - status = xnn_reshape_multiply_nd_f16( - op60, - 4, a_shape, 4, 
b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #60" << std::endl; - return ExecutionPlan(); - } - - size_t op61_workspace_size = 0; - size_t op61_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op61, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op61_workspace_size, &op61_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op61_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #61" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 48 }; - const size_t b_shape[] = { 1, 14, 14, 48 }; - status = xnn_reshape_add_nd_f16( - op62, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #62" << std::endl; - return ExecutionPlan(); - } - - size_t op63_workspace_size = 0; - size_t op63_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op63, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op63_workspace_size, &op63_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op63_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #63" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op64, - /*batch_size=*/196, - 288 /* channels */, - 288 /* input stride */, - 288 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #64" << std::endl; - return ExecutionPlan(); - } - - size_t op65_workspace_size = 0; - size_t op65_workspace_alignment = 0; - status = 
xnn_reshape_convolution2d_nhwc_f16( - op65, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op65_workspace_size, &op65_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op65_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #65" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op66, - /*batch_size=*/49, - 288 /* channels */, - 288 /* input stride */, - 288 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #66" << std::endl; - return ExecutionPlan(); - } - - size_t op67_workspace_size = 0; - size_t op67_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op67, - /*batch_size=*/1, 49 /* width */, - 288 /* channels */, 288 /* input stride */, 288 /* output stride */, - &op67_workspace_size, &op67_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op67_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #67" << std::endl; - return ExecutionPlan(); - } - - size_t op68_workspace_size = 0; - size_t op68_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op68, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op68_workspace_size, &op68_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op68_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #68" << std::endl; - return ExecutionPlan(); - } - - size_t op69_workspace_size = 0; - size_t op69_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op69, - /*batch_size=*/1, 
/*input_height=*/1, /*input_width=*/1, - &op69_workspace_size, &op69_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op69_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #69" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 288 }; - const size_t b_shape[] = { 1, 1, 1, 288 }; - status = xnn_reshape_multiply_nd_f16( - op70, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #70" << std::endl; - return ExecutionPlan(); - } - - size_t op71_workspace_size = 0; - size_t op71_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op71, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op71_workspace_size, &op71_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op71_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #71" << std::endl; - return ExecutionPlan(); - } - - size_t op72_workspace_size = 0; - size_t op72_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op72, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op72_workspace_size, &op72_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op72_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #72" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op73, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #73" << std::endl; - return ExecutionPlan(); - } - - size_t op74_workspace_size = 0; - size_t op74_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op74, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op74_workspace_size, &op74_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op74_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #74" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op75, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #75" << std::endl; - return ExecutionPlan(); - } - - size_t op76_workspace_size = 0; - size_t op76_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op76, - /*batch_size=*/1, 49 /* width */, - 576 /* channels */, 576 /* input stride */, 576 /* output stride */, - &op76_workspace_size, &op76_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op76_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #76" << std::endl; - return ExecutionPlan(); - } - - size_t op77_workspace_size = 0; - size_t op77_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op77, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op77_workspace_size, &op77_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op77_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #77" << std::endl; 
- return ExecutionPlan(); - } - - size_t op78_workspace_size = 0; - size_t op78_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op78, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op78_workspace_size, &op78_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op78_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #78" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 576 }; - const size_t b_shape[] = { 1, 1, 1, 576 }; - status = xnn_reshape_multiply_nd_f16( - op79, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #79" << std::endl; - return ExecutionPlan(); - } - - size_t op80_workspace_size = 0; - size_t op80_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op80, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op80_workspace_size, &op80_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op80_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #80" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 96 }; - const size_t b_shape[] = { 1, 7, 7, 96 }; - status = xnn_reshape_add_nd_f16( - op81, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #81" << std::endl; - return ExecutionPlan(); - } - - size_t op82_workspace_size = 0; - size_t op82_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op82, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op82_workspace_size, 
&op82_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op82_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #82" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op83, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #83" << std::endl; - return ExecutionPlan(); - } - - size_t op84_workspace_size = 0; - size_t op84_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op84, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op84_workspace_size, &op84_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op84_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #84" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op85, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #85" << std::endl; - return ExecutionPlan(); - } - - size_t op86_workspace_size = 0; - size_t op86_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op86, - /*batch_size=*/1, 49 /* width */, - 576 /* channels */, 576 /* input stride */, 576 /* output stride */, - &op86_workspace_size, &op86_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op86_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #86" << std::endl; - return 
ExecutionPlan(); - } - - size_t op87_workspace_size = 0; - size_t op87_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op87, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op87_workspace_size, &op87_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op87_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #87" << std::endl; - return ExecutionPlan(); - } - - size_t op88_workspace_size = 0; - size_t op88_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op88, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op88_workspace_size, &op88_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op88_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #88" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 576 }; - const size_t b_shape[] = { 1, 1, 1, 576 }; - status = xnn_reshape_multiply_nd_f16( - op89, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #89" << std::endl; - return ExecutionPlan(); - } - - size_t op90_workspace_size = 0; - size_t op90_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op90, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op90_workspace_size, &op90_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op90_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #90" << std::endl; - return ExecutionPlan(); - } - - { - const size_t 
a_shape[] = { 1, 7, 7, 96 }; - const size_t b_shape[] = { 1, 7, 7, 96 }; - status = xnn_reshape_add_nd_f16( - op91, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #91" << std::endl; - return ExecutionPlan(); - } - - size_t op92_workspace_size = 0; - size_t op92_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op92, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op92_workspace_size, &op92_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op92_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #92" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op93, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #93" << std::endl; - return ExecutionPlan(); - } - - size_t op94_workspace_size = 0; - size_t op94_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op94, - /*batch_size=*/1, 49 /* width */, - 576 /* channels */, 576 /* input stride */, 576 /* output stride */, - &op94_workspace_size, &op94_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op94_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #94" << std::endl; - return ExecutionPlan(); - } - - size_t op95_workspace_size = 0; - size_t op95_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op95, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op95_workspace_size, &op95_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - 
/*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op95_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #95" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op96, - /*batch_size=*/1, - 1024 /* channels */, - 1024 /* input stride */, - 1024 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #96" << std::endl; - return ExecutionPlan(); - } - - size_t op97_workspace_size = 0; - size_t op97_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op97, - /*batch_size=*/1, 1 /* width */, - 1024 /* channels */, 1024 /* input stride */, 1024 /* output stride */, - &op97_workspace_size, &op97_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op97_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #97" << std::endl; - return ExecutionPlan(); - } - - size_t op98_workspace_size = 0; - size_t op98_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op98, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op98_workspace_size, &op98_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op98_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #98" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nhwc_f16( - op0, - workspace.data(), /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op1, - /*input=*/v1.data(), /*output=*/v2.data()); 
- if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op2, - workspace.data(), /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op3, - workspace.data(), - /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op4, - workspace.data(), /*input=*/v4.data(), /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op5, - workspace.data(), /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op6, - v3.data() /* a */, v6.data() /* b */, /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op7, - workspace.data(), /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op8, - workspace.data(), /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op9, - workspace.data(), /*input=*/v9.data(), /*output=*/v10.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op10, - workspace.data(), /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op11, - workspace.data(), /*input=*/v11.data(), /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op12, - workspace.data(), /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op13, - workspace.data(), /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op14, - v14.data() /* a */, v11.data() /* b */, /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op15, - workspace.data(), /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op16, - /*input=*/v16.data(), /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op17, - workspace.data(), /*input=*/v17.data(), /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << 
"failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op18, - /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op19, - workspace.data(), - /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op20, - workspace.data(), /*input=*/v20.data(), /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op21, - workspace.data(), /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op22, - v19.data() /* a */, v22.data() /* b */, /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op23, - workspace.data(), /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op24, - workspace.data(), /*input=*/v24.data(), /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op25, - /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << 
std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op26, - workspace.data(), /*input=*/v26.data(), /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op27, - /*input=*/v27.data(), /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op28, - workspace.data(), - /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op29, - workspace.data(), /*input=*/v29.data(), /*output=*/v30.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op30, - workspace.data(), /*input=*/v30.data(), /*output=*/v31.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #30" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op31, - v28.data() /* a */, v31.data() /* b */, /*output=*/v32.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op32, - workspace.data(), /*input=*/v32.data(), /*output=*/v33.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #32" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op33, - v33.data() /* a */, v24.data() /* b */, /*output=*/v34.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #33" << std::endl; - return 
ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op34, - workspace.data(), /*input=*/v34.data(), /*output=*/v35.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op35, - /*input=*/v35.data(), /*output=*/v36.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op36, - workspace.data(), /*input=*/v36.data(), /*output=*/v37.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op37, - /*input=*/v37.data(), /*output=*/v38.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op38, - workspace.data(), - /*input=*/v38.data(), /*output=*/v39.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op39, - workspace.data(), /*input=*/v39.data(), /*output=*/v40.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op40, - workspace.data(), /*input=*/v40.data(), /*output=*/v41.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op41, - v38.data() /* a */, v41.data() /* b */, /*output=*/v42.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #41" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nhwc_f16( - op42, - workspace.data(), /*input=*/v42.data(), /*output=*/v43.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op43, - v43.data() /* a */, v34.data() /* b */, /*output=*/v44.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op44, - workspace.data(), /*input=*/v44.data(), /*output=*/v45.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op45, - /*input=*/v45.data(), /*output=*/v46.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #45" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op46, - workspace.data(), /*input=*/v46.data(), /*output=*/v47.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op47, - /*input=*/v47.data(), /*output=*/v48.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op48, - workspace.data(), - /*input=*/v48.data(), /*output=*/v49.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op49, - workspace.data(), /*input=*/v49.data(), /*output=*/v50.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #49" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op50, - 
workspace.data(), /*input=*/v50.data(), /*output=*/v51.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #50" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op51, - v48.data() /* a */, v51.data() /* b */, /*output=*/v52.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op52, - workspace.data(), /*input=*/v52.data(), /*output=*/v53.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op53, - workspace.data(), /*input=*/v53.data(), /*output=*/v54.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op54, - /*input=*/v54.data(), /*output=*/v55.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op55, - workspace.data(), /*input=*/v55.data(), /*output=*/v56.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op56, - /*input=*/v56.data(), /*output=*/v57.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #56" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op57, - workspace.data(), - /*input=*/v57.data(), /*output=*/v58.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op58, - workspace.data(), /*input=*/v58.data(), 
/*output=*/v59.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op59, - workspace.data(), /*input=*/v59.data(), /*output=*/v60.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #59" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op60, - v57.data() /* a */, v60.data() /* b */, /*output=*/v61.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op61, - workspace.data(), /*input=*/v61.data(), /*output=*/v62.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #61" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op62, - v62.data() /* a */, v53.data() /* b */, /*output=*/v63.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #62" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op63, - workspace.data(), /*input=*/v63.data(), /*output=*/v64.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #63" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op64, - /*input=*/v64.data(), /*output=*/v65.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #64" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op65, - workspace.data(), /*input=*/v65.data(), /*output=*/v66.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #65" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op66, - /*input=*/v66.data(), /*output=*/v67.data()); - if (status != xnn_status_success) { - 
std::cerr << "failed to setup operation #66" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op67, - workspace.data(), - /*input=*/v67.data(), /*output=*/v68.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #67" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op68, - workspace.data(), /*input=*/v68.data(), /*output=*/v69.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #68" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op69, - workspace.data(), /*input=*/v69.data(), /*output=*/v70.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #69" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op70, - v67.data() /* a */, v70.data() /* b */, /*output=*/v71.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #70" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op71, - workspace.data(), /*input=*/v71.data(), /*output=*/v72.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #71" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op72, - workspace.data(), /*input=*/v72.data(), /*output=*/v73.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #72" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op73, - /*input=*/v73.data(), /*output=*/v74.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #73" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op74, - workspace.data(), /*input=*/v74.data(), /*output=*/v75.data()); - if (status != xnn_status_success) { - std::cerr << "failed 
to setup operation #74" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op75, - /*input=*/v75.data(), /*output=*/v76.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #75" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op76, - workspace.data(), - /*input=*/v76.data(), /*output=*/v77.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #76" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op77, - workspace.data(), /*input=*/v77.data(), /*output=*/v78.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #77" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op78, - workspace.data(), /*input=*/v78.data(), /*output=*/v79.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #78" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op79, - v76.data() /* a */, v79.data() /* b */, /*output=*/v80.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #79" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op80, - workspace.data(), /*input=*/v80.data(), /*output=*/v81.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #80" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op81, - v81.data() /* a */, v72.data() /* b */, /*output=*/v82.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #81" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op82, - workspace.data(), /*input=*/v82.data(), /*output=*/v83.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #82" << 
std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op83, - /*input=*/v83.data(), /*output=*/v84.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #83" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op84, - workspace.data(), /*input=*/v84.data(), /*output=*/v85.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #84" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op85, - /*input=*/v85.data(), /*output=*/v86.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #85" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op86, - workspace.data(), - /*input=*/v86.data(), /*output=*/v87.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #86" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op87, - workspace.data(), /*input=*/v87.data(), /*output=*/v88.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #87" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op88, - workspace.data(), /*input=*/v88.data(), /*output=*/v89.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #88" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op89, - v86.data() /* a */, v89.data() /* b */, /*output=*/v90.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #89" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op90, - workspace.data(), /*input=*/v90.data(), /*output=*/v91.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #90" << std::endl; - return ExecutionPlan(); - } - - 
status = xnn_setup_add_nd_f16( - op91, - v91.data() /* a */, v82.data() /* b */, /*output=*/v92.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #91" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op92, - workspace.data(), /*input=*/v92.data(), /*output=*/v93.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #92" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op93, - /*input=*/v93.data(), /*output=*/v94.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #93" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op94, - workspace.data(), - /*input=*/v94.data(), /*output=*/v95.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #94" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op95, - workspace.data(), /*input=*/v95.data(), /*output=*/v96.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #95" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op96, - /*input=*/v96.data(), /*output=*/v97.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #96" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op97, - workspace.data(), - /*input=*/v97.data(), /*output=*/v98.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #97" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op98, - workspace.data(), /*input=*/v98.data(), /*output=*/v99.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #98" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic 
push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -ExecutionPlan FP16MobileNetV3Small(pthreadpool_t threadpool) { - return FP16MobileNetV3Small(/*use_jit=*/false, threadpool); -} - -ExecutionPlan FP16MobileNetV3SmallJit(pthreadpool_t threadpool) { - return FP16MobileNetV3Small(/*use_jit=*/true, threadpool); -} - -} // namespace models diff --git a/models/fp16-sparse-mobilenet-v1.cc b/models/fp16-sparse-mobilenet-v1.cc deleted file mode 100644 index 0a586f9cbaf..00000000000 --- a/models/fp16-sparse-mobilenet-v1.cc +++ /dev/null @@ -1,1448 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan FP16SparseMobileNetV1(float sparsity, pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - 
alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static std::array w30; - alignas(16) static std::array w31; - alignas(16) static std::array w32; - alignas(16) static std::array w33; - alignas(16) static std::array w34; - alignas(16) static std::array w35; - alignas(16) static std::array w36; - alignas(16) static std::array w37; - alignas(16) static std::array w38; - alignas(16) static std::array w39; - alignas(16) static std::array w40; - alignas(16) static std::array w41; - alignas(16) static std::array w42; - alignas(16) static std::array w43; - alignas(16) static std::array w44; - alignas(16) static std::array w45; - alignas(16) static std::array w46; - alignas(16) static std::array w47; - alignas(16) static std::array w48; - alignas(16) static std::array w49; - alignas(16) static std::array w50; - alignas(16) static std::array w51; - alignas(16) static std::array w52; - alignas(16) static std::array w53; - alignas(16) static std::array w54; - alignas(16) static std::array w55; - alignas(16) static std::array w56; - alignas(16) static std::array w57; - alignas(16) static std::array w58; - alignas(16) static std::array w59; - alignas(16) static std::array w60; - alignas(16) static std::array w61; - alignas(16) static std::array w62; - alignas(16) static std::array w63; - alignas(16) static std::array w64; - alignas(16) static std::array w65; - alignas(16) static std::array w66; - alignas(16) static std::array w67; - alignas(16) static std::array w68; - alignas(16) static std::array w69; - alignas(16) static std::array w70; - alignas(16) static std::array w71; - alignas(16) static std::array w72; - alignas(16) static std::array w73; - alignas(16) static std::array w74; - alignas(16) static std::array w75; - alignas(16) static std::array w76; - 
alignas(16) static std::array w77; - alignas(16) static std::array w78; - alignas(16) static std::array w79; - alignas(16) static std::array w80; - alignas(16) static std::array w81; - alignas(16) static std::array w82; - alignas(16) static std::array w83; - alignas(16) static std::array w84; - alignas(16) static std::array w85; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); - std::generate(v0.begin(), v0.end(), f32rng); - std::generate(v1.begin(), v1.end(), f32rng); - std::generate(v2.begin(), v2.end(), f32rng); - std::generate(v3.begin(), v3.end(), f32rng); - std::generate(v4.begin(), v4.end(), f32rng); - std::generate(v5.begin(), v5.end(), f32rng); - std::generate(v6.begin(), v6.end(), f32rng); - std::generate(v7.begin(), v7.end(), f32rng); - std::generate(v8.begin(), v8.end(), f32rng); - std::generate(v9.begin(), v9.end(), f32rng); - std::generate(v10.begin(), v10.end(), f32rng); - std::generate(v11.begin(), v11.end(), f32rng); - std::generate(v12.begin(), v12.end(), f32rng); - std::generate(v13.begin(), v13.end(), f32rng); - std::generate(v14.begin(), v14.end(), f32rng); - std::generate(v15.begin(), v15.end(), f32rng); - std::generate(v16.begin(), v16.end(), f32rng); - std::generate(v17.begin(), v17.end(), f32rng); - std::generate(v18.begin(), v18.end(), f32rng); - std::generate(v19.begin(), v19.end(), f32rng); - std::generate(v20.begin(), v20.end(), f32rng); - std::generate(v21.begin(), v21.end(), f32rng); - std::generate(v22.begin(), v22.end(), f32rng); - std::generate(v23.begin(), v23.end(), f32rng); - std::generate(v24.begin(), v24.end(), f32rng); - std::generate(v25.begin(), v25.end(), f32rng); - std::generate(v26.begin(), v26.end(), f32rng); - std::generate(v27.begin(), v27.end(), f32rng); - std::generate(v28.begin(), v28.end(), f32rng); - std::generate(v29.begin(), v29.end(), f32rng); - std::generate(w30.begin(), w30.end(), f32rng); 
- std::generate(w31.begin(), w31.end(), f32rng); - std::generate(w32.begin(), w32.end(), f32rng); - std::generate(w33.begin(), w33.end(), f32rng); - std::fill(w34.begin(), w34.end(), 0); - std::generate(w34.begin(), w34.end() - size_t(sparsity * w34.size()), f32rng); - std::shuffle(w34.begin(), w34.end(), rng); - std::generate(w35.begin(), w35.end(), f32rng); - std::generate(w36.begin(), w36.end(), f32rng); - std::generate(w37.begin(), w37.end(), f32rng); - std::fill(w38.begin(), w38.end(), 0); - std::generate(w38.begin(), w38.end() - size_t(sparsity * w38.size()), f32rng); - std::shuffle(w38.begin(), w38.end(), rng); - std::generate(w39.begin(), w39.end(), f32rng); - std::generate(w40.begin(), w40.end(), f32rng); - std::generate(w41.begin(), w41.end(), f32rng); - std::fill(w42.begin(), w42.end(), 0); - std::generate(w42.begin(), w42.end() - size_t(sparsity * w42.size()), f32rng); - std::shuffle(w42.begin(), w42.end(), rng); - std::generate(w43.begin(), w43.end(), f32rng); - std::generate(w44.begin(), w44.end(), f32rng); - std::generate(w45.begin(), w45.end(), f32rng); - std::fill(w46.begin(), w46.end(), 0); - std::generate(w46.begin(), w46.end() - size_t(sparsity * w46.size()), f32rng); - std::shuffle(w46.begin(), w46.end(), rng); - std::generate(w47.begin(), w47.end(), f32rng); - std::generate(w48.begin(), w48.end(), f32rng); - std::generate(w49.begin(), w49.end(), f32rng); - std::fill(w50.begin(), w50.end(), 0); - std::generate(w50.begin(), w50.end() - size_t(sparsity * w50.size()), f32rng); - std::shuffle(w50.begin(), w50.end(), rng); - std::generate(w51.begin(), w51.end(), f32rng); - std::generate(w52.begin(), w52.end(), f32rng); - std::generate(w53.begin(), w53.end(), f32rng); - std::fill(w54.begin(), w54.end(), 0); - std::generate(w54.begin(), w54.end() - size_t(sparsity * w54.size()), f32rng); - std::shuffle(w54.begin(), w54.end(), rng); - std::generate(w55.begin(), w55.end(), f32rng); - std::generate(w56.begin(), w56.end(), f32rng); - 
std::generate(w57.begin(), w57.end(), f32rng); - std::fill(w58.begin(), w58.end(), 0); - std::generate(w58.begin(), w58.end() - size_t(sparsity * w58.size()), f32rng); - std::shuffle(w58.begin(), w58.end(), rng); - std::generate(w59.begin(), w59.end(), f32rng); - std::generate(w60.begin(), w60.end(), f32rng); - std::generate(w61.begin(), w61.end(), f32rng); - std::fill(w62.begin(), w62.end(), 0); - std::generate(w62.begin(), w62.end() - size_t(sparsity * w62.size()), f32rng); - std::shuffle(w62.begin(), w62.end(), rng); - std::generate(w63.begin(), w63.end(), f32rng); - std::generate(w64.begin(), w64.end(), f32rng); - std::generate(w65.begin(), w65.end(), f32rng); - std::fill(w66.begin(), w66.end(), 0); - std::generate(w66.begin(), w66.end() - size_t(sparsity * w66.size()), f32rng); - std::shuffle(w66.begin(), w66.end(), rng); - std::generate(w67.begin(), w67.end(), f32rng); - std::generate(w68.begin(), w68.end(), f32rng); - std::generate(w69.begin(), w69.end(), f32rng); - std::fill(w70.begin(), w70.end(), 0); - std::generate(w70.begin(), w70.end() - size_t(sparsity * w70.size()), f32rng); - std::shuffle(w70.begin(), w70.end(), rng); - std::generate(w71.begin(), w71.end(), f32rng); - std::generate(w72.begin(), w72.end(), f32rng); - std::generate(w73.begin(), w73.end(), f32rng); - std::fill(w74.begin(), w74.end(), 0); - std::generate(w74.begin(), w74.end() - size_t(sparsity * w74.size()), f32rng); - std::shuffle(w74.begin(), w74.end(), rng); - std::generate(w75.begin(), w75.end(), f32rng); - std::generate(w76.begin(), w76.end(), f32rng); - std::generate(w77.begin(), w77.end(), f32rng); - std::fill(w78.begin(), w78.end(), 0); - std::generate(w78.begin(), w78.end() - size_t(sparsity * w78.size()), f32rng); - std::shuffle(w78.begin(), w78.end(), rng); - std::generate(w79.begin(), w79.end(), f32rng); - std::generate(w80.begin(), w80.end(), f32rng); - std::generate(w81.begin(), w81.end(), f32rng); - std::fill(w82.begin(), w82.end(), 0); - std::generate(w82.begin(), 
w82.end() - size_t(sparsity * w82.size()), f32rng); - std::shuffle(w82.begin(), w82.end(), rng); - std::generate(w83.begin(), w83.end(), f32rng); - std::generate(w84.begin(), w84.end(), f32rng); - std::generate(w85.begin(), w85.end(), f32rng); - - Operators operators; - xnn_status status; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 3 /* input channels per group */, - 32 /* output_channels_per_group */, - 3 /* input pixel stride */, - 32 /* output pixel stride */, - w30.data(), w31.data(), - 0.0f /* output min */, 6.0f /* output max */, - XNN_FLAG_INPUT_NHWC /* flags */, - nullptr, - nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 32 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 32 /* input pixel stride */, - 32 /* output pixel stride */, - w32.data(), w33.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 
/* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 64 /* output_channels_per_group */, - 32 /* input pixel stride */, - 64 /* output pixel stride */, - w34.data(), w35.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 64 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 64 /* input pixel stride */, - 64 /* output pixel stride */, - w36.data(), w37.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 128 /* output_channels_per_group */, - 64 /* input pixel stride */, - 128 /* output pixel stride */, - w38.data(), 
w39.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 128 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 128 /* input pixel stride */, - 128 /* output pixel stride */, - w40.data(), w41.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 128 /* input channels per group */, - 128 /* output_channels_per_group */, - 128 /* input pixel stride */, - 128 /* output pixel stride */, - w42.data(), w43.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 
1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 128 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 128 /* input pixel stride */, - 128 /* output pixel stride */, - w44.data(), w45.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 128 /* input channels per group */, - 256 /* output_channels_per_group */, - 128 /* input pixel stride */, - 256 /* output pixel stride */, - w46.data(), w47.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 256 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 256 /* input pixel stride */, - 256 /* output pixel stride */, - w48.data(), w49.data(), - 0.0f /* output min */, 6.0f /* 
output max */, - 0 /* flags */, - nullptr, - nullptr, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 256 /* input channels per group */, - 256 /* output_channels_per_group */, - 256 /* input pixel stride */, - 256 /* output pixel stride */, - w50.data(), w51.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 256 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 256 /* input pixel stride */, - 256 /* output pixel stride */, - w52.data(), w53.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left 
padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 256 /* input channels per group */, - 512 /* output_channels_per_group */, - 256 /* input pixel stride */, - 512 /* output pixel stride */, - w54.data(), w55.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w56.data(), w57.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w58.data(), w59.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* 
flags */, - nullptr, - nullptr, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w60.data(), w61.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w62.data(), w63.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* 
kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w64.data(), w65.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w66.data(), w67.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w68.data(), w69.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - 
nullptr, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w70.data(), w71.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w72.data(), w73.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* 
kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w74.data(), w75.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w76.data(), w77.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 1024 /* output_channels_per_group */, - 512 /* input pixel stride */, - 1024 /* output pixel stride */, - w78.data(), w79.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op24); 
- if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1024 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 1024 /* input pixel stride */, - 1024 /* output pixel stride */, - w80.data(), w81.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 1024 /* input channels per group */, - 1024 /* output_channels_per_group */, - 1024 /* input pixel stride */, - 1024 /* output pixel stride */, - w82.data(), w83.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op27); - if (status != 
xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 1024 /* input channels per group */, - 1001 /* output_channels_per_group */, - 1024 /* input pixel stride */, - 1001 /* output pixel stride */, - w84.data(), w85.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - status = xnn_reshape_convolution2d_nchw_f16( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op1, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation 
#2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op3, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op4, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op5, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op6, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op8, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - 
std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op9, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op10, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op11, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op12, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op13, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op14, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - 
/*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op15, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op16, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op17, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op18, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op19, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op20, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - 
/*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op21, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op22, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op23, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op24, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op25, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op26, - 
/*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op27, - /*batch_size=*/1, 49 /* width */, - 1024 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - size_t op28_workspace_size = 0; - size_t op28_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op28, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op28_workspace_size, &op28_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op28_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nchw_f16( - op0, - /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op1, - /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op2, - /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op3, - /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to 
setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op4, - /*input=*/v4.data(), /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op5, - /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op6, - /*input=*/v6.data(), /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op7, - /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op8, - /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op9, - /*input=*/v9.data(), /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op10, - /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op11, - /*input=*/v11.data(), /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op12, - /*input=*/v12.data(), /*output=*/v13.data()); 
- if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op13, - /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op14, - /*input=*/v14.data(), /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op15, - /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op16, - /*input=*/v16.data(), /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op17, - /*input=*/v17.data(), /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op18, - /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op19, - /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op20, - /*input=*/v20.data(), /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nchw_f16( - op21, - /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op22, - /*input=*/v22.data(), /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op23, - /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op24, - /*input=*/v24.data(), /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op25, - /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op26, - /*input=*/v26.data(), /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f16( - op27, - /*input=*/v27.data(), /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op28, - workspace.data(), - /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return 
ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -} // namespace models diff --git a/models/fp16-sparse-mobilenet-v2.cc b/models/fp16-sparse-mobilenet-v2.cc deleted file mode 100644 index 202778806d0..00000000000 --- a/models/fp16-sparse-mobilenet-v2.cc +++ /dev/null @@ -1,3032 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan FP16SparseMobileNetV2(float sparsity, pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static std::array v30; - alignas(16) static std::array v31; - alignas(16) static 
std::array v32; - alignas(16) static std::array v33; - alignas(16) static std::array v34; - alignas(16) static std::array v35; - alignas(16) static std::array v36; - alignas(16) static std::array v37; - alignas(16) static std::array v38; - alignas(16) static std::array v39; - alignas(16) static std::array v40; - alignas(16) static std::array v41; - alignas(16) static std::array v42; - alignas(16) static std::array v43; - alignas(16) static std::array v44; - alignas(16) static std::array v45; - alignas(16) static std::array v46; - alignas(16) static std::array v47; - alignas(16) static std::array v48; - alignas(16) static std::array v49; - alignas(16) static std::array v50; - alignas(16) static std::array v51; - alignas(16) static std::array v52; - alignas(16) static std::array v53; - alignas(16) static std::array v54; - alignas(16) static std::array v55; - alignas(16) static std::array v56; - alignas(16) static std::array v57; - alignas(16) static std::array v58; - alignas(16) static std::array v59; - alignas(16) static std::array v60; - alignas(16) static std::array v61; - alignas(16) static std::array v62; - alignas(16) static std::array v63; - alignas(16) static std::array v64; - alignas(16) static std::array w65; - alignas(16) static std::array w66; - alignas(16) static std::array w67; - alignas(16) static std::array w68; - alignas(16) static std::array w69; - alignas(16) static std::array w70; - alignas(16) static std::array w71; - alignas(16) static std::array w72; - alignas(16) static std::array w73; - alignas(16) static std::array w74; - alignas(16) static std::array w75; - alignas(16) static std::array w76; - alignas(16) static std::array w77; - alignas(16) static std::array w78; - alignas(16) static std::array w79; - alignas(16) static std::array w80; - alignas(16) static std::array w81; - alignas(16) static std::array w82; - alignas(16) static std::array w83; - alignas(16) static std::array w84; - alignas(16) static std::array w85; - alignas(16) static 
std::array w86; - alignas(16) static std::array w87; - alignas(16) static std::array w88; - alignas(16) static std::array w89; - alignas(16) static std::array w90; - alignas(16) static std::array w91; - alignas(16) static std::array w92; - alignas(16) static std::array w93; - alignas(16) static std::array w94; - alignas(16) static std::array w95; - alignas(16) static std::array w96; - alignas(16) static std::array w97; - alignas(16) static std::array w98; - alignas(16) static std::array w99; - alignas(16) static std::array w100; - alignas(16) static std::array w101; - alignas(16) static std::array w102; - alignas(16) static std::array w103; - alignas(16) static std::array w104; - alignas(16) static std::array w105; - alignas(16) static std::array w106; - alignas(16) static std::array w107; - alignas(16) static std::array w108; - alignas(16) static std::array w109; - alignas(16) static std::array w110; - alignas(16) static std::array w111; - alignas(16) static std::array w112; - alignas(16) static std::array w113; - alignas(16) static std::array w114; - alignas(16) static std::array w115; - alignas(16) static std::array w116; - alignas(16) static std::array w117; - alignas(16) static std::array w118; - alignas(16) static std::array w119; - alignas(16) static std::array w120; - alignas(16) static std::array w121; - alignas(16) static std::array w122; - alignas(16) static std::array w123; - alignas(16) static std::array w124; - alignas(16) static std::array w125; - alignas(16) static std::array w126; - alignas(16) static std::array w127; - alignas(16) static std::array w128; - alignas(16) static std::array w129; - alignas(16) static std::array w130; - alignas(16) static std::array w131; - alignas(16) static std::array w132; - alignas(16) static std::array w133; - alignas(16) static std::array w134; - alignas(16) static std::array w135; - alignas(16) static std::array w136; - alignas(16) static std::array w137; - alignas(16) static std::array w138; - alignas(16) static 
std::array w139; - alignas(16) static std::array w140; - alignas(16) static std::array w141; - alignas(16) static std::array w142; - alignas(16) static std::array w143; - alignas(16) static std::array w144; - alignas(16) static std::array w145; - alignas(16) static std::array w146; - alignas(16) static std::array w147; - alignas(16) static std::array w148; - alignas(16) static std::array w149; - alignas(16) static std::array w150; - alignas(16) static std::array w151; - alignas(16) static std::array w152; - alignas(16) static std::array w153; - alignas(16) static std::array w154; - alignas(16) static std::array w155; - alignas(16) static std::array w156; - alignas(16) static std::array w157; - alignas(16) static std::array w158; - alignas(16) static std::array w159; - alignas(16) static std::array w160; - alignas(16) static std::array w161; - alignas(16) static std::array w162; - alignas(16) static std::array w163; - alignas(16) static std::array w164; - alignas(16) static std::array w165; - alignas(16) static std::array w166; - alignas(16) static std::array w167; - alignas(16) static std::array w168; - alignas(16) static std::array w169; - alignas(16) static std::array w170; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); - std::generate(v0.begin(), v0.end(), f32rng); - std::generate(v1.begin(), v1.end(), f32rng); - std::generate(v2.begin(), v2.end(), f32rng); - std::generate(v3.begin(), v3.end(), f32rng); - std::generate(v4.begin(), v4.end(), f32rng); - std::generate(v5.begin(), v5.end(), f32rng); - std::generate(v6.begin(), v6.end(), f32rng); - std::generate(v7.begin(), v7.end(), f32rng); - std::generate(v8.begin(), v8.end(), f32rng); - std::generate(v9.begin(), v9.end(), f32rng); - std::generate(v10.begin(), v10.end(), f32rng); - std::generate(v11.begin(), v11.end(), f32rng); - std::generate(v12.begin(), v12.end(), f32rng); - 
std::generate(v13.begin(), v13.end(), f32rng); - std::generate(v14.begin(), v14.end(), f32rng); - std::generate(v15.begin(), v15.end(), f32rng); - std::generate(v16.begin(), v16.end(), f32rng); - std::generate(v17.begin(), v17.end(), f32rng); - std::generate(v18.begin(), v18.end(), f32rng); - std::generate(v19.begin(), v19.end(), f32rng); - std::generate(v20.begin(), v20.end(), f32rng); - std::generate(v21.begin(), v21.end(), f32rng); - std::generate(v22.begin(), v22.end(), f32rng); - std::generate(v23.begin(), v23.end(), f32rng); - std::generate(v24.begin(), v24.end(), f32rng); - std::generate(v25.begin(), v25.end(), f32rng); - std::generate(v26.begin(), v26.end(), f32rng); - std::generate(v27.begin(), v27.end(), f32rng); - std::generate(v28.begin(), v28.end(), f32rng); - std::generate(v29.begin(), v29.end(), f32rng); - std::generate(v30.begin(), v30.end(), f32rng); - std::generate(v31.begin(), v31.end(), f32rng); - std::generate(v32.begin(), v32.end(), f32rng); - std::generate(v33.begin(), v33.end(), f32rng); - std::generate(v34.begin(), v34.end(), f32rng); - std::generate(v35.begin(), v35.end(), f32rng); - std::generate(v36.begin(), v36.end(), f32rng); - std::generate(v37.begin(), v37.end(), f32rng); - std::generate(v38.begin(), v38.end(), f32rng); - std::generate(v39.begin(), v39.end(), f32rng); - std::generate(v40.begin(), v40.end(), f32rng); - std::generate(v41.begin(), v41.end(), f32rng); - std::generate(v42.begin(), v42.end(), f32rng); - std::generate(v43.begin(), v43.end(), f32rng); - std::generate(v44.begin(), v44.end(), f32rng); - std::generate(v45.begin(), v45.end(), f32rng); - std::generate(v46.begin(), v46.end(), f32rng); - std::generate(v47.begin(), v47.end(), f32rng); - std::generate(v48.begin(), v48.end(), f32rng); - std::generate(v49.begin(), v49.end(), f32rng); - std::generate(v50.begin(), v50.end(), f32rng); - std::generate(v51.begin(), v51.end(), f32rng); - std::generate(v52.begin(), v52.end(), f32rng); - std::generate(v53.begin(), v53.end(), 
f32rng); - std::generate(v54.begin(), v54.end(), f32rng); - std::generate(v55.begin(), v55.end(), f32rng); - std::generate(v56.begin(), v56.end(), f32rng); - std::generate(v57.begin(), v57.end(), f32rng); - std::generate(v58.begin(), v58.end(), f32rng); - std::generate(v59.begin(), v59.end(), f32rng); - std::generate(v60.begin(), v60.end(), f32rng); - std::generate(v61.begin(), v61.end(), f32rng); - std::generate(v62.begin(), v62.end(), f32rng); - std::generate(v63.begin(), v63.end(), f32rng); - std::generate(v64.begin(), v64.end(), f32rng); - std::generate(w65.begin(), w65.end(), f32rng); - std::generate(w66.begin(), w66.end(), f32rng); - std::generate(w67.begin(), w67.end(), f32rng); - std::generate(w68.begin(), w68.end(), f32rng); - std::fill(w69.begin(), w69.end(), 0); - std::generate(w69.begin(), w69.end() - size_t(sparsity * w69.size()), f32rng); - std::shuffle(w69.begin(), w69.end(), rng); - std::generate(w70.begin(), w70.end(), f32rng); - std::fill(w71.begin(), w71.end(), 0); - std::generate(w71.begin(), w71.end() - size_t(sparsity * w71.size()), f32rng); - std::shuffle(w71.begin(), w71.end(), rng); - std::generate(w72.begin(), w72.end(), f32rng); - std::generate(w73.begin(), w73.end(), f32rng); - std::generate(w74.begin(), w74.end(), f32rng); - std::fill(w75.begin(), w75.end(), 0); - std::generate(w75.begin(), w75.end() - size_t(sparsity * w75.size()), f32rng); - std::shuffle(w75.begin(), w75.end(), rng); - std::generate(w76.begin(), w76.end(), f32rng); - std::fill(w77.begin(), w77.end(), 0); - std::generate(w77.begin(), w77.end() - size_t(sparsity * w77.size()), f32rng); - std::shuffle(w77.begin(), w77.end(), rng); - std::generate(w78.begin(), w78.end(), f32rng); - std::generate(w79.begin(), w79.end(), f32rng); - std::generate(w80.begin(), w80.end(), f32rng); - std::fill(w81.begin(), w81.end(), 0); - std::generate(w81.begin(), w81.end() - size_t(sparsity * w81.size()), f32rng); - std::shuffle(w81.begin(), w81.end(), rng); - std::generate(w82.begin(), 
w82.end(), f32rng); - std::fill(w83.begin(), w83.end(), 0); - std::generate(w83.begin(), w83.end() - size_t(sparsity * w83.size()), f32rng); - std::shuffle(w83.begin(), w83.end(), rng); - std::generate(w84.begin(), w84.end(), f32rng); - std::generate(w85.begin(), w85.end(), f32rng); - std::generate(w86.begin(), w86.end(), f32rng); - std::fill(w87.begin(), w87.end(), 0); - std::generate(w87.begin(), w87.end() - size_t(sparsity * w87.size()), f32rng); - std::shuffle(w87.begin(), w87.end(), rng); - std::generate(w88.begin(), w88.end(), f32rng); - std::fill(w89.begin(), w89.end(), 0); - std::generate(w89.begin(), w89.end() - size_t(sparsity * w89.size()), f32rng); - std::shuffle(w89.begin(), w89.end(), rng); - std::generate(w90.begin(), w90.end(), f32rng); - std::generate(w91.begin(), w91.end(), f32rng); - std::generate(w92.begin(), w92.end(), f32rng); - std::fill(w93.begin(), w93.end(), 0); - std::generate(w93.begin(), w93.end() - size_t(sparsity * w93.size()), f32rng); - std::shuffle(w93.begin(), w93.end(), rng); - std::generate(w94.begin(), w94.end(), f32rng); - std::fill(w95.begin(), w95.end(), 0); - std::generate(w95.begin(), w95.end() - size_t(sparsity * w95.size()), f32rng); - std::shuffle(w95.begin(), w95.end(), rng); - std::generate(w96.begin(), w96.end(), f32rng); - std::generate(w97.begin(), w97.end(), f32rng); - std::generate(w98.begin(), w98.end(), f32rng); - std::fill(w99.begin(), w99.end(), 0); - std::generate(w99.begin(), w99.end() - size_t(sparsity * w99.size()), f32rng); - std::shuffle(w99.begin(), w99.end(), rng); - std::generate(w100.begin(), w100.end(), f32rng); - std::fill(w101.begin(), w101.end(), 0); - std::generate(w101.begin(), w101.end() - size_t(sparsity * w101.size()), f32rng); - std::shuffle(w101.begin(), w101.end(), rng); - std::generate(w102.begin(), w102.end(), f32rng); - std::generate(w103.begin(), w103.end(), f32rng); - std::generate(w104.begin(), w104.end(), f32rng); - std::fill(w105.begin(), w105.end(), 0); - 
std::generate(w105.begin(), w105.end() - size_t(sparsity * w105.size()), f32rng); - std::shuffle(w105.begin(), w105.end(), rng); - std::generate(w106.begin(), w106.end(), f32rng); - std::fill(w107.begin(), w107.end(), 0); - std::generate(w107.begin(), w107.end() - size_t(sparsity * w107.size()), f32rng); - std::shuffle(w107.begin(), w107.end(), rng); - std::generate(w108.begin(), w108.end(), f32rng); - std::generate(w109.begin(), w109.end(), f32rng); - std::generate(w110.begin(), w110.end(), f32rng); - std::fill(w111.begin(), w111.end(), 0); - std::generate(w111.begin(), w111.end() - size_t(sparsity * w111.size()), f32rng); - std::shuffle(w111.begin(), w111.end(), rng); - std::generate(w112.begin(), w112.end(), f32rng); - std::fill(w113.begin(), w113.end(), 0); - std::generate(w113.begin(), w113.end() - size_t(sparsity * w113.size()), f32rng); - std::shuffle(w113.begin(), w113.end(), rng); - std::generate(w114.begin(), w114.end(), f32rng); - std::generate(w115.begin(), w115.end(), f32rng); - std::generate(w116.begin(), w116.end(), f32rng); - std::fill(w117.begin(), w117.end(), 0); - std::generate(w117.begin(), w117.end() - size_t(sparsity * w117.size()), f32rng); - std::shuffle(w117.begin(), w117.end(), rng); - std::generate(w118.begin(), w118.end(), f32rng); - std::fill(w119.begin(), w119.end(), 0); - std::generate(w119.begin(), w119.end() - size_t(sparsity * w119.size()), f32rng); - std::shuffle(w119.begin(), w119.end(), rng); - std::generate(w120.begin(), w120.end(), f32rng); - std::generate(w121.begin(), w121.end(), f32rng); - std::generate(w122.begin(), w122.end(), f32rng); - std::fill(w123.begin(), w123.end(), 0); - std::generate(w123.begin(), w123.end() - size_t(sparsity * w123.size()), f32rng); - std::shuffle(w123.begin(), w123.end(), rng); - std::generate(w124.begin(), w124.end(), f32rng); - std::fill(w125.begin(), w125.end(), 0); - std::generate(w125.begin(), w125.end() - size_t(sparsity * w125.size()), f32rng); - std::shuffle(w125.begin(), w125.end(), 
rng); - std::generate(w126.begin(), w126.end(), f32rng); - std::generate(w127.begin(), w127.end(), f32rng); - std::generate(w128.begin(), w128.end(), f32rng); - std::fill(w129.begin(), w129.end(), 0); - std::generate(w129.begin(), w129.end() - size_t(sparsity * w129.size()), f32rng); - std::shuffle(w129.begin(), w129.end(), rng); - std::generate(w130.begin(), w130.end(), f32rng); - std::fill(w131.begin(), w131.end(), 0); - std::generate(w131.begin(), w131.end() - size_t(sparsity * w131.size()), f32rng); - std::shuffle(w131.begin(), w131.end(), rng); - std::generate(w132.begin(), w132.end(), f32rng); - std::generate(w133.begin(), w133.end(), f32rng); - std::generate(w134.begin(), w134.end(), f32rng); - std::fill(w135.begin(), w135.end(), 0); - std::generate(w135.begin(), w135.end() - size_t(sparsity * w135.size()), f32rng); - std::shuffle(w135.begin(), w135.end(), rng); - std::generate(w136.begin(), w136.end(), f32rng); - std::fill(w137.begin(), w137.end(), 0); - std::generate(w137.begin(), w137.end() - size_t(sparsity * w137.size()), f32rng); - std::shuffle(w137.begin(), w137.end(), rng); - std::generate(w138.begin(), w138.end(), f32rng); - std::generate(w139.begin(), w139.end(), f32rng); - std::generate(w140.begin(), w140.end(), f32rng); - std::fill(w141.begin(), w141.end(), 0); - std::generate(w141.begin(), w141.end() - size_t(sparsity * w141.size()), f32rng); - std::shuffle(w141.begin(), w141.end(), rng); - std::generate(w142.begin(), w142.end(), f32rng); - std::fill(w143.begin(), w143.end(), 0); - std::generate(w143.begin(), w143.end() - size_t(sparsity * w143.size()), f32rng); - std::shuffle(w143.begin(), w143.end(), rng); - std::generate(w144.begin(), w144.end(), f32rng); - std::generate(w145.begin(), w145.end(), f32rng); - std::generate(w146.begin(), w146.end(), f32rng); - std::fill(w147.begin(), w147.end(), 0); - std::generate(w147.begin(), w147.end() - size_t(sparsity * w147.size()), f32rng); - std::shuffle(w147.begin(), w147.end(), rng); - 
std::generate(w148.begin(), w148.end(), f32rng); - std::fill(w149.begin(), w149.end(), 0); - std::generate(w149.begin(), w149.end() - size_t(sparsity * w149.size()), f32rng); - std::shuffle(w149.begin(), w149.end(), rng); - std::generate(w150.begin(), w150.end(), f32rng); - std::generate(w151.begin(), w151.end(), f32rng); - std::generate(w152.begin(), w152.end(), f32rng); - std::fill(w153.begin(), w153.end(), 0); - std::generate(w153.begin(), w153.end() - size_t(sparsity * w153.size()), f32rng); - std::shuffle(w153.begin(), w153.end(), rng); - std::generate(w154.begin(), w154.end(), f32rng); - std::fill(w155.begin(), w155.end(), 0); - std::generate(w155.begin(), w155.end() - size_t(sparsity * w155.size()), f32rng); - std::shuffle(w155.begin(), w155.end(), rng); - std::generate(w156.begin(), w156.end(), f32rng); - std::generate(w157.begin(), w157.end(), f32rng); - std::generate(w158.begin(), w158.end(), f32rng); - std::fill(w159.begin(), w159.end(), 0); - std::generate(w159.begin(), w159.end() - size_t(sparsity * w159.size()), f32rng); - std::shuffle(w159.begin(), w159.end(), rng); - std::generate(w160.begin(), w160.end(), f32rng); - std::fill(w161.begin(), w161.end(), 0); - std::generate(w161.begin(), w161.end() - size_t(sparsity * w161.size()), f32rng); - std::shuffle(w161.begin(), w161.end(), rng); - std::generate(w162.begin(), w162.end(), f32rng); - std::generate(w163.begin(), w163.end(), f32rng); - std::generate(w164.begin(), w164.end(), f32rng); - std::fill(w165.begin(), w165.end(), 0); - std::generate(w165.begin(), w165.end() - size_t(sparsity * w165.size()), f32rng); - std::shuffle(w165.begin(), w165.end(), rng); - std::generate(w166.begin(), w166.end(), f32rng); - std::fill(w167.begin(), w167.end(), 0); - std::generate(w167.begin(), w167.end() - size_t(sparsity * w167.size()), f32rng); - std::shuffle(w167.begin(), w167.end(), rng); - std::generate(w168.begin(), w168.end(), f32rng); - std::fill(w169.begin(), w169.end(), 0); - std::generate(w169.begin(), 
w169.end() - size_t(sparsity * w169.size()), f32rng); - std::shuffle(w169.begin(), w169.end(), rng); - std::generate(w170.begin(), w170.end(), f32rng); - - Operators operators; - xnn_status status; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 3 /* input channels per group */, - 32 /* output_channels_per_group */, - 3 /* input pixel stride */, - 32 /* output pixel stride */, - w65.data(), w66.data(), - 0.0f /* output min */, 6.0f /* output max */, - XNN_FLAG_INPUT_NHWC /* flags */, - nullptr, - nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 32 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 32 /* input pixel stride */, - 32 /* output pixel stride */, - w67.data(), w68.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 
/* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 16 /* output_channels_per_group */, - 32 /* input pixel stride */, - 16 /* output pixel stride */, - w69.data(), w70.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 16 /* input channels per group */, - 96 /* output_channels_per_group */, - 16 /* input pixel stride */, - 96 /* output pixel stride */, - w71.data(), w72.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 96 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 96 /* input pixel stride */, - 96 /* output pixel stride */, - w73.data(), w74.data(), - 0.0f /* output min */, 6.0f /* 
output max */, - 0 /* flags */, - nullptr, - nullptr, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 24 /* output_channels_per_group */, - 96 /* input pixel stride */, - 24 /* output pixel stride */, - w75.data(), w76.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 144 /* output_channels_per_group */, - 24 /* input pixel stride */, - 144 /* output pixel stride */, - w77.data(), w78.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding 
*/, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 144 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 144 /* input pixel stride */, - 144 /* output pixel stride */, - w79.data(), w80.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 24 /* output_channels_per_group */, - 144 /* input pixel stride */, - 24 /* output pixel stride */, - w81.data(), w82.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* 
right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 144 /* output_channels_per_group */, - 24 /* input pixel stride */, - 144 /* output pixel stride */, - w83.data(), w84.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 144 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 144 /* input pixel stride */, - 144 /* output pixel stride */, - w85.data(), w86.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 32 /* output_channels_per_group */, - 144 /* input pixel stride */, - 32 /* output pixel stride */, - w87.data(), w88.data(), - 
-std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 192 /* output_channels_per_group */, - 32 /* input pixel stride */, - 192 /* output pixel stride */, - w89.data(), w90.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 192 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 192 /* input pixel stride */, - 192 /* output pixel stride */, - w91.data(), w92.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = 
xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 192 /* input channels per group */, - 32 /* output_channels_per_group */, - 192 /* input pixel stride */, - 32 /* output pixel stride */, - w93.data(), w94.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 192 /* output_channels_per_group */, - 32 /* input pixel stride */, - 192 /* output pixel stride */, - w95.data(), w96.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t 
op18 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 192 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 192 /* input pixel stride */, - 192 /* output pixel stride */, - w97.data(), w98.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 192 /* input channels per group */, - 32 /* output_channels_per_group */, - 192 /* input pixel stride */, - 32 /* output pixel stride */, - w99.data(), w100.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, 
xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 192 /* output_channels_per_group */, - 32 /* input pixel stride */, - 192 /* output pixel stride */, - w101.data(), w102.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 192 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 192 /* input pixel stride */, - 192 /* output pixel stride */, - w103.data(), w104.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 192 /* input channels per group */, 
- 64 /* output_channels_per_group */, - 192 /* input pixel stride */, - 64 /* output pixel stride */, - w105.data(), w106.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 384 /* output_channels_per_group */, - 64 /* input pixel stride */, - 384 /* output pixel stride */, - w107.data(), w108.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 384 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 384 /* input pixel stride */, - 384 /* output pixel stride */, - w109.data(), w110.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); 
- } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 384 /* input channels per group */, - 64 /* output_channels_per_group */, - 384 /* input pixel stride */, - 64 /* output pixel stride */, - w111.data(), w112.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 384 /* output_channels_per_group */, - 64 /* input pixel stride */, - 384 /* output pixel stride */, - w113.data(), w114.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << 
std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - xnn_operator_t op29 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 384 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 384 /* input pixel stride */, - 384 /* output pixel stride */, - w115.data(), w116.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op29); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #29" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op29, xnn_delete_operator); - - xnn_operator_t op30 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 384 /* input channels per group */, - 64 /* output_channels_per_group */, - 384 /* input pixel stride */, - 64 /* output pixel stride */, - w117.data(), w118.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op30); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #30" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op30, xnn_delete_operator); - - xnn_operator_t op31 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op31); - if (status != xnn_status_success) { - std::cerr << 
"failed to create operation #31" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op31, xnn_delete_operator); - - xnn_operator_t op32 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 384 /* output_channels_per_group */, - 64 /* input pixel stride */, - 384 /* output pixel stride */, - w119.data(), w120.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op32); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #32" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op32, xnn_delete_operator); - - xnn_operator_t op33 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 384 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 384 /* input pixel stride */, - 384 /* output pixel stride */, - w121.data(), w122.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op33); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #33" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op33, xnn_delete_operator); - - xnn_operator_t op34 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width 
*/, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 384 /* input channels per group */, - 64 /* output_channels_per_group */, - 384 /* input pixel stride */, - 64 /* output pixel stride */, - w123.data(), w124.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op34); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #34" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op34, xnn_delete_operator); - - xnn_operator_t op35 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op35); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #35" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op35, xnn_delete_operator); - - xnn_operator_t op36 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 384 /* output_channels_per_group */, - 64 /* input pixel stride */, - 384 /* output pixel stride */, - w125.data(), w126.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op36); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #36" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op36, xnn_delete_operator); - - xnn_operator_t op37 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling 
height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 384 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 384 /* input pixel stride */, - 384 /* output pixel stride */, - w127.data(), w128.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op37); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #37" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op37, xnn_delete_operator); - - xnn_operator_t op38 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 384 /* input channels per group */, - 96 /* output_channels_per_group */, - 384 /* input pixel stride */, - 96 /* output pixel stride */, - w129.data(), w130.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op38); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #38" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op38, xnn_delete_operator); - - xnn_operator_t op39 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - w131.data(), w132.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - 
nullptr, - &op39); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #39" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op39, xnn_delete_operator); - - xnn_operator_t op40 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - w133.data(), w134.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op40); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #40" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op40, xnn_delete_operator); - - xnn_operator_t op41 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 96 /* output_channels_per_group */, - 576 /* input pixel stride */, - 96 /* output pixel stride */, - w135.data(), w136.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op41); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #41" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op41, xnn_delete_operator); - - xnn_operator_t op42 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, 
std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op42); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #42" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op42, xnn_delete_operator); - - xnn_operator_t op43 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - w137.data(), w138.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op43); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #43" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op43, xnn_delete_operator); - - xnn_operator_t op44 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - w139.data(), w140.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op44); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #44" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op44, xnn_delete_operator); - - xnn_operator_t op45 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom 
padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 96 /* output_channels_per_group */, - 576 /* input pixel stride */, - 96 /* output pixel stride */, - w141.data(), w142.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op45); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #45" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op45, xnn_delete_operator); - - xnn_operator_t op46 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op46); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #46" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op46, xnn_delete_operator); - - xnn_operator_t op47 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - w143.data(), w144.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op47); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #47" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op47, xnn_delete_operator); - - xnn_operator_t op48 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* 
right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - w145.data(), w146.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op48); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #48" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op48, xnn_delete_operator); - - xnn_operator_t op49 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 160 /* output_channels_per_group */, - 576 /* input pixel stride */, - 160 /* output pixel stride */, - w147.data(), w148.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op49); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #49" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op49, xnn_delete_operator); - - xnn_operator_t op50 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 
960 /* output pixel stride */, - w149.data(), w150.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op50); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #50" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op50, xnn_delete_operator); - - xnn_operator_t op51 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, - 960 /* output pixel stride */, - w151.data(), w152.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op51); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #51" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op51, xnn_delete_operator); - - xnn_operator_t op52 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 160 /* output_channels_per_group */, - 960 /* input pixel stride */, - 160 /* output pixel stride */, - w153.data(), w154.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op52); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #52" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op52, xnn_delete_operator); - - 
xnn_operator_t op53 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op53); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #53" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op53, xnn_delete_operator); - - xnn_operator_t op54 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - w155.data(), w156.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op54); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #54" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op54, xnn_delete_operator); - - xnn_operator_t op55 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, - 960 /* output pixel stride */, - w157.data(), w158.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op55); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #55" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op55, xnn_delete_operator); - - xnn_operator_t op56 = 
nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 160 /* output_channels_per_group */, - 960 /* input pixel stride */, - 160 /* output pixel stride */, - w159.data(), w160.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op56); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #56" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op56, xnn_delete_operator); - - xnn_operator_t op57 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op57); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #57" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op57, xnn_delete_operator); - - xnn_operator_t op58 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - w161.data(), w162.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op58); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #58" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op58, 
xnn_delete_operator); - - xnn_operator_t op59 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, - 960 /* output pixel stride */, - w163.data(), w164.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op59); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #59" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op59, xnn_delete_operator); - - xnn_operator_t op60 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 320 /* output_channels_per_group */, - 960 /* input pixel stride */, - 320 /* output pixel stride */, - w165.data(), w166.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op60); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #60" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op60, xnn_delete_operator); - - xnn_operator_t op61 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width 
*/, - 1 /* groups */, - 320 /* input channels per group */, - 1280 /* output_channels_per_group */, - 320 /* input pixel stride */, - 1280 /* output pixel stride */, - w167.data(), w168.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op61); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #61" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op61, xnn_delete_operator); - - xnn_operator_t op62 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op62); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #62" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op62, xnn_delete_operator); - - xnn_operator_t op63 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 1280 /* input channels per group */, - 1001 /* output_channels_per_group */, - 1280 /* input pixel stride */, - 1001 /* output pixel stride */, - w169.data(), w170.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op63); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #63" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op63, xnn_delete_operator); - - status = xnn_reshape_convolution2d_nchw_f16( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << 
std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op1, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op3, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op4, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op5, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op6, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - 
std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op8, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 24, 56, 56 }; - const size_t b_shape[] = { 1, 24, 56, 56 }; - status = xnn_reshape_add_nd_f16( - op9, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op10, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op11, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op12, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if 
(status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op13, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op14, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op15, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 32, 28, 28 }; - const size_t b_shape[] = { 1, 32, 28, 28 }; - status = xnn_reshape_add_nd_f16( - op16, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op17, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op18, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, 
/*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op19, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 32, 28, 28 }; - const size_t b_shape[] = { 1, 32, 28, 28 }; - status = xnn_reshape_add_nd_f16( - op20, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op21, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op22, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op23, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op24, - /*batch_size=*/1, /*input_height=*/14, 
/*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op25, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op26, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 64, 14, 14 }; - const size_t b_shape[] = { 1, 64, 14, 14 }; - status = xnn_reshape_add_nd_f16( - op27, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op28, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op29, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - 
op30, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #30" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 64, 14, 14 }; - const size_t b_shape[] = { 1, 64, 14, 14 }; - status = xnn_reshape_add_nd_f16( - op31, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op32, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #32" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op33, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #33" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op34, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #34" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 64, 14, 14 }; - const size_t b_shape[] = { 1, 64, 14, 14 }; - status = xnn_reshape_add_nd_f16( - op35, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #35" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_reshape_convolution2d_nchw_f16( - op36, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op37, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op38, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op39, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op40, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op41, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #41" << 
std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 96, 14, 14 }; - const size_t b_shape[] = { 1, 96, 14, 14 }; - status = xnn_reshape_add_nd_f16( - op42, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op43, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op44, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op45, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #45" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 96, 14, 14 }; - const size_t b_shape[] = { 1, 96, 14, 14 }; - status = xnn_reshape_add_nd_f16( - op46, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op47, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << 
"failed to reshape operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op48, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op49, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #49" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op50, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #50" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op51, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op52, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #52" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 160, 7, 7 }; - const size_t b_shape[] = { 1, 160, 7, 7 }; - status = xnn_reshape_add_nd_f16( - op53, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op54, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op55, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op56, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #56" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 160, 7, 7 }; - const size_t b_shape[] = { 1, 160, 7, 7 }; - status = xnn_reshape_add_nd_f16( - op57, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op58, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op59, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - 
/*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #59" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op60, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op61, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #61" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op62, - /*batch_size=*/1, 49 /* width */, - 1280 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #62" << std::endl; - return ExecutionPlan(); - } - - size_t op63_workspace_size = 0; - size_t op63_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op63, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op63_workspace_size, &op63_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op63_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #63" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nchw_f16( - op0, - /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op1, - 
/*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op2, - /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op3, - /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op4, - /*input=*/v4.data(), /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op5, - /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op6, - /*input=*/v6.data(), /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op7, - /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op8, - /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op9, - v9.data() /* a */, v6.data() /* b */, /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - 
} - - status = xnn_setup_convolution2d_nchw_f16( - op10, - /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op11, - /*input=*/v11.data(), /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op12, - /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op13, - /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op14, - /*input=*/v14.data(), /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op15, - /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op16, - v16.data() /* a */, v13.data() /* b */, /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op17, - /*input=*/v17.data(), /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op18, - /*input=*/v18.data(), /*output=*/v19.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op19, - /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op20, - v20.data() /* a */, v17.data() /* b */, /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op21, - /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op22, - /*input=*/v22.data(), /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op23, - /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op24, - /*input=*/v24.data(), /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op25, - /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op26, - /*input=*/v26.data(), /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_add_nd_f16( - op27, - v27.data() /* a */, v24.data() /* b */, /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op28, - /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op29, - /*input=*/v29.data(), /*output=*/v30.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op30, - /*input=*/v30.data(), /*output=*/v31.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #30" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op31, - v31.data() /* a */, v28.data() /* b */, /*output=*/v32.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op32, - /*input=*/v32.data(), /*output=*/v33.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #32" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op33, - /*input=*/v33.data(), /*output=*/v34.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #33" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op34, - /*input=*/v34.data(), /*output=*/v35.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op35, - v35.data() /* a */, v32.data() /* b */, /*output=*/v36.data()); - if (status != xnn_status_success) 
{ - std::cerr << "failed to setup operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op36, - /*input=*/v36.data(), /*output=*/v37.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op37, - /*input=*/v37.data(), /*output=*/v38.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op38, - /*input=*/v38.data(), /*output=*/v39.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op39, - /*input=*/v39.data(), /*output=*/v40.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op40, - /*input=*/v40.data(), /*output=*/v41.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op41, - /*input=*/v41.data(), /*output=*/v42.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #41" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op42, - v42.data() /* a */, v39.data() /* b */, /*output=*/v43.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op43, - /*input=*/v43.data(), /*output=*/v44.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #43" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nchw_f16( - op44, - /*input=*/v44.data(), /*output=*/v45.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op45, - /*input=*/v45.data(), /*output=*/v46.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #45" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op46, - v46.data() /* a */, v43.data() /* b */, /*output=*/v47.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op47, - /*input=*/v47.data(), /*output=*/v48.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op48, - /*input=*/v48.data(), /*output=*/v49.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op49, - /*input=*/v49.data(), /*output=*/v50.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #49" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op50, - /*input=*/v50.data(), /*output=*/v51.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #50" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op51, - /*input=*/v51.data(), /*output=*/v52.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op52, - /*input=*/v52.data(), /*output=*/v53.data()); - if (status != xnn_status_success) { - 
std::cerr << "failed to setup operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op53, - v53.data() /* a */, v50.data() /* b */, /*output=*/v54.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op54, - /*input=*/v54.data(), /*output=*/v55.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op55, - /*input=*/v55.data(), /*output=*/v56.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op56, - /*input=*/v56.data(), /*output=*/v57.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #56" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op57, - v57.data() /* a */, v54.data() /* b */, /*output=*/v58.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op58, - /*input=*/v58.data(), /*output=*/v59.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op59, - /*input=*/v59.data(), /*output=*/v60.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #59" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op60, - /*input=*/v60.data(), /*output=*/v61.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #60" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nchw_f16( - op61, - /*input=*/v61.data(), /*output=*/v62.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #61" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f16( - op62, - /*input=*/v62.data(), /*output=*/v63.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #62" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op63, - workspace.data(), - /*input=*/v63.data(), /*output=*/v64.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #63" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -} // namespace models diff --git a/models/fp16-sparse-mobilenet-v3-large.cc b/models/fp16-sparse-mobilenet-v3-large.cc deleted file mode 100644 index f81d66d69c4..00000000000 --- a/models/fp16-sparse-mobilenet-v3-large.cc +++ /dev/null @@ -1,4814 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan FP16SparseMobileNetV3Large(float sparsity, pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static std::array v30; - alignas(16) static std::array v31; - alignas(16) static std::array v32; - alignas(16) static std::array v33; - alignas(16) static std::array v34; - alignas(16) static std::array v35; - alignas(16) static std::array v36; - alignas(16) static std::array v37; - alignas(16) static std::array v38; - alignas(16) static std::array v39; - alignas(16) static std::array v40; - alignas(16) static std::array v41; - alignas(16) static std::array v42; - alignas(16) static std::array v43; - alignas(16) static std::array v44; - alignas(16) static std::array 
v45; - alignas(16) static std::array v46; - alignas(16) static std::array v47; - alignas(16) static std::array v48; - alignas(16) static std::array v49; - alignas(16) static std::array v50; - alignas(16) static std::array v51; - alignas(16) static std::array v52; - alignas(16) static std::array v53; - alignas(16) static std::array v54; - alignas(16) static std::array v55; - alignas(16) static std::array v56; - alignas(16) static std::array v57; - alignas(16) static std::array v58; - alignas(16) static std::array v59; - alignas(16) static std::array v60; - alignas(16) static std::array v61; - alignas(16) static std::array v62; - alignas(16) static std::array v63; - alignas(16) static std::array v64; - alignas(16) static std::array v65; - alignas(16) static std::array v66; - alignas(16) static std::array v67; - alignas(16) static std::array v68; - alignas(16) static std::array v69; - alignas(16) static std::array v70; - alignas(16) static std::array v71; - alignas(16) static std::array v72; - alignas(16) static std::array v73; - alignas(16) static std::array v74; - alignas(16) static std::array v75; - alignas(16) static std::array v76; - alignas(16) static std::array v77; - alignas(16) static std::array v78; - alignas(16) static std::array v79; - alignas(16) static std::array v80; - alignas(16) static std::array v81; - alignas(16) static std::array v82; - alignas(16) static std::array v83; - alignas(16) static std::array v84; - alignas(16) static std::array v85; - alignas(16) static std::array v86; - alignas(16) static std::array v87; - alignas(16) static std::array v88; - alignas(16) static std::array v89; - alignas(16) static std::array v90; - alignas(16) static std::array v91; - alignas(16) static std::array v92; - alignas(16) static std::array v93; - alignas(16) static std::array v94; - alignas(16) static std::array v95; - alignas(16) static std::array v96; - alignas(16) static std::array v97; - alignas(16) static std::array v98; - alignas(16) static std::array 
v99; - alignas(16) static std::array v100; - alignas(16) static std::array v101; - alignas(16) static std::array v102; - alignas(16) static std::array v103; - alignas(16) static std::array v104; - alignas(16) static std::array v105; - alignas(16) static std::array v106; - alignas(16) static std::array v107; - alignas(16) static std::array v108; - alignas(16) static std::array v109; - alignas(16) static std::array v110; - alignas(16) static std::array v111; - alignas(16) static std::array v112; - alignas(16) static std::array v113; - alignas(16) static std::array w114; - alignas(16) static std::array w115; - alignas(16) static std::array w116; - alignas(16) static std::array w117; - alignas(16) static std::array w118; - alignas(16) static std::array w119; - alignas(16) static std::array w120; - alignas(16) static std::array w121; - alignas(16) static std::array w122; - alignas(16) static std::array w123; - alignas(16) static std::array w124; - alignas(16) static std::array w125; - alignas(16) static std::array w126; - alignas(16) static std::array w127; - alignas(16) static std::array w128; - alignas(16) static std::array w129; - alignas(16) static std::array w130; - alignas(16) static std::array w131; - alignas(16) static std::array w132; - alignas(16) static std::array w133; - alignas(16) static std::array w134; - alignas(16) static std::array w135; - alignas(16) static std::array w136; - alignas(16) static std::array w137; - alignas(16) static std::array w138; - alignas(16) static std::array w139; - alignas(16) static std::array w140; - alignas(16) static std::array w141; - alignas(16) static std::array w142; - alignas(16) static std::array w143; - alignas(16) static std::array w144; - alignas(16) static std::array w145; - alignas(16) static std::array w146; - alignas(16) static std::array w147; - alignas(16) static std::array w148; - alignas(16) static std::array w149; - alignas(16) static std::array w150; - alignas(16) static std::array w151; - alignas(16) 
static std::array w152; - alignas(16) static std::array w153; - alignas(16) static std::array w154; - alignas(16) static std::array w155; - alignas(16) static std::array w156; - alignas(16) static std::array w157; - alignas(16) static std::array w158; - alignas(16) static std::array w159; - alignas(16) static std::array w160; - alignas(16) static std::array w161; - alignas(16) static std::array w162; - alignas(16) static std::array w163; - alignas(16) static std::array w164; - alignas(16) static std::array w165; - alignas(16) static std::array w166; - alignas(16) static std::array w167; - alignas(16) static std::array w168; - alignas(16) static std::array w169; - alignas(16) static std::array w170; - alignas(16) static std::array w171; - alignas(16) static std::array w172; - alignas(16) static std::array w173; - alignas(16) static std::array w174; - alignas(16) static std::array w175; - alignas(16) static std::array w176; - alignas(16) static std::array w177; - alignas(16) static std::array w178; - alignas(16) static std::array w179; - alignas(16) static std::array w180; - alignas(16) static std::array w181; - alignas(16) static std::array w182; - alignas(16) static std::array w183; - alignas(16) static std::array w184; - alignas(16) static std::array w185; - alignas(16) static std::array w186; - alignas(16) static std::array w187; - alignas(16) static std::array w188; - alignas(16) static std::array w189; - alignas(16) static std::array w190; - alignas(16) static std::array w191; - alignas(16) static std::array w192; - alignas(16) static std::array w193; - alignas(16) static std::array w194; - alignas(16) static std::array w195; - alignas(16) static std::array w196; - alignas(16) static std::array w197; - alignas(16) static std::array w198; - alignas(16) static std::array w199; - alignas(16) static std::array w200; - alignas(16) static std::array w201; - alignas(16) static std::array w202; - alignas(16) static std::array w203; - alignas(16) static std::array w204; 
- alignas(16) static std::array w205; - alignas(16) static std::array w206; - alignas(16) static std::array w207; - alignas(16) static std::array w208; - alignas(16) static std::array w209; - alignas(16) static std::array w210; - alignas(16) static std::array w211; - alignas(16) static std::array w212; - alignas(16) static std::array w213; - alignas(16) static std::array w214; - alignas(16) static std::array w215; - alignas(16) static std::array w216; - alignas(16) static std::array w217; - alignas(16) static std::array w218; - alignas(16) static std::array w219; - alignas(16) static std::array w220; - alignas(16) static std::array w221; - alignas(16) static std::array w222; - alignas(16) static std::array w223; - alignas(16) static std::array w224; - alignas(16) static std::array w225; - alignas(16) static std::array w226; - alignas(16) static std::array w227; - alignas(16) static std::array w228; - alignas(16) static std::array w229; - alignas(16) static std::array w230; - alignas(16) static std::array w231; - alignas(16) static std::array w232; - alignas(16) static std::array w233; - alignas(16) static std::array w234; - alignas(16) static std::array w235; - alignas(16) static std::array w236; - alignas(16) static std::array w237; - alignas(16) static std::array w238; - alignas(16) static std::array w239; - alignas(16) static std::array w240; - alignas(16) static std::array w241; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); - std::generate(v0.begin(), v0.end(), f32rng); - std::generate(v1.begin(), v1.end(), f32rng); - std::generate(v2.begin(), v2.end(), f32rng); - std::generate(v3.begin(), v3.end(), f32rng); - std::generate(v4.begin(), v4.end(), f32rng); - std::generate(v5.begin(), v5.end(), f32rng); - std::generate(v6.begin(), v6.end(), f32rng); - std::generate(v7.begin(), v7.end(), f32rng); - std::generate(v8.begin(), v8.end(), f32rng); 
- std::generate(v9.begin(), v9.end(), f32rng); - std::generate(v10.begin(), v10.end(), f32rng); - std::generate(v11.begin(), v11.end(), f32rng); - std::generate(v12.begin(), v12.end(), f32rng); - std::generate(v13.begin(), v13.end(), f32rng); - std::generate(v14.begin(), v14.end(), f32rng); - std::generate(v15.begin(), v15.end(), f32rng); - std::generate(v16.begin(), v16.end(), f32rng); - std::generate(v17.begin(), v17.end(), f32rng); - std::generate(v18.begin(), v18.end(), f32rng); - std::generate(v19.begin(), v19.end(), f32rng); - std::generate(v20.begin(), v20.end(), f32rng); - std::generate(v21.begin(), v21.end(), f32rng); - std::generate(v22.begin(), v22.end(), f32rng); - std::generate(v23.begin(), v23.end(), f32rng); - std::generate(v24.begin(), v24.end(), f32rng); - std::generate(v25.begin(), v25.end(), f32rng); - std::generate(v26.begin(), v26.end(), f32rng); - std::generate(v27.begin(), v27.end(), f32rng); - std::generate(v28.begin(), v28.end(), f32rng); - std::generate(v29.begin(), v29.end(), f32rng); - std::generate(v30.begin(), v30.end(), f32rng); - std::generate(v31.begin(), v31.end(), f32rng); - std::generate(v32.begin(), v32.end(), f32rng); - std::generate(v33.begin(), v33.end(), f32rng); - std::generate(v34.begin(), v34.end(), f32rng); - std::generate(v35.begin(), v35.end(), f32rng); - std::generate(v36.begin(), v36.end(), f32rng); - std::generate(v37.begin(), v37.end(), f32rng); - std::generate(v38.begin(), v38.end(), f32rng); - std::generate(v39.begin(), v39.end(), f32rng); - std::generate(v40.begin(), v40.end(), f32rng); - std::generate(v41.begin(), v41.end(), f32rng); - std::generate(v42.begin(), v42.end(), f32rng); - std::generate(v43.begin(), v43.end(), f32rng); - std::generate(v44.begin(), v44.end(), f32rng); - std::generate(v45.begin(), v45.end(), f32rng); - std::generate(v46.begin(), v46.end(), f32rng); - std::generate(v47.begin(), v47.end(), f32rng); - std::generate(v48.begin(), v48.end(), f32rng); - std::generate(v49.begin(), v49.end(), 
f32rng); - std::generate(v50.begin(), v50.end(), f32rng); - std::generate(v51.begin(), v51.end(), f32rng); - std::generate(v52.begin(), v52.end(), f32rng); - std::generate(v53.begin(), v53.end(), f32rng); - std::generate(v54.begin(), v54.end(), f32rng); - std::generate(v55.begin(), v55.end(), f32rng); - std::generate(v56.begin(), v56.end(), f32rng); - std::generate(v57.begin(), v57.end(), f32rng); - std::generate(v58.begin(), v58.end(), f32rng); - std::generate(v59.begin(), v59.end(), f32rng); - std::generate(v60.begin(), v60.end(), f32rng); - std::generate(v61.begin(), v61.end(), f32rng); - std::generate(v62.begin(), v62.end(), f32rng); - std::generate(v63.begin(), v63.end(), f32rng); - std::generate(v64.begin(), v64.end(), f32rng); - std::generate(v65.begin(), v65.end(), f32rng); - std::generate(v66.begin(), v66.end(), f32rng); - std::generate(v67.begin(), v67.end(), f32rng); - std::generate(v68.begin(), v68.end(), f32rng); - std::generate(v69.begin(), v69.end(), f32rng); - std::generate(v70.begin(), v70.end(), f32rng); - std::generate(v71.begin(), v71.end(), f32rng); - std::generate(v72.begin(), v72.end(), f32rng); - std::generate(v73.begin(), v73.end(), f32rng); - std::generate(v74.begin(), v74.end(), f32rng); - std::generate(v75.begin(), v75.end(), f32rng); - std::generate(v76.begin(), v76.end(), f32rng); - std::generate(v77.begin(), v77.end(), f32rng); - std::generate(v78.begin(), v78.end(), f32rng); - std::generate(v79.begin(), v79.end(), f32rng); - std::generate(v80.begin(), v80.end(), f32rng); - std::generate(v81.begin(), v81.end(), f32rng); - std::generate(v82.begin(), v82.end(), f32rng); - std::generate(v83.begin(), v83.end(), f32rng); - std::generate(v84.begin(), v84.end(), f32rng); - std::generate(v85.begin(), v85.end(), f32rng); - std::generate(v86.begin(), v86.end(), f32rng); - std::generate(v87.begin(), v87.end(), f32rng); - std::generate(v88.begin(), v88.end(), f32rng); - std::generate(v89.begin(), v89.end(), f32rng); - std::generate(v90.begin(), 
v90.end(), f32rng); - std::generate(v91.begin(), v91.end(), f32rng); - std::generate(v92.begin(), v92.end(), f32rng); - std::generate(v93.begin(), v93.end(), f32rng); - std::generate(v94.begin(), v94.end(), f32rng); - std::generate(v95.begin(), v95.end(), f32rng); - std::generate(v96.begin(), v96.end(), f32rng); - std::generate(v97.begin(), v97.end(), f32rng); - std::generate(v98.begin(), v98.end(), f32rng); - std::generate(v99.begin(), v99.end(), f32rng); - std::generate(v100.begin(), v100.end(), f32rng); - std::generate(v101.begin(), v101.end(), f32rng); - std::generate(v102.begin(), v102.end(), f32rng); - std::generate(v103.begin(), v103.end(), f32rng); - std::generate(v104.begin(), v104.end(), f32rng); - std::generate(v105.begin(), v105.end(), f32rng); - std::generate(v106.begin(), v106.end(), f32rng); - std::generate(v107.begin(), v107.end(), f32rng); - std::generate(v108.begin(), v108.end(), f32rng); - std::generate(v109.begin(), v109.end(), f32rng); - std::generate(v110.begin(), v110.end(), f32rng); - std::generate(v111.begin(), v111.end(), f32rng); - std::generate(v112.begin(), v112.end(), f32rng); - std::generate(v113.begin(), v113.end(), f32rng); - std::generate(w114.begin(), w114.end(), f32rng); - std::generate(w115.begin(), w115.end(), f32rng); - std::generate(w116.begin(), w116.end(), f32rng); - std::generate(w117.begin(), w117.end(), f32rng); - std::fill(w118.begin(), w118.end(), 0); - std::generate(w118.begin(), w118.end() - size_t(sparsity * w118.size()), f32rng); - std::shuffle(w118.begin(), w118.end(), rng); - std::generate(w119.begin(), w119.end(), f32rng); - std::fill(w120.begin(), w120.end(), 0); - std::generate(w120.begin(), w120.end() - size_t(sparsity * w120.size()), f32rng); - std::shuffle(w120.begin(), w120.end(), rng); - std::generate(w121.begin(), w121.end(), f32rng); - std::generate(w122.begin(), w122.end(), f32rng); - std::generate(w123.begin(), w123.end(), f32rng); - std::fill(w124.begin(), w124.end(), 0); - 
std::generate(w124.begin(), w124.end() - size_t(sparsity * w124.size()), f32rng); - std::shuffle(w124.begin(), w124.end(), rng); - std::generate(w125.begin(), w125.end(), f32rng); - std::fill(w126.begin(), w126.end(), 0); - std::generate(w126.begin(), w126.end() - size_t(sparsity * w126.size()), f32rng); - std::shuffle(w126.begin(), w126.end(), rng); - std::generate(w127.begin(), w127.end(), f32rng); - std::generate(w128.begin(), w128.end(), f32rng); - std::generate(w129.begin(), w129.end(), f32rng); - std::fill(w130.begin(), w130.end(), 0); - std::generate(w130.begin(), w130.end() - size_t(sparsity * w130.size()), f32rng); - std::shuffle(w130.begin(), w130.end(), rng); - std::generate(w131.begin(), w131.end(), f32rng); - std::fill(w132.begin(), w132.end(), 0); - std::generate(w132.begin(), w132.end() - size_t(sparsity * w132.size()), f32rng); - std::shuffle(w132.begin(), w132.end(), rng); - std::generate(w133.begin(), w133.end(), f32rng); - std::generate(w134.begin(), w134.end(), f32rng); - std::generate(w135.begin(), w135.end(), f32rng); - std::fill(w136.begin(), w136.end(), 0); - std::generate(w136.begin(), w136.end() - size_t(sparsity * w136.size()), f32rng); - std::shuffle(w136.begin(), w136.end(), rng); - std::generate(w137.begin(), w137.end(), f32rng); - std::fill(w138.begin(), w138.end(), 0); - std::generate(w138.begin(), w138.end() - size_t(sparsity * w138.size()), f32rng); - std::shuffle(w138.begin(), w138.end(), rng); - std::generate(w139.begin(), w139.end(), f32rng); - std::fill(w140.begin(), w140.end(), 0); - std::generate(w140.begin(), w140.end() - size_t(sparsity * w140.size()), f32rng); - std::shuffle(w140.begin(), w140.end(), rng); - std::generate(w141.begin(), w141.end(), f32rng); - std::fill(w142.begin(), w142.end(), 0); - std::generate(w142.begin(), w142.end() - size_t(sparsity * w142.size()), f32rng); - std::shuffle(w142.begin(), w142.end(), rng); - std::generate(w143.begin(), w143.end(), f32rng); - std::generate(w144.begin(), w144.end(), 
f32rng); - std::generate(w145.begin(), w145.end(), f32rng); - std::fill(w146.begin(), w146.end(), 0); - std::generate(w146.begin(), w146.end() - size_t(sparsity * w146.size()), f32rng); - std::shuffle(w146.begin(), w146.end(), rng); - std::generate(w147.begin(), w147.end(), f32rng); - std::fill(w148.begin(), w148.end(), 0); - std::generate(w148.begin(), w148.end() - size_t(sparsity * w148.size()), f32rng); - std::shuffle(w148.begin(), w148.end(), rng); - std::generate(w149.begin(), w149.end(), f32rng); - std::fill(w150.begin(), w150.end(), 0); - std::generate(w150.begin(), w150.end() - size_t(sparsity * w150.size()), f32rng); - std::shuffle(w150.begin(), w150.end(), rng); - std::generate(w151.begin(), w151.end(), f32rng); - std::fill(w152.begin(), w152.end(), 0); - std::generate(w152.begin(), w152.end() - size_t(sparsity * w152.size()), f32rng); - std::shuffle(w152.begin(), w152.end(), rng); - std::generate(w153.begin(), w153.end(), f32rng); - std::generate(w154.begin(), w154.end(), f32rng); - std::generate(w155.begin(), w155.end(), f32rng); - std::fill(w156.begin(), w156.end(), 0); - std::generate(w156.begin(), w156.end() - size_t(sparsity * w156.size()), f32rng); - std::shuffle(w156.begin(), w156.end(), rng); - std::generate(w157.begin(), w157.end(), f32rng); - std::fill(w158.begin(), w158.end(), 0); - std::generate(w158.begin(), w158.end() - size_t(sparsity * w158.size()), f32rng); - std::shuffle(w158.begin(), w158.end(), rng); - std::generate(w159.begin(), w159.end(), f32rng); - std::fill(w160.begin(), w160.end(), 0); - std::generate(w160.begin(), w160.end() - size_t(sparsity * w160.size()), f32rng); - std::shuffle(w160.begin(), w160.end(), rng); - std::generate(w161.begin(), w161.end(), f32rng); - std::fill(w162.begin(), w162.end(), 0); - std::generate(w162.begin(), w162.end() - size_t(sparsity * w162.size()), f32rng); - std::shuffle(w162.begin(), w162.end(), rng); - std::generate(w163.begin(), w163.end(), f32rng); - std::generate(w164.begin(), w164.end(), 
f32rng); - std::generate(w165.begin(), w165.end(), f32rng); - std::fill(w166.begin(), w166.end(), 0); - std::generate(w166.begin(), w166.end() - size_t(sparsity * w166.size()), f32rng); - std::shuffle(w166.begin(), w166.end(), rng); - std::generate(w167.begin(), w167.end(), f32rng); - std::fill(w168.begin(), w168.end(), 0); - std::generate(w168.begin(), w168.end() - size_t(sparsity * w168.size()), f32rng); - std::shuffle(w168.begin(), w168.end(), rng); - std::generate(w169.begin(), w169.end(), f32rng); - std::generate(w170.begin(), w170.end(), f32rng); - std::generate(w171.begin(), w171.end(), f32rng); - std::fill(w172.begin(), w172.end(), 0); - std::generate(w172.begin(), w172.end() - size_t(sparsity * w172.size()), f32rng); - std::shuffle(w172.begin(), w172.end(), rng); - std::generate(w173.begin(), w173.end(), f32rng); - std::fill(w174.begin(), w174.end(), 0); - std::generate(w174.begin(), w174.end() - size_t(sparsity * w174.size()), f32rng); - std::shuffle(w174.begin(), w174.end(), rng); - std::generate(w175.begin(), w175.end(), f32rng); - std::generate(w176.begin(), w176.end(), f32rng); - std::generate(w177.begin(), w177.end(), f32rng); - std::fill(w178.begin(), w178.end(), 0); - std::generate(w178.begin(), w178.end() - size_t(sparsity * w178.size()), f32rng); - std::shuffle(w178.begin(), w178.end(), rng); - std::generate(w179.begin(), w179.end(), f32rng); - std::fill(w180.begin(), w180.end(), 0); - std::generate(w180.begin(), w180.end() - size_t(sparsity * w180.size()), f32rng); - std::shuffle(w180.begin(), w180.end(), rng); - std::generate(w181.begin(), w181.end(), f32rng); - std::generate(w182.begin(), w182.end(), f32rng); - std::generate(w183.begin(), w183.end(), f32rng); - std::fill(w184.begin(), w184.end(), 0); - std::generate(w184.begin(), w184.end() - size_t(sparsity * w184.size()), f32rng); - std::shuffle(w184.begin(), w184.end(), rng); - std::generate(w185.begin(), w185.end(), f32rng); - std::fill(w186.begin(), w186.end(), 0); - 
std::generate(w186.begin(), w186.end() - size_t(sparsity * w186.size()), f32rng); - std::shuffle(w186.begin(), w186.end(), rng); - std::generate(w187.begin(), w187.end(), f32rng); - std::generate(w188.begin(), w188.end(), f32rng); - std::generate(w189.begin(), w189.end(), f32rng); - std::fill(w190.begin(), w190.end(), 0); - std::generate(w190.begin(), w190.end() - size_t(sparsity * w190.size()), f32rng); - std::shuffle(w190.begin(), w190.end(), rng); - std::generate(w191.begin(), w191.end(), f32rng); - std::fill(w192.begin(), w192.end(), 0); - std::generate(w192.begin(), w192.end() - size_t(sparsity * w192.size()), f32rng); - std::shuffle(w192.begin(), w192.end(), rng); - std::generate(w193.begin(), w193.end(), f32rng); - std::fill(w194.begin(), w194.end(), 0); - std::generate(w194.begin(), w194.end() - size_t(sparsity * w194.size()), f32rng); - std::shuffle(w194.begin(), w194.end(), rng); - std::generate(w195.begin(), w195.end(), f32rng); - std::fill(w196.begin(), w196.end(), 0); - std::generate(w196.begin(), w196.end() - size_t(sparsity * w196.size()), f32rng); - std::shuffle(w196.begin(), w196.end(), rng); - std::generate(w197.begin(), w197.end(), f32rng); - std::generate(w198.begin(), w198.end(), f32rng); - std::generate(w199.begin(), w199.end(), f32rng); - std::fill(w200.begin(), w200.end(), 0); - std::generate(w200.begin(), w200.end() - size_t(sparsity * w200.size()), f32rng); - std::shuffle(w200.begin(), w200.end(), rng); - std::generate(w201.begin(), w201.end(), f32rng); - std::fill(w202.begin(), w202.end(), 0); - std::generate(w202.begin(), w202.end() - size_t(sparsity * w202.size()), f32rng); - std::shuffle(w202.begin(), w202.end(), rng); - std::generate(w203.begin(), w203.end(), f32rng); - std::fill(w204.begin(), w204.end(), 0); - std::generate(w204.begin(), w204.end() - size_t(sparsity * w204.size()), f32rng); - std::shuffle(w204.begin(), w204.end(), rng); - std::generate(w205.begin(), w205.end(), f32rng); - std::fill(w206.begin(), w206.end(), 0); - 
std::generate(w206.begin(), w206.end() - size_t(sparsity * w206.size()), f32rng); - std::shuffle(w206.begin(), w206.end(), rng); - std::generate(w207.begin(), w207.end(), f32rng); - std::generate(w208.begin(), w208.end(), f32rng); - std::generate(w209.begin(), w209.end(), f32rng); - std::fill(w210.begin(), w210.end(), 0); - std::generate(w210.begin(), w210.end() - size_t(sparsity * w210.size()), f32rng); - std::shuffle(w210.begin(), w210.end(), rng); - std::generate(w211.begin(), w211.end(), f32rng); - std::fill(w212.begin(), w212.end(), 0); - std::generate(w212.begin(), w212.end() - size_t(sparsity * w212.size()), f32rng); - std::shuffle(w212.begin(), w212.end(), rng); - std::generate(w213.begin(), w213.end(), f32rng); - std::fill(w214.begin(), w214.end(), 0); - std::generate(w214.begin(), w214.end() - size_t(sparsity * w214.size()), f32rng); - std::shuffle(w214.begin(), w214.end(), rng); - std::generate(w215.begin(), w215.end(), f32rng); - std::fill(w216.begin(), w216.end(), 0); - std::generate(w216.begin(), w216.end() - size_t(sparsity * w216.size()), f32rng); - std::shuffle(w216.begin(), w216.end(), rng); - std::generate(w217.begin(), w217.end(), f32rng); - std::generate(w218.begin(), w218.end(), f32rng); - std::generate(w219.begin(), w219.end(), f32rng); - std::fill(w220.begin(), w220.end(), 0); - std::generate(w220.begin(), w220.end() - size_t(sparsity * w220.size()), f32rng); - std::shuffle(w220.begin(), w220.end(), rng); - std::generate(w221.begin(), w221.end(), f32rng); - std::fill(w222.begin(), w222.end(), 0); - std::generate(w222.begin(), w222.end() - size_t(sparsity * w222.size()), f32rng); - std::shuffle(w222.begin(), w222.end(), rng); - std::generate(w223.begin(), w223.end(), f32rng); - std::fill(w224.begin(), w224.end(), 0); - std::generate(w224.begin(), w224.end() - size_t(sparsity * w224.size()), f32rng); - std::shuffle(w224.begin(), w224.end(), rng); - std::generate(w225.begin(), w225.end(), f32rng); - std::fill(w226.begin(), w226.end(), 0); - 
std::generate(w226.begin(), w226.end() - size_t(sparsity * w226.size()), f32rng); - std::shuffle(w226.begin(), w226.end(), rng); - std::generate(w227.begin(), w227.end(), f32rng); - std::generate(w228.begin(), w228.end(), f32rng); - std::generate(w229.begin(), w229.end(), f32rng); - std::fill(w230.begin(), w230.end(), 0); - std::generate(w230.begin(), w230.end() - size_t(sparsity * w230.size()), f32rng); - std::shuffle(w230.begin(), w230.end(), rng); - std::generate(w231.begin(), w231.end(), f32rng); - std::fill(w232.begin(), w232.end(), 0); - std::generate(w232.begin(), w232.end() - size_t(sparsity * w232.size()), f32rng); - std::shuffle(w232.begin(), w232.end(), rng); - std::generate(w233.begin(), w233.end(), f32rng); - std::fill(w234.begin(), w234.end(), 0); - std::generate(w234.begin(), w234.end() - size_t(sparsity * w234.size()), f32rng); - std::shuffle(w234.begin(), w234.end(), rng); - std::generate(w235.begin(), w235.end(), f32rng); - std::fill(w236.begin(), w236.end(), 0); - std::generate(w236.begin(), w236.end() - size_t(sparsity * w236.size()), f32rng); - std::shuffle(w236.begin(), w236.end(), rng); - std::generate(w237.begin(), w237.end(), f32rng); - std::generate(w238.begin(), w238.end(), f32rng); - std::generate(w239.begin(), w239.end(), f32rng); - std::generate(w240.begin(), w240.end(), f32rng); - std::generate(w241.begin(), w241.end(), f32rng); - - Operators operators; - xnn_status status; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 3 /* input channels per group */, - 16 /* output_channels_per_group */, - 3 /* input pixel stride */, - 16 /* output pixel stride */, - w114.data(), w115.data(), - 
-std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - XNN_FLAG_INPUT_NHWC /* flags */, - nullptr, - nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 16 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 16 /* input pixel stride */, - 16 /* output pixel stride */, - w116.data(), w117.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 16 /* input channels per group */, - 16 /* output_channels_per_group */, - 16 /* input pixel stride */, - 16 /* output pixel stride */, - w118.data(), w119.data(), - -std::numeric_limits::infinity() /* 
output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 16 /* input channels per group */, - 64 /* output_channels_per_group */, - 16 /* input pixel stride */, - 64 /* output pixel stride */, - w120.data(), w121.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 64 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 64 /* input pixel stride */, - 64 /* output pixel stride */, - w122.data(), 
w123.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 24 /* output_channels_per_group */, - 64 /* input pixel stride */, - 24 /* output pixel stride */, - w124.data(), w125.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 72 /* output_channels_per_group */, - 24 /* input pixel stride */, - 72 /* output pixel stride */, - w126.data(), w127.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t 
op9 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 72 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 72 /* input pixel stride */, - 72 /* output pixel stride */, - w128.data(), w129.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 72 /* input channels per group */, - 24 /* output_channels_per_group */, - 72 /* input pixel stride */, - 24 /* output pixel stride */, - w130.data(), w131.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - 
operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 72 /* output_channels_per_group */, - 24 /* input pixel stride */, - 72 /* output pixel stride */, - w132.data(), w133.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 72 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 72 /* input pixel stride */, - 72 /* output pixel stride */, - w134.data(), w135.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return 
ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 72 /* input channels per group */, - 24 /* output_channels_per_group */, - 72 /* input pixel stride */, - 24 /* output pixel stride */, - w136.data(), w137.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 72 /* output_channels_per_group */, - 24 /* input pixel stride */, - 72 /* output pixel stride */, - w138.data(), w139.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << 
std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 72 /* input channels per group */, - 40 /* output_channels_per_group */, - 72 /* input pixel stride */, - 40 /* output pixel stride */, - w140.data(), w141.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 120 /* output_channels_per_group */, - 40 /* input pixel stride */, - 120 /* output pixel stride */, - w142.data(), w143.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* 
subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 120 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 120 /* input pixel stride */, - 120 /* output pixel stride */, - w144.data(), w145.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 32 /* output_channels_per_group */, - 120 /* input pixel stride */, - 32 /* output pixel stride */, - w146.data(), w147.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel 
width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 120 /* output_channels_per_group */, - 32 /* input pixel stride */, - 120 /* output pixel stride */, - w148.data(), w149.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 40 /* output_channels_per_group */, - 120 /* input pixel stride */, - 40 /* output pixel stride */, - w150.data(), w151.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max 
*/, - 0 /* flags */, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 120 /* output_channels_per_group */, - 40 /* input pixel stride */, - 120 /* output pixel stride */, - w152.data(), w153.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 120 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 120 /* input pixel stride */, - 120 /* output pixel stride */, - w154.data(), w155.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - xnn_operator_t op29 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), 
std::numeric_limits::infinity(), - 0 /* flags */, - &op29); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #29" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op29, xnn_delete_operator); - - xnn_operator_t op30 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 32 /* output_channels_per_group */, - 120 /* input pixel stride */, - 32 /* output pixel stride */, - w156.data(), w157.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op30); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #30" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op30, xnn_delete_operator); - - xnn_operator_t op31 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 120 /* output_channels_per_group */, - 32 /* input pixel stride */, - 120 /* output pixel stride */, - w158.data(), w159.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op31); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #31" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op31, xnn_delete_operator); - - xnn_operator_t op32 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, 
std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op32); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #32" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op32, xnn_delete_operator); - - xnn_operator_t op33 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 40 /* output_channels_per_group */, - 120 /* input pixel stride */, - 40 /* output pixel stride */, - w160.data(), w161.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op33); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #33" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op33, xnn_delete_operator); - - xnn_operator_t op34 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op34); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #34" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op34, xnn_delete_operator); - - xnn_operator_t op35 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 240 /* output_channels_per_group */, - 40 /* input pixel stride */, - 240 /* output pixel stride */, - w162.data(), 
w163.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op35); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #35" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op35, xnn_delete_operator); - - xnn_operator_t op36 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op36); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #36" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op36, xnn_delete_operator); - - xnn_operator_t op37 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 240 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 240 /* input pixel stride */, - 240 /* output pixel stride */, - w164.data(), w165.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op37); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #37" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op37, xnn_delete_operator); - - xnn_operator_t op38 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op38); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #38" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op38, xnn_delete_operator); - - xnn_operator_t op39 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* 
subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 240 /* input channels per group */, - 80 /* output_channels_per_group */, - 240 /* input pixel stride */, - 80 /* output pixel stride */, - w166.data(), w167.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op39); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #39" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op39, xnn_delete_operator); - - xnn_operator_t op40 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 80 /* input channels per group */, - 200 /* output_channels_per_group */, - 80 /* input pixel stride */, - 200 /* output pixel stride */, - w168.data(), w169.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op40); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #40" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op40, xnn_delete_operator); - - xnn_operator_t op41 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op41); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #41" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op41, xnn_delete_operator); - - xnn_operator_t op42 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling 
height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 200 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 200 /* input pixel stride */, - 200 /* output pixel stride */, - w170.data(), w171.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op42); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #42" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op42, xnn_delete_operator); - - xnn_operator_t op43 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op43); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #43" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op43, xnn_delete_operator); - - xnn_operator_t op44 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 200 /* input channels per group */, - 80 /* output_channels_per_group */, - 200 /* input pixel stride */, - 80 /* output pixel stride */, - w172.data(), w173.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op44); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #44" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op44, xnn_delete_operator); - - xnn_operator_t op45 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op45); - if (status != xnn_status_success) { - std::cerr << 
"failed to create operation #45" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op45, xnn_delete_operator); - - xnn_operator_t op46 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 80 /* input channels per group */, - 184 /* output_channels_per_group */, - 80 /* input pixel stride */, - 184 /* output pixel stride */, - w174.data(), w175.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op46); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #46" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op46, xnn_delete_operator); - - xnn_operator_t op47 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op47); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #47" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op47, xnn_delete_operator); - - xnn_operator_t op48 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 184 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 184 /* input pixel stride */, - 184 /* output pixel stride */, - w176.data(), w177.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op48); - if (status != xnn_status_success) { - std::cerr << "failed to 
create operation #48" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op48, xnn_delete_operator); - - xnn_operator_t op49 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op49); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #49" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op49, xnn_delete_operator); - - xnn_operator_t op50 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 184 /* input channels per group */, - 80 /* output_channels_per_group */, - 184 /* input pixel stride */, - 80 /* output pixel stride */, - w178.data(), w179.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op50); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #50" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op50, xnn_delete_operator); - - xnn_operator_t op51 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op51); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #51" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op51, xnn_delete_operator); - - xnn_operator_t op52 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 80 /* input 
channels per group */, - 184 /* output_channels_per_group */, - 80 /* input pixel stride */, - 184 /* output pixel stride */, - w180.data(), w181.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op52); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #52" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op52, xnn_delete_operator); - - xnn_operator_t op53 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op53); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #53" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op53, xnn_delete_operator); - - xnn_operator_t op54 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 184 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 184 /* input pixel stride */, - 184 /* output pixel stride */, - w182.data(), w183.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op54); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #54" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op54, xnn_delete_operator); - - xnn_operator_t op55 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op55); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #55" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op55, xnn_delete_operator); - - xnn_operator_t op56 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* 
top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 184 /* input channels per group */, - 80 /* output_channels_per_group */, - 184 /* input pixel stride */, - 80 /* output pixel stride */, - w184.data(), w185.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op56); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #56" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op56, xnn_delete_operator); - - xnn_operator_t op57 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op57); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #57" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op57, xnn_delete_operator); - - xnn_operator_t op58 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 80 /* input channels per group */, - 480 /* output_channels_per_group */, - 80 /* input pixel stride */, - 480 /* output pixel stride */, - w186.data(), w187.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op58); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #58" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op58, xnn_delete_operator); - - 
xnn_operator_t op59 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op59); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #59" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op59, xnn_delete_operator); - - xnn_operator_t op60 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 480 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 480 /* input pixel stride */, - 480 /* output pixel stride */, - w188.data(), w189.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op60); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #60" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op60, xnn_delete_operator); - - xnn_operator_t op61 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op61); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #61" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op61, xnn_delete_operator); - - xnn_operator_t op62 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op62); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #62" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op62, xnn_delete_operator); - - xnn_operator_t op63 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel 
width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 480 /* input channels per group */, - 120 /* output_channels_per_group */, - 480 /* input pixel stride */, - 120 /* output pixel stride */, - w190.data(), w191.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op63); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #63" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op63, xnn_delete_operator); - - xnn_operator_t op64 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 480 /* output_channels_per_group */, - 120 /* input pixel stride */, - 480 /* output pixel stride */, - w192.data(), w193.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op64); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #64" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op64, xnn_delete_operator); - - xnn_operator_t op65 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op65); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #65" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op65, xnn_delete_operator); - - xnn_operator_t op66 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel 
height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 480 /* input channels per group */, - 112 /* output_channels_per_group */, - 480 /* input pixel stride */, - 112 /* output pixel stride */, - w194.data(), w195.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op66); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #66" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op66, xnn_delete_operator); - - xnn_operator_t op67 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 112 /* input channels per group */, - 672 /* output_channels_per_group */, - 112 /* input pixel stride */, - 672 /* output pixel stride */, - w196.data(), w197.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op67); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #67" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op67, xnn_delete_operator); - - xnn_operator_t op68 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op68); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #68" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op68, xnn_delete_operator); - - xnn_operator_t op69 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height 
*/, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 672 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 672 /* input pixel stride */, - 672 /* output pixel stride */, - w198.data(), w199.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op69); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #69" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op69, xnn_delete_operator); - - xnn_operator_t op70 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op70); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #70" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op70, xnn_delete_operator); - - xnn_operator_t op71 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op71); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #71" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op71, xnn_delete_operator); - - xnn_operator_t op72 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 672 /* input channels per group */, - 168 /* output_channels_per_group */, - 672 /* input pixel stride */, - 168 /* output pixel stride */, - w200.data(), w201.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op72); - if (status != xnn_status_success) { - std::cerr 
<< "failed to create operation #72" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op72, xnn_delete_operator); - - xnn_operator_t op73 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 168 /* input channels per group */, - 672 /* output_channels_per_group */, - 168 /* input pixel stride */, - 672 /* output pixel stride */, - w202.data(), w203.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op73); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #73" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op73, xnn_delete_operator); - - xnn_operator_t op74 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op74); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #74" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op74, xnn_delete_operator); - - xnn_operator_t op75 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 672 /* input channels per group */, - 112 /* output_channels_per_group */, - 672 /* input pixel stride */, - 112 /* output pixel stride */, - w204.data(), w205.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op75); - 
if (status != xnn_status_success) { - std::cerr << "failed to create operation #75" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op75, xnn_delete_operator); - - xnn_operator_t op76 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op76); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #76" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op76, xnn_delete_operator); - - xnn_operator_t op77 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 112 /* input channels per group */, - 672 /* output_channels_per_group */, - 112 /* input pixel stride */, - 672 /* output pixel stride */, - w206.data(), w207.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op77); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #77" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op77, xnn_delete_operator); - - xnn_operator_t op78 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op78); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #78" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op78, xnn_delete_operator); - - xnn_operator_t op79 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* 
dilation_height */, 1 /* dilation_width */, - 672 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 672 /* input pixel stride */, - 672 /* output pixel stride */, - w208.data(), w209.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op79); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #79" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op79, xnn_delete_operator); - - xnn_operator_t op80 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op80); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #80" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op80, xnn_delete_operator); - - xnn_operator_t op81 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op81); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #81" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op81, xnn_delete_operator); - - xnn_operator_t op82 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 672 /* input channels per group */, - 168 /* output_channels_per_group */, - 672 /* input pixel stride */, - 168 /* output pixel stride */, - w210.data(), w211.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op82); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #82" << std::endl; - return ExecutionPlan(); - } - 
operators.emplace_back(op82, xnn_delete_operator); - - xnn_operator_t op83 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 168 /* input channels per group */, - 672 /* output_channels_per_group */, - 168 /* input pixel stride */, - 672 /* output pixel stride */, - w212.data(), w213.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op83); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #83" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op83, xnn_delete_operator); - - xnn_operator_t op84 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op84); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #84" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op84, xnn_delete_operator); - - xnn_operator_t op85 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 672 /* input channels per group */, - 160 /* output_channels_per_group */, - 672 /* input pixel stride */, - 160 /* output pixel stride */, - w214.data(), w215.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op85); - if (status != xnn_status_success) { - std::cerr << "failed to create operation 
#85" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op85, xnn_delete_operator); - - xnn_operator_t op86 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - w216.data(), w217.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op86); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #86" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op86, xnn_delete_operator); - - xnn_operator_t op87 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op87); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #87" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op87, xnn_delete_operator); - - xnn_operator_t op88 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, - 960 /* output pixel stride */, - w218.data(), w219.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op88); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #88" << 
std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op88, xnn_delete_operator); - - xnn_operator_t op89 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op89); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #89" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op89, xnn_delete_operator); - - xnn_operator_t op90 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op90); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #90" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op90, xnn_delete_operator); - - xnn_operator_t op91 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 240 /* output_channels_per_group */, - 960 /* input pixel stride */, - 240 /* output pixel stride */, - w220.data(), w221.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op91); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #91" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op91, xnn_delete_operator); - - xnn_operator_t op92 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 240 /* input channels per group */, - 960 /* output_channels_per_group */, - 
240 /* input pixel stride */, - 960 /* output pixel stride */, - w222.data(), w223.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op92); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #92" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op92, xnn_delete_operator); - - xnn_operator_t op93 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op93); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #93" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op93, xnn_delete_operator); - - xnn_operator_t op94 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 160 /* output_channels_per_group */, - 960 /* input pixel stride */, - 160 /* output pixel stride */, - w224.data(), w225.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op94); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #94" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op94, xnn_delete_operator); - - xnn_operator_t op95 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op95); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #95" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op95, 
xnn_delete_operator); - - xnn_operator_t op96 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - w226.data(), w227.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op96); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #96" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op96, xnn_delete_operator); - - xnn_operator_t op97 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op97); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #97" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op97, xnn_delete_operator); - - xnn_operator_t op98 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, - 960 /* output pixel stride */, - w228.data(), w229.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op98); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #98" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op98, 
xnn_delete_operator); - - xnn_operator_t op99 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op99); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #99" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op99, xnn_delete_operator); - - xnn_operator_t op100 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op100); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #100" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op100, xnn_delete_operator); - - xnn_operator_t op101 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 240 /* output_channels_per_group */, - 960 /* input pixel stride */, - 240 /* output pixel stride */, - w230.data(), w231.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op101); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #101" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op101, xnn_delete_operator); - - xnn_operator_t op102 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 240 /* input channels per group */, - 960 /* output_channels_per_group */, - 240 /* input pixel stride */, - 960 /* output pixel stride */, 
- w232.data(), w233.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op102); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #102" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op102, xnn_delete_operator); - - xnn_operator_t op103 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op103); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #103" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op103, xnn_delete_operator); - - xnn_operator_t op104 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 160 /* output_channels_per_group */, - 960 /* input pixel stride */, - 160 /* output pixel stride */, - w234.data(), w235.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op104); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #104" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op104, xnn_delete_operator); - - xnn_operator_t op105 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op105); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #105" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op105, xnn_delete_operator); - - xnn_operator_t op106 = nullptr; - status 
= xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - w236.data(), w237.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op106); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #106" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op106, xnn_delete_operator); - - xnn_operator_t op107 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op107); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #107" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op107, xnn_delete_operator); - - xnn_operator_t op108 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op108); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #108" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op108, xnn_delete_operator); - - xnn_operator_t op109 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 1280 /* output_channels_per_group */, - 960 /* input pixel stride */, - 1280 /* output pixel stride */, - w238.data(), w239.data(), - 
-std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op109); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #109" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op109, xnn_delete_operator); - - xnn_operator_t op110 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op110); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #110" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op110, xnn_delete_operator); - - xnn_operator_t op111 = nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op111); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #111" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op111, xnn_delete_operator); - - xnn_operator_t op112 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 1280 /* input channels per group */, - 1001 /* output_channels_per_group */, - 1280 /* input pixel stride */, - 1001 /* output pixel stride */, - w240.data(), w241.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op112); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #112" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op112, xnn_delete_operator); - - status = xnn_reshape_convolution2d_nchw_f16( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - 
/*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op1, - /*batch_size=*/12544, - 16 /* channels */, - 16 /* input stride */, - 16 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op3, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 16, 112, 112 }; - const size_t b_shape[] = { 1, 16, 112, 112 }; - status = xnn_reshape_add_nd_f16( - op4, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op5, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op6, - /*batch_size=*/1, /*input_height=*/112, 
/*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op8, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op9, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op10, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 24, 56, 56 }; - const size_t b_shape[] = { 1, 24, 56, 56 }; - status = xnn_reshape_add_nd_f16( - op11, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op12, - 
/*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op13, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op14, - /*batch_size=*/1, 784 /* width */, - 72 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op15, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op16, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 72, 28, 28 }; - const size_t b_shape[] = { 1, 72, 1, 1 }; - status = xnn_reshape_multiply_nd_f16( - op17, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op18, - 
/*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op19, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op20, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op21, - /*batch_size=*/1, 784 /* width */, - 120 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op22, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op23, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 120, 28, 28 }; - const size_t 
b_shape[] = { 1, 120, 1, 1 }; - status = xnn_reshape_multiply_nd_f16( - op24, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op25, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 40, 28, 28 }; - const size_t b_shape[] = { 1, 40, 28, 28 }; - status = xnn_reshape_add_nd_f16( - op26, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op27, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op28, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op29, - /*batch_size=*/1, 784 /* width */, - 120 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op30, - /*batch_size=*/1, 
/*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #30" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op31, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #31" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 120, 28, 28 }; - const size_t b_shape[] = { 1, 120, 1, 1 }; - status = xnn_reshape_multiply_nd_f16( - op32, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #32" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op33, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #33" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 40, 28, 28 }; - const size_t b_shape[] = { 1, 40, 28, 28 }; - status = xnn_reshape_add_nd_f16( - op34, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op35, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( 
- op36, - /*batch_size=*/784, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op37, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op38, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op39, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op40, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op41, - /*batch_size=*/196, - 200 /* channels */, - 200 /* input stride */, - 200 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #41" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op42, - /*batch_size=*/1, 
/*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op43, - /*batch_size=*/196, - 200 /* channels */, - 200 /* input stride */, - 200 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op44, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #44" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 80, 14, 14 }; - const size_t b_shape[] = { 1, 80, 14, 14 }; - status = xnn_reshape_add_nd_f16( - op45, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #45" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op46, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op47, - /*batch_size=*/196, - 184 /* channels */, - 184 /* input stride */, - 184 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op48, - /*batch_size=*/1, /*input_height=*/14, 
/*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op49, - /*batch_size=*/196, - 184 /* channels */, - 184 /* input stride */, - 184 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #49" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op50, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #50" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 80, 14, 14 }; - const size_t b_shape[] = { 1, 80, 14, 14 }; - status = xnn_reshape_add_nd_f16( - op51, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op52, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op53, - /*batch_size=*/196, - 184 /* channels */, - 184 /* input stride */, - 184 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op54, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - 
/*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op55, - /*batch_size=*/196, - 480 /* channels */, - 480 /* input stride */, - 480 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op56, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #56" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 80, 14, 14 }; - const size_t b_shape[] = { 1, 80, 14, 14 }; - status = xnn_reshape_add_nd_f16( - op57, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op58, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op59, - /*batch_size=*/196, - 480 /* channels */, - 480 /* input stride */, - 480 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #59" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op60, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - 
/*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op61, - /*batch_size=*/196, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #61" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op62, - /*batch_size=*/1, 196 /* width */, - 480 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #62" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op63, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #63" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op64, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #64" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 480, 14, 14 }; - const size_t b_shape[] = { 1, 480, 1, 1 }; - status = xnn_reshape_multiply_nd_f16( - op65, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #65" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op66, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, 
/*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #66" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op67, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #67" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op68, - /*batch_size=*/196, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #68" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op69, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #69" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op70, - /*batch_size=*/196, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #70" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op71, - /*batch_size=*/1, 196 /* width */, - 672 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #71" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op72, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status 
!= xnn_status_success) { - std::cerr << "failed to reshape operation #72" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op73, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #73" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 672, 14, 14 }; - const size_t b_shape[] = { 1, 672, 1, 1 }; - status = xnn_reshape_multiply_nd_f16( - op74, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #74" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op75, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #75" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 112, 14, 14 }; - const size_t b_shape[] = { 1, 112, 14, 14 }; - status = xnn_reshape_add_nd_f16( - op76, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #76" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op77, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #77" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op78, - /*batch_size=*/196, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - /*threadpool=*/threadpool); - if 
(status != xnn_status_success) { - std::cerr << "failed to reshape operation #78" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op79, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #79" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op80, - /*batch_size=*/49, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #80" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op81, - /*batch_size=*/1, 49 /* width */, - 672 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #81" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op82, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #82" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op83, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #83" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 672, 7, 7 }; - const size_t b_shape[] = { 1, 672, 1, 1 }; - status = xnn_reshape_multiply_nd_f16( - op84, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << 
"failed to reshape operation #84" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op85, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #85" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op86, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #86" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op87, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #87" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op88, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #88" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op89, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #89" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op90, - /*batch_size=*/1, 49 /* width */, - 960 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #90" << std::endl; - return ExecutionPlan(); - } - 
- status = xnn_reshape_convolution2d_nchw_f16( - op91, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #91" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op92, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #92" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 960, 7, 7 }; - const size_t b_shape[] = { 1, 960, 1, 1 }; - status = xnn_reshape_multiply_nd_f16( - op93, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #93" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op94, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #94" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 160, 7, 7 }; - const size_t b_shape[] = { 1, 160, 7, 7 }; - status = xnn_reshape_add_nd_f16( - op95, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #95" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op96, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #96" << std::endl; - 
return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op97, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #97" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op98, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #98" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op99, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #99" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op100, - /*batch_size=*/1, 49 /* width */, - 960 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #100" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op101, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #101" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op102, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #102" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 960, 7, 
7 }; - const size_t b_shape[] = { 1, 960, 1, 1 }; - status = xnn_reshape_multiply_nd_f16( - op103, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #103" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op104, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #104" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 160, 7, 7 }; - const size_t b_shape[] = { 1, 160, 7, 7 }; - status = xnn_reshape_add_nd_f16( - op105, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #105" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op106, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #106" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op107, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #107" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op108, - /*batch_size=*/1, 49 /* width */, - 960 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #108" << std::endl; - return ExecutionPlan(); - } - - size_t op109_workspace_size = 0; - size_t op109_workspace_alignment = 0; - status = 
xnn_reshape_convolution2d_nhwc_f16( - op109, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op109_workspace_size, &op109_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op109_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #109" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op110, - /*batch_size=*/1, - 1280 /* channels */, - 1280 /* input stride */, - 1280 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #110" << std::endl; - return ExecutionPlan(); - } - - size_t op111_workspace_size = 0; - size_t op111_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op111, - /*batch_size=*/1, 1 /* width */, - 1280 /* channels */, 1280 /* input stride */, 1280 /* output stride */, - &op111_workspace_size, &op111_workspace_alignment, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #111" << std::endl; - return ExecutionPlan(); - } - - size_t op112_workspace_size = 0; - size_t op112_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op112, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op112_workspace_size, &op112_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op112_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #112" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nchw_f16( - op0, - /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup 
operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op1, - /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op2, - /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op3, - /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op4, - v4.data() /* a */, v2.data() /* b */, /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op5, - /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op6, - /*input=*/v6.data(), /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op7, - /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op8, - /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op9, - /*input=*/v9.data(), /*output=*/v10.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op10, - /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op11, - v11.data() /* a */, v8.data() /* b */, /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op12, - /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op13, - /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f16( - op14, - /*input=*/v14.data(), /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op15, - /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op16, - /*input=*/v16.data(), /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op17, - v14.data() /* a */, v17.data() /* b */, /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - 
status = xnn_setup_convolution2d_nchw_f16( - op18, - /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op19, - /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op20, - /*input=*/v20.data(), /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f16( - op21, - /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op22, - /*input=*/v22.data(), /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op23, - /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op24, - v21.data() /* a */, v24.data() /* b */, /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op25, - /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op26, - v26.data() /* a */, v19.data() /* b */, /*output=*/v27.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op27, - /*input=*/v27.data(), /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op28, - /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f16( - op29, - /*input=*/v29.data(), /*output=*/v30.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op30, - /*input=*/v30.data(), /*output=*/v31.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #30" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op31, - /*input=*/v31.data(), /*output=*/v32.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op32, - v29.data() /* a */, v32.data() /* b */, /*output=*/v33.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #32" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op33, - /*input=*/v33.data(), /*output=*/v34.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #33" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op34, - v34.data() /* a */, v27.data() /* b */, /*output=*/v35.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #34" << std::endl; - return ExecutionPlan(); - } - - 
status = xnn_setup_convolution2d_nchw_f16( - op35, - /*input=*/v35.data(), /*output=*/v36.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op36, - /*input=*/v36.data(), /*output=*/v37.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op37, - /*input=*/v37.data(), /*output=*/v38.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op38, - /*input=*/v38.data(), /*output=*/v39.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op39, - /*input=*/v39.data(), /*output=*/v40.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op40, - /*input=*/v40.data(), /*output=*/v41.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op41, - /*input=*/v41.data(), /*output=*/v42.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #41" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op42, - /*input=*/v42.data(), /*output=*/v43.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op43, - /*input=*/v43.data(), /*output=*/v44.data()); - if (status != xnn_status_success) { - std::cerr << "failed to 
setup operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op44, - /*input=*/v44.data(), /*output=*/v45.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op45, - v45.data() /* a */, v40.data() /* b */, /*output=*/v46.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #45" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op46, - /*input=*/v46.data(), /*output=*/v47.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op47, - /*input=*/v47.data(), /*output=*/v48.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op48, - /*input=*/v48.data(), /*output=*/v49.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op49, - /*input=*/v49.data(), /*output=*/v50.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #49" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op50, - /*input=*/v50.data(), /*output=*/v51.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #50" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op51, - v51.data() /* a */, v46.data() /* b */, /*output=*/v52.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op52, - /*input=*/v52.data(), 
/*output=*/v53.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op53, - /*input=*/v53.data(), /*output=*/v54.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op54, - /*input=*/v54.data(), /*output=*/v55.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op55, - /*input=*/v55.data(), /*output=*/v56.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op56, - /*input=*/v56.data(), /*output=*/v57.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #56" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op57, - v57.data() /* a */, v52.data() /* b */, /*output=*/v58.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op58, - /*input=*/v58.data(), /*output=*/v59.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op59, - /*input=*/v59.data(), /*output=*/v60.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #59" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op60, - /*input=*/v60.data(), /*output=*/v61.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #60" << std::endl; - return ExecutionPlan(); - } - 
- status = xnn_setup_hardswish_nc_f16( - op61, - /*input=*/v61.data(), /*output=*/v62.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #61" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f16( - op62, - /*input=*/v62.data(), /*output=*/v63.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #62" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op63, - /*input=*/v63.data(), /*output=*/v64.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #63" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op64, - /*input=*/v64.data(), /*output=*/v65.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #64" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op65, - v62.data() /* a */, v65.data() /* b */, /*output=*/v66.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #65" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op66, - /*input=*/v66.data(), /*output=*/v67.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #66" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op67, - /*input=*/v67.data(), /*output=*/v68.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #67" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op68, - /*input=*/v68.data(), /*output=*/v69.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #68" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op69, - /*input=*/v69.data(), /*output=*/v70.data()); - if (status != xnn_status_success) 
{ - std::cerr << "failed to setup operation #69" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op70, - /*input=*/v70.data(), /*output=*/v71.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #70" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f16( - op71, - /*input=*/v71.data(), /*output=*/v72.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #71" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op72, - /*input=*/v72.data(), /*output=*/v73.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #72" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op73, - /*input=*/v73.data(), /*output=*/v74.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #73" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op74, - v71.data() /* a */, v74.data() /* b */, /*output=*/v75.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #74" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op75, - /*input=*/v75.data(), /*output=*/v76.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #75" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op76, - v76.data() /* a */, v67.data() /* b */, /*output=*/v77.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #76" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op77, - /*input=*/v77.data(), /*output=*/v78.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #77" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_hardswish_nc_f16( - op78, - /*input=*/v78.data(), /*output=*/v79.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #78" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op79, - /*input=*/v79.data(), /*output=*/v80.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #79" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op80, - /*input=*/v80.data(), /*output=*/v81.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #80" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f16( - op81, - /*input=*/v81.data(), /*output=*/v82.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #81" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op82, - /*input=*/v82.data(), /*output=*/v83.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #82" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op83, - /*input=*/v83.data(), /*output=*/v84.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #83" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op84, - v81.data() /* a */, v84.data() /* b */, /*output=*/v85.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #84" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op85, - /*input=*/v85.data(), /*output=*/v86.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #85" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op86, - /*input=*/v86.data(), /*output=*/v87.data()); - if (status != xnn_status_success) { - 
std::cerr << "failed to setup operation #86" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op87, - /*input=*/v87.data(), /*output=*/v88.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #87" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op88, - /*input=*/v88.data(), /*output=*/v89.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #88" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op89, - /*input=*/v89.data(), /*output=*/v90.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #89" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f16( - op90, - /*input=*/v90.data(), /*output=*/v91.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #90" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op91, - /*input=*/v91.data(), /*output=*/v92.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #91" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op92, - /*input=*/v92.data(), /*output=*/v93.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #92" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op93, - v90.data() /* a */, v93.data() /* b */, /*output=*/v94.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #93" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op94, - /*input=*/v94.data(), /*output=*/v95.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #94" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op95, - 
v95.data() /* a */, v86.data() /* b */, /*output=*/v96.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #95" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op96, - /*input=*/v96.data(), /*output=*/v97.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #96" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op97, - /*input=*/v97.data(), /*output=*/v98.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #97" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op98, - /*input=*/v98.data(), /*output=*/v99.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #98" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op99, - /*input=*/v99.data(), /*output=*/v100.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #99" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f16( - op100, - /*input=*/v100.data(), /*output=*/v101.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #100" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op101, - /*input=*/v101.data(), /*output=*/v102.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #101" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op102, - /*input=*/v102.data(), /*output=*/v103.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #102" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op103, - v100.data() /* a */, v103.data() /* b */, /*output=*/v104.data()); - if (status != xnn_status_success) { - std::cerr << 
"failed to setup operation #103" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op104, - /*input=*/v104.data(), /*output=*/v105.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #104" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op105, - v105.data() /* a */, v96.data() /* b */, /*output=*/v106.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #105" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op106, - /*input=*/v106.data(), /*output=*/v107.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #106" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op107, - /*input=*/v107.data(), /*output=*/v108.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #107" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f16( - op108, - /*input=*/v108.data(), /*output=*/v109.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #108" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op109, - workspace.data(), - /*input=*/v109.data(), /*output=*/v110.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #109" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op110, - /*input=*/v110.data(), /*output=*/v111.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #110" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op111, - workspace.data(), - /*input=*/v111.data(), /*output=*/v112.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #111" << std::endl; - return 
ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op112, - workspace.data(), - /*input=*/v112.data(), /*output=*/v113.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #112" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -} // namespace models diff --git a/models/fp16-sparse-mobilenet-v3-small.cc b/models/fp16-sparse-mobilenet-v3-small.cc deleted file mode 100644 index 8ca0f8baf18..00000000000 --- a/models/fp16-sparse-mobilenet-v3-small.cc +++ /dev/null @@ -1,4189 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan FP16SparseMobileNetV3Small(float sparsity, pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - 
alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static std::array v30; - alignas(16) static std::array v31; - alignas(16) static std::array v32; - alignas(16) static std::array v33; - alignas(16) static std::array v34; - alignas(16) static std::array v35; - alignas(16) static std::array v36; - alignas(16) static std::array v37; - alignas(16) static std::array v38; - alignas(16) static std::array v39; - alignas(16) static std::array v40; - alignas(16) static std::array v41; - alignas(16) static std::array v42; - alignas(16) static std::array v43; - alignas(16) static std::array v44; - alignas(16) static std::array v45; - alignas(16) static std::array v46; - alignas(16) static std::array v47; - alignas(16) static std::array v48; - alignas(16) static std::array v49; - alignas(16) static std::array v50; - alignas(16) static std::array v51; - alignas(16) static std::array v52; - alignas(16) static std::array v53; - alignas(16) static std::array v54; - alignas(16) static std::array v55; - alignas(16) static std::array v56; - alignas(16) static std::array v57; - alignas(16) static std::array v58; - alignas(16) static std::array v59; - alignas(16) static std::array v60; - alignas(16) static std::array v61; - alignas(16) static std::array v62; - alignas(16) static std::array v63; - alignas(16) static std::array v64; - alignas(16) static std::array v65; - alignas(16) static std::array v66; - alignas(16) static std::array v67; - alignas(16) static std::array v68; - alignas(16) static std::array v69; - alignas(16) static std::array v70; - alignas(16) static std::array v71; - alignas(16) static std::array v72; - alignas(16) static std::array v73; - alignas(16) static std::array v74; - 
alignas(16) static std::array v75; - alignas(16) static std::array v76; - alignas(16) static std::array v77; - alignas(16) static std::array v78; - alignas(16) static std::array v79; - alignas(16) static std::array v80; - alignas(16) static std::array v81; - alignas(16) static std::array v82; - alignas(16) static std::array v83; - alignas(16) static std::array v84; - alignas(16) static std::array v85; - alignas(16) static std::array v86; - alignas(16) static std::array v87; - alignas(16) static std::array v88; - alignas(16) static std::array v89; - alignas(16) static std::array v90; - alignas(16) static std::array v91; - alignas(16) static std::array v92; - alignas(16) static std::array v93; - alignas(16) static std::array v94; - alignas(16) static std::array v95; - alignas(16) static std::array v96; - alignas(16) static std::array v97; - alignas(16) static std::array v98; - alignas(16) static std::array v99; - alignas(16) static std::array w100; - alignas(16) static std::array w101; - alignas(16) static std::array w102; - alignas(16) static std::array w103; - alignas(16) static std::array w104; - alignas(16) static std::array w105; - alignas(16) static std::array w106; - alignas(16) static std::array w107; - alignas(16) static std::array w108; - alignas(16) static std::array w109; - alignas(16) static std::array w110; - alignas(16) static std::array w111; - alignas(16) static std::array w112; - alignas(16) static std::array w113; - alignas(16) static std::array w114; - alignas(16) static std::array w115; - alignas(16) static std::array w116; - alignas(16) static std::array w117; - alignas(16) static std::array w118; - alignas(16) static std::array w119; - alignas(16) static std::array w120; - alignas(16) static std::array w121; - alignas(16) static std::array w122; - alignas(16) static std::array w123; - alignas(16) static std::array w124; - alignas(16) static std::array w125; - alignas(16) static std::array w126; - alignas(16) static std::array w127; - 
alignas(16) static std::array w128; - alignas(16) static std::array w129; - alignas(16) static std::array w130; - alignas(16) static std::array w131; - alignas(16) static std::array w132; - alignas(16) static std::array w133; - alignas(16) static std::array w134; - alignas(16) static std::array w135; - alignas(16) static std::array w136; - alignas(16) static std::array w137; - alignas(16) static std::array w138; - alignas(16) static std::array w139; - alignas(16) static std::array w140; - alignas(16) static std::array w141; - alignas(16) static std::array w142; - alignas(16) static std::array w143; - alignas(16) static std::array w144; - alignas(16) static std::array w145; - alignas(16) static std::array w146; - alignas(16) static std::array w147; - alignas(16) static std::array w148; - alignas(16) static std::array w149; - alignas(16) static std::array w150; - alignas(16) static std::array w151; - alignas(16) static std::array w152; - alignas(16) static std::array w153; - alignas(16) static std::array w154; - alignas(16) static std::array w155; - alignas(16) static std::array w156; - alignas(16) static std::array w157; - alignas(16) static std::array w158; - alignas(16) static std::array w159; - alignas(16) static std::array w160; - alignas(16) static std::array w161; - alignas(16) static std::array w162; - alignas(16) static std::array w163; - alignas(16) static std::array w164; - alignas(16) static std::array w165; - alignas(16) static std::array w166; - alignas(16) static std::array w167; - alignas(16) static std::array w168; - alignas(16) static std::array w169; - alignas(16) static std::array w170; - alignas(16) static std::array w171; - alignas(16) static std::array w172; - alignas(16) static std::array w173; - alignas(16) static std::array w174; - alignas(16) static std::array w175; - alignas(16) static std::array w176; - alignas(16) static std::array w177; - alignas(16) static std::array w178; - alignas(16) static std::array w179; - alignas(16) static 
std::array w180; - alignas(16) static std::array w181; - alignas(16) static std::array w182; - alignas(16) static std::array w183; - alignas(16) static std::array w184; - alignas(16) static std::array w185; - alignas(16) static std::array w186; - alignas(16) static std::array w187; - alignas(16) static std::array w188; - alignas(16) static std::array w189; - alignas(16) static std::array w190; - alignas(16) static std::array w191; - alignas(16) static std::array w192; - alignas(16) static std::array w193; - alignas(16) static std::array w194; - alignas(16) static std::array w195; - alignas(16) static std::array w196; - alignas(16) static std::array w197; - alignas(16) static std::array w198; - alignas(16) static std::array w199; - alignas(16) static std::array w200; - alignas(16) static std::array w201; - alignas(16) static std::array w202; - alignas(16) static std::array w203; - alignas(16) static std::array w204; - alignas(16) static std::array w205; - alignas(16) static std::array w206; - alignas(16) static std::array w207; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); - std::generate(v0.begin(), v0.end(), f32rng); - std::generate(v1.begin(), v1.end(), f32rng); - std::generate(v2.begin(), v2.end(), f32rng); - std::generate(v3.begin(), v3.end(), f32rng); - std::generate(v4.begin(), v4.end(), f32rng); - std::generate(v5.begin(), v5.end(), f32rng); - std::generate(v6.begin(), v6.end(), f32rng); - std::generate(v7.begin(), v7.end(), f32rng); - std::generate(v8.begin(), v8.end(), f32rng); - std::generate(v9.begin(), v9.end(), f32rng); - std::generate(v10.begin(), v10.end(), f32rng); - std::generate(v11.begin(), v11.end(), f32rng); - std::generate(v12.begin(), v12.end(), f32rng); - std::generate(v13.begin(), v13.end(), f32rng); - std::generate(v14.begin(), v14.end(), f32rng); - std::generate(v15.begin(), v15.end(), f32rng); - 
std::generate(v16.begin(), v16.end(), f32rng); - std::generate(v17.begin(), v17.end(), f32rng); - std::generate(v18.begin(), v18.end(), f32rng); - std::generate(v19.begin(), v19.end(), f32rng); - std::generate(v20.begin(), v20.end(), f32rng); - std::generate(v21.begin(), v21.end(), f32rng); - std::generate(v22.begin(), v22.end(), f32rng); - std::generate(v23.begin(), v23.end(), f32rng); - std::generate(v24.begin(), v24.end(), f32rng); - std::generate(v25.begin(), v25.end(), f32rng); - std::generate(v26.begin(), v26.end(), f32rng); - std::generate(v27.begin(), v27.end(), f32rng); - std::generate(v28.begin(), v28.end(), f32rng); - std::generate(v29.begin(), v29.end(), f32rng); - std::generate(v30.begin(), v30.end(), f32rng); - std::generate(v31.begin(), v31.end(), f32rng); - std::generate(v32.begin(), v32.end(), f32rng); - std::generate(v33.begin(), v33.end(), f32rng); - std::generate(v34.begin(), v34.end(), f32rng); - std::generate(v35.begin(), v35.end(), f32rng); - std::generate(v36.begin(), v36.end(), f32rng); - std::generate(v37.begin(), v37.end(), f32rng); - std::generate(v38.begin(), v38.end(), f32rng); - std::generate(v39.begin(), v39.end(), f32rng); - std::generate(v40.begin(), v40.end(), f32rng); - std::generate(v41.begin(), v41.end(), f32rng); - std::generate(v42.begin(), v42.end(), f32rng); - std::generate(v43.begin(), v43.end(), f32rng); - std::generate(v44.begin(), v44.end(), f32rng); - std::generate(v45.begin(), v45.end(), f32rng); - std::generate(v46.begin(), v46.end(), f32rng); - std::generate(v47.begin(), v47.end(), f32rng); - std::generate(v48.begin(), v48.end(), f32rng); - std::generate(v49.begin(), v49.end(), f32rng); - std::generate(v50.begin(), v50.end(), f32rng); - std::generate(v51.begin(), v51.end(), f32rng); - std::generate(v52.begin(), v52.end(), f32rng); - std::generate(v53.begin(), v53.end(), f32rng); - std::generate(v54.begin(), v54.end(), f32rng); - std::generate(v55.begin(), v55.end(), f32rng); - std::generate(v56.begin(), v56.end(), 
f32rng); - std::generate(v57.begin(), v57.end(), f32rng); - std::generate(v58.begin(), v58.end(), f32rng); - std::generate(v59.begin(), v59.end(), f32rng); - std::generate(v60.begin(), v60.end(), f32rng); - std::generate(v61.begin(), v61.end(), f32rng); - std::generate(v62.begin(), v62.end(), f32rng); - std::generate(v63.begin(), v63.end(), f32rng); - std::generate(v64.begin(), v64.end(), f32rng); - std::generate(v65.begin(), v65.end(), f32rng); - std::generate(v66.begin(), v66.end(), f32rng); - std::generate(v67.begin(), v67.end(), f32rng); - std::generate(v68.begin(), v68.end(), f32rng); - std::generate(v69.begin(), v69.end(), f32rng); - std::generate(v70.begin(), v70.end(), f32rng); - std::generate(v71.begin(), v71.end(), f32rng); - std::generate(v72.begin(), v72.end(), f32rng); - std::generate(v73.begin(), v73.end(), f32rng); - std::generate(v74.begin(), v74.end(), f32rng); - std::generate(v75.begin(), v75.end(), f32rng); - std::generate(v76.begin(), v76.end(), f32rng); - std::generate(v77.begin(), v77.end(), f32rng); - std::generate(v78.begin(), v78.end(), f32rng); - std::generate(v79.begin(), v79.end(), f32rng); - std::generate(v80.begin(), v80.end(), f32rng); - std::generate(v81.begin(), v81.end(), f32rng); - std::generate(v82.begin(), v82.end(), f32rng); - std::generate(v83.begin(), v83.end(), f32rng); - std::generate(v84.begin(), v84.end(), f32rng); - std::generate(v85.begin(), v85.end(), f32rng); - std::generate(v86.begin(), v86.end(), f32rng); - std::generate(v87.begin(), v87.end(), f32rng); - std::generate(v88.begin(), v88.end(), f32rng); - std::generate(v89.begin(), v89.end(), f32rng); - std::generate(v90.begin(), v90.end(), f32rng); - std::generate(v91.begin(), v91.end(), f32rng); - std::generate(v92.begin(), v92.end(), f32rng); - std::generate(v93.begin(), v93.end(), f32rng); - std::generate(v94.begin(), v94.end(), f32rng); - std::generate(v95.begin(), v95.end(), f32rng); - std::generate(v96.begin(), v96.end(), f32rng); - std::generate(v97.begin(), 
v97.end(), f32rng); - std::generate(v98.begin(), v98.end(), f32rng); - std::generate(v99.begin(), v99.end(), f32rng); - std::generate(w100.begin(), w100.end(), f32rng); - std::generate(w101.begin(), w101.end(), f32rng); - std::generate(w102.begin(), w102.end(), f32rng); - std::generate(w103.begin(), w103.end(), f32rng); - std::fill(w104.begin(), w104.end(), 0); - std::generate(w104.begin(), w104.end() - size_t(sparsity * w104.size()), f32rng); - std::shuffle(w104.begin(), w104.end(), rng); - std::generate(w105.begin(), w105.end(), f32rng); - std::fill(w106.begin(), w106.end(), 0); - std::generate(w106.begin(), w106.end() - size_t(sparsity * w106.size()), f32rng); - std::shuffle(w106.begin(), w106.end(), rng); - std::generate(w107.begin(), w107.end(), f32rng); - std::fill(w108.begin(), w108.end(), 0); - std::generate(w108.begin(), w108.end() - size_t(sparsity * w108.size()), f32rng); - std::shuffle(w108.begin(), w108.end(), rng); - std::generate(w109.begin(), w109.end(), f32rng); - std::fill(w110.begin(), w110.end(), 0); - std::generate(w110.begin(), w110.end() - size_t(sparsity * w110.size()), f32rng); - std::shuffle(w110.begin(), w110.end(), rng); - std::generate(w111.begin(), w111.end(), f32rng); - std::generate(w112.begin(), w112.end(), f32rng); - std::generate(w113.begin(), w113.end(), f32rng); - std::fill(w114.begin(), w114.end(), 0); - std::generate(w114.begin(), w114.end() - size_t(sparsity * w114.size()), f32rng); - std::shuffle(w114.begin(), w114.end(), rng); - std::generate(w115.begin(), w115.end(), f32rng); - std::fill(w116.begin(), w116.end(), 0); - std::generate(w116.begin(), w116.end() - size_t(sparsity * w116.size()), f32rng); - std::shuffle(w116.begin(), w116.end(), rng); - std::generate(w117.begin(), w117.end(), f32rng); - std::generate(w118.begin(), w118.end(), f32rng); - std::generate(w119.begin(), w119.end(), f32rng); - std::fill(w120.begin(), w120.end(), 0); - std::generate(w120.begin(), w120.end() - size_t(sparsity * w120.size()), f32rng); - 
std::shuffle(w120.begin(), w120.end(), rng); - std::generate(w121.begin(), w121.end(), f32rng); - std::fill(w122.begin(), w122.end(), 0); - std::generate(w122.begin(), w122.end() - size_t(sparsity * w122.size()), f32rng); - std::shuffle(w122.begin(), w122.end(), rng); - std::generate(w123.begin(), w123.end(), f32rng); - std::generate(w124.begin(), w124.end(), f32rng); - std::generate(w125.begin(), w125.end(), f32rng); - std::fill(w126.begin(), w126.end(), 0); - std::generate(w126.begin(), w126.end() - size_t(sparsity * w126.size()), f32rng); - std::shuffle(w126.begin(), w126.end(), rng); - std::generate(w127.begin(), w127.end(), f32rng); - std::fill(w128.begin(), w128.end(), 0); - std::generate(w128.begin(), w128.end() - size_t(sparsity * w128.size()), f32rng); - std::shuffle(w128.begin(), w128.end(), rng); - std::generate(w129.begin(), w129.end(), f32rng); - std::fill(w130.begin(), w130.end(), 0); - std::generate(w130.begin(), w130.end() - size_t(sparsity * w130.size()), f32rng); - std::shuffle(w130.begin(), w130.end(), rng); - std::generate(w131.begin(), w131.end(), f32rng); - std::fill(w132.begin(), w132.end(), 0); - std::generate(w132.begin(), w132.end() - size_t(sparsity * w132.size()), f32rng); - std::shuffle(w132.begin(), w132.end(), rng); - std::generate(w133.begin(), w133.end(), f32rng); - std::generate(w134.begin(), w134.end(), f32rng); - std::generate(w135.begin(), w135.end(), f32rng); - std::fill(w136.begin(), w136.end(), 0); - std::generate(w136.begin(), w136.end() - size_t(sparsity * w136.size()), f32rng); - std::shuffle(w136.begin(), w136.end(), rng); - std::generate(w137.begin(), w137.end(), f32rng); - std::fill(w138.begin(), w138.end(), 0); - std::generate(w138.begin(), w138.end() - size_t(sparsity * w138.size()), f32rng); - std::shuffle(w138.begin(), w138.end(), rng); - std::generate(w139.begin(), w139.end(), f32rng); - std::fill(w140.begin(), w140.end(), 0); - std::generate(w140.begin(), w140.end() - size_t(sparsity * w140.size()), f32rng); - 
std::shuffle(w140.begin(), w140.end(), rng); - std::generate(w141.begin(), w141.end(), f32rng); - std::fill(w142.begin(), w142.end(), 0); - std::generate(w142.begin(), w142.end() - size_t(sparsity * w142.size()), f32rng); - std::shuffle(w142.begin(), w142.end(), rng); - std::generate(w143.begin(), w143.end(), f32rng); - std::generate(w144.begin(), w144.end(), f32rng); - std::generate(w145.begin(), w145.end(), f32rng); - std::fill(w146.begin(), w146.end(), 0); - std::generate(w146.begin(), w146.end() - size_t(sparsity * w146.size()), f32rng); - std::shuffle(w146.begin(), w146.end(), rng); - std::generate(w147.begin(), w147.end(), f32rng); - std::fill(w148.begin(), w148.end(), 0); - std::generate(w148.begin(), w148.end() - size_t(sparsity * w148.size()), f32rng); - std::shuffle(w148.begin(), w148.end(), rng); - std::generate(w149.begin(), w149.end(), f32rng); - std::fill(w150.begin(), w150.end(), 0); - std::generate(w150.begin(), w150.end() - size_t(sparsity * w150.size()), f32rng); - std::shuffle(w150.begin(), w150.end(), rng); - std::generate(w151.begin(), w151.end(), f32rng); - std::fill(w152.begin(), w152.end(), 0); - std::generate(w152.begin(), w152.end() - size_t(sparsity * w152.size()), f32rng); - std::shuffle(w152.begin(), w152.end(), rng); - std::generate(w153.begin(), w153.end(), f32rng); - std::generate(w154.begin(), w154.end(), f32rng); - std::generate(w155.begin(), w155.end(), f32rng); - std::fill(w156.begin(), w156.end(), 0); - std::generate(w156.begin(), w156.end() - size_t(sparsity * w156.size()), f32rng); - std::shuffle(w156.begin(), w156.end(), rng); - std::generate(w157.begin(), w157.end(), f32rng); - std::fill(w158.begin(), w158.end(), 0); - std::generate(w158.begin(), w158.end() - size_t(sparsity * w158.size()), f32rng); - std::shuffle(w158.begin(), w158.end(), rng); - std::generate(w159.begin(), w159.end(), f32rng); - std::fill(w160.begin(), w160.end(), 0); - std::generate(w160.begin(), w160.end() - size_t(sparsity * w160.size()), f32rng); - 
std::shuffle(w160.begin(), w160.end(), rng); - std::generate(w161.begin(), w161.end(), f32rng); - std::fill(w162.begin(), w162.end(), 0); - std::generate(w162.begin(), w162.end() - size_t(sparsity * w162.size()), f32rng); - std::shuffle(w162.begin(), w162.end(), rng); - std::generate(w163.begin(), w163.end(), f32rng); - std::generate(w164.begin(), w164.end(), f32rng); - std::generate(w165.begin(), w165.end(), f32rng); - std::fill(w166.begin(), w166.end(), 0); - std::generate(w166.begin(), w166.end() - size_t(sparsity * w166.size()), f32rng); - std::shuffle(w166.begin(), w166.end(), rng); - std::generate(w167.begin(), w167.end(), f32rng); - std::fill(w168.begin(), w168.end(), 0); - std::generate(w168.begin(), w168.end() - size_t(sparsity * w168.size()), f32rng); - std::shuffle(w168.begin(), w168.end(), rng); - std::generate(w169.begin(), w169.end(), f32rng); - std::fill(w170.begin(), w170.end(), 0); - std::generate(w170.begin(), w170.end() - size_t(sparsity * w170.size()), f32rng); - std::shuffle(w170.begin(), w170.end(), rng); - std::generate(w171.begin(), w171.end(), f32rng); - std::fill(w172.begin(), w172.end(), 0); - std::generate(w172.begin(), w172.end() - size_t(sparsity * w172.size()), f32rng); - std::shuffle(w172.begin(), w172.end(), rng); - std::generate(w173.begin(), w173.end(), f32rng); - std::generate(w174.begin(), w174.end(), f32rng); - std::generate(w175.begin(), w175.end(), f32rng); - std::fill(w176.begin(), w176.end(), 0); - std::generate(w176.begin(), w176.end() - size_t(sparsity * w176.size()), f32rng); - std::shuffle(w176.begin(), w176.end(), rng); - std::generate(w177.begin(), w177.end(), f32rng); - std::fill(w178.begin(), w178.end(), 0); - std::generate(w178.begin(), w178.end() - size_t(sparsity * w178.size()), f32rng); - std::shuffle(w178.begin(), w178.end(), rng); - std::generate(w179.begin(), w179.end(), f32rng); - std::fill(w180.begin(), w180.end(), 0); - std::generate(w180.begin(), w180.end() - size_t(sparsity * w180.size()), f32rng); - 
std::shuffle(w180.begin(), w180.end(), rng); - std::generate(w181.begin(), w181.end(), f32rng); - std::fill(w182.begin(), w182.end(), 0); - std::generate(w182.begin(), w182.end() - size_t(sparsity * w182.size()), f32rng); - std::shuffle(w182.begin(), w182.end(), rng); - std::generate(w183.begin(), w183.end(), f32rng); - std::generate(w184.begin(), w184.end(), f32rng); - std::generate(w185.begin(), w185.end(), f32rng); - std::fill(w186.begin(), w186.end(), 0); - std::generate(w186.begin(), w186.end() - size_t(sparsity * w186.size()), f32rng); - std::shuffle(w186.begin(), w186.end(), rng); - std::generate(w187.begin(), w187.end(), f32rng); - std::fill(w188.begin(), w188.end(), 0); - std::generate(w188.begin(), w188.end() - size_t(sparsity * w188.size()), f32rng); - std::shuffle(w188.begin(), w188.end(), rng); - std::generate(w189.begin(), w189.end(), f32rng); - std::fill(w190.begin(), w190.end(), 0); - std::generate(w190.begin(), w190.end() - size_t(sparsity * w190.size()), f32rng); - std::shuffle(w190.begin(), w190.end(), rng); - std::generate(w191.begin(), w191.end(), f32rng); - std::fill(w192.begin(), w192.end(), 0); - std::generate(w192.begin(), w192.end() - size_t(sparsity * w192.size()), f32rng); - std::shuffle(w192.begin(), w192.end(), rng); - std::generate(w193.begin(), w193.end(), f32rng); - std::generate(w194.begin(), w194.end(), f32rng); - std::generate(w195.begin(), w195.end(), f32rng); - std::fill(w196.begin(), w196.end(), 0); - std::generate(w196.begin(), w196.end() - size_t(sparsity * w196.size()), f32rng); - std::shuffle(w196.begin(), w196.end(), rng); - std::generate(w197.begin(), w197.end(), f32rng); - std::fill(w198.begin(), w198.end(), 0); - std::generate(w198.begin(), w198.end() - size_t(sparsity * w198.size()), f32rng); - std::shuffle(w198.begin(), w198.end(), rng); - std::generate(w199.begin(), w199.end(), f32rng); - std::fill(w200.begin(), w200.end(), 0); - std::generate(w200.begin(), w200.end() - size_t(sparsity * w200.size()), f32rng); - 
std::shuffle(w200.begin(), w200.end(), rng); - std::generate(w201.begin(), w201.end(), f32rng); - std::fill(w202.begin(), w202.end(), 0); - std::generate(w202.begin(), w202.end() - size_t(sparsity * w202.size()), f32rng); - std::shuffle(w202.begin(), w202.end(), rng); - std::generate(w203.begin(), w203.end(), f32rng); - std::fill(w204.begin(), w204.end(), 0); - std::generate(w204.begin(), w204.end() - size_t(sparsity * w204.size()), f32rng); - std::shuffle(w204.begin(), w204.end(), rng); - std::generate(w205.begin(), w205.end(), f32rng); - std::generate(w206.begin(), w206.end(), f32rng); - std::generate(w207.begin(), w207.end(), f32rng); - - Operators operators; - xnn_status status; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 3 /* input channels per group */, - 16 /* output_channels_per_group */, - 3 /* input pixel stride */, - 16 /* output pixel stride */, - w100.data(), w101.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - XNN_FLAG_INPUT_NHWC /* flags */, - nullptr, - nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* 
bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 16 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 16 /* input pixel stride */, - 16 /* output pixel stride */, - w102.data(), w103.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 16 /* input channels per group */, - 8 /* output_channels_per_group */, - 16 /* input pixel stride */, - 8 /* output pixel stride */, - w104.data(), w105.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* 
bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 8 /* input channels per group */, - 16 /* output_channels_per_group */, - 8 /* input pixel stride */, - 16 /* output pixel stride */, - w106.data(), w107.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 16 /* input channels per group */, - 16 /* output_channels_per_group */, - 16 /* input pixel stride */, - 16 /* output pixel stride */, - w108.data(), w109.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 
/* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 16 /* input channels per group */, - 72 /* output_channels_per_group */, - 16 /* input pixel stride */, - 72 /* output pixel stride */, - w110.data(), w111.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 72 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 72 /* input pixel stride */, - 72 /* output pixel stride */, - w112.data(), w113.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 72 /* input channels per group */, - 24 /* output_channels_per_group */, - 72 /* input pixel stride */, - 24 /* output 
pixel stride */, - w114.data(), w115.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 88 /* output_channels_per_group */, - 24 /* input pixel stride */, - 88 /* output pixel stride */, - w116.data(), w117.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 88 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 88 /* input pixel stride */, - 88 /* output pixel stride */, - w118.data(), w119.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, 
xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 88 /* input channels per group */, - 24 /* output_channels_per_group */, - 88 /* input pixel stride */, - 24 /* output pixel stride */, - w120.data(), w121.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 96 /* output_channels_per_group */, - 24 /* input pixel stride */, - 96 /* output pixel stride */, - w122.data(), w123.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation 
#15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 96 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 96 /* input pixel stride */, - 96 /* output pixel stride */, - w124.data(), w125.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* 
right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 24 /* output_channels_per_group */, - 96 /* input pixel stride */, - 24 /* output pixel stride */, - w126.data(), w127.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 96 /* output_channels_per_group */, - 24 /* input pixel stride */, - 96 /* output pixel stride */, - w128.data(), w129.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top 
padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 40 /* output_channels_per_group */, - 96 /* input pixel stride */, - 40 /* output pixel stride */, - w130.data(), w131.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 240 /* output_channels_per_group */, - 40 /* input pixel stride */, - 240 /* output pixel stride */, - w132.data(), w133.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 2 /* top padding */, 2 
/* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 240 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 240 /* input pixel stride */, - 240 /* output pixel stride */, - w134.data(), w135.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - xnn_operator_t op29 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 240 /* input channels per group */, - 64 /* output_channels_per_group */, - 240 /* input pixel stride */, - 64 /* output pixel stride */, - w136.data(), w137.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* 
flags */, - nullptr, - nullptr, - &op29); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #29" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op29, xnn_delete_operator); - - xnn_operator_t op30 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 240 /* output_channels_per_group */, - 64 /* input pixel stride */, - 240 /* output pixel stride */, - w138.data(), w139.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op30); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #30" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op30, xnn_delete_operator); - - xnn_operator_t op31 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op31); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #31" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op31, xnn_delete_operator); - - xnn_operator_t op32 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 240 /* input channels per group */, - 40 /* output_channels_per_group */, - 240 /* input pixel stride */, - 40 /* output pixel stride */, - w140.data(), w141.data(), - -std::numeric_limits::infinity() /* output min */, 
std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op32); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #32" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op32, xnn_delete_operator); - - xnn_operator_t op33 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op33); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #33" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op33, xnn_delete_operator); - - xnn_operator_t op34 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 240 /* output_channels_per_group */, - 40 /* input pixel stride */, - 240 /* output pixel stride */, - w142.data(), w143.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op34); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #34" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op34, xnn_delete_operator); - - xnn_operator_t op35 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op35); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #35" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op35, xnn_delete_operator); - - xnn_operator_t op36 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 
5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 240 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 240 /* input pixel stride */, - 240 /* output pixel stride */, - w144.data(), w145.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op36); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #36" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op36, xnn_delete_operator); - - xnn_operator_t op37 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op37); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #37" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op37, xnn_delete_operator); - - xnn_operator_t op38 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op38); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #38" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op38, xnn_delete_operator); - - xnn_operator_t op39 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 240 /* input channels per group */, - 64 /* output_channels_per_group */, - 240 /* input pixel stride */, - 64 /* output pixel stride */, - w146.data(), w147.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op39); - if (status != xnn_status_success) { - std::cerr << 
"failed to create operation #39" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op39, xnn_delete_operator); - - xnn_operator_t op40 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 240 /* output_channels_per_group */, - 64 /* input pixel stride */, - 240 /* output pixel stride */, - w148.data(), w149.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op40); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #40" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op40, xnn_delete_operator); - - xnn_operator_t op41 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op41); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #41" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op41, xnn_delete_operator); - - xnn_operator_t op42 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 240 /* input channels per group */, - 40 /* output_channels_per_group */, - 240 /* input pixel stride */, - 40 /* output pixel stride */, - w150.data(), w151.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op42); - if 
(status != xnn_status_success) { - std::cerr << "failed to create operation #42" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op42, xnn_delete_operator); - - xnn_operator_t op43 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op43); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #43" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op43, xnn_delete_operator); - - xnn_operator_t op44 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 120 /* output_channels_per_group */, - 40 /* input pixel stride */, - 120 /* output pixel stride */, - w152.data(), w153.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op44); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #44" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op44, xnn_delete_operator); - - xnn_operator_t op45 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op45); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #45" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op45, xnn_delete_operator); - - xnn_operator_t op46 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height 
*/, 1 /* dilation_width */, - 120 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 120 /* input pixel stride */, - 120 /* output pixel stride */, - w154.data(), w155.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op46); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #46" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op46, xnn_delete_operator); - - xnn_operator_t op47 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op47); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #47" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op47, xnn_delete_operator); - - xnn_operator_t op48 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op48); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #48" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op48, xnn_delete_operator); - - xnn_operator_t op49 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 32 /* output_channels_per_group */, - 120 /* input pixel stride */, - 32 /* output pixel stride */, - w156.data(), w157.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op49); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #49" << std::endl; - return ExecutionPlan(); - } - 
operators.emplace_back(op49, xnn_delete_operator); - - xnn_operator_t op50 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 120 /* output_channels_per_group */, - 32 /* input pixel stride */, - 120 /* output pixel stride */, - w158.data(), w159.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op50); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #50" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op50, xnn_delete_operator); - - xnn_operator_t op51 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op51); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #51" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op51, xnn_delete_operator); - - xnn_operator_t op52 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 48 /* output_channels_per_group */, - 120 /* input pixel stride */, - 48 /* output pixel stride */, - w160.data(), w161.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op52); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #52" 
<< std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op52, xnn_delete_operator); - - xnn_operator_t op53 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 48 /* input channels per group */, - 144 /* output_channels_per_group */, - 48 /* input pixel stride */, - 144 /* output pixel stride */, - w162.data(), w163.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op53); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #53" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op53, xnn_delete_operator); - - xnn_operator_t op54 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op54); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #54" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op54, xnn_delete_operator); - - xnn_operator_t op55 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 144 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 144 /* input pixel stride */, - 144 /* output pixel stride */, - w164.data(), w165.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op55); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #55" << 
std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op55, xnn_delete_operator); - - xnn_operator_t op56 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op56); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #56" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op56, xnn_delete_operator); - - xnn_operator_t op57 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op57); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #57" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op57, xnn_delete_operator); - - xnn_operator_t op58 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 40 /* output_channels_per_group */, - 144 /* input pixel stride */, - 40 /* output pixel stride */, - w166.data(), w167.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op58); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #58" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op58, xnn_delete_operator); - - xnn_operator_t op59 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 144 /* output_channels_per_group */, - 40 
/* input pixel stride */, - 144 /* output pixel stride */, - w168.data(), w169.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op59); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #59" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op59, xnn_delete_operator); - - xnn_operator_t op60 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op60); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #60" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op60, xnn_delete_operator); - - xnn_operator_t op61 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 48 /* output_channels_per_group */, - 144 /* input pixel stride */, - 48 /* output pixel stride */, - w170.data(), w171.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op61); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #61" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op61, xnn_delete_operator); - - xnn_operator_t op62 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op62); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #62" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op62, xnn_delete_operator); - 
- xnn_operator_t op63 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 48 /* input channels per group */, - 288 /* output_channels_per_group */, - 48 /* input pixel stride */, - 288 /* output pixel stride */, - w172.data(), w173.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op63); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #63" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op63, xnn_delete_operator); - - xnn_operator_t op64 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op64); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #64" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op64, xnn_delete_operator); - - xnn_operator_t op65 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 288 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 288 /* input pixel stride */, - 288 /* output pixel stride */, - w174.data(), w175.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op65); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #65" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op65, xnn_delete_operator); - - 
xnn_operator_t op66 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op66); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #66" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op66, xnn_delete_operator); - - xnn_operator_t op67 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op67); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #67" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op67, xnn_delete_operator); - - xnn_operator_t op68 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 288 /* input channels per group */, - 72 /* output_channels_per_group */, - 288 /* input pixel stride */, - 72 /* output pixel stride */, - w176.data(), w177.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op68); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #68" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op68, xnn_delete_operator); - - xnn_operator_t op69 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 72 /* input channels per group */, - 288 /* output_channels_per_group */, - 72 /* input pixel stride */, - 288 /* output pixel stride */, - w178.data(), w179.data(), - 0.0f /* 
output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op69); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #69" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op69, xnn_delete_operator); - - xnn_operator_t op70 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op70); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #70" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op70, xnn_delete_operator); - - xnn_operator_t op71 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 288 /* input channels per group */, - 96 /* output_channels_per_group */, - 288 /* input pixel stride */, - 96 /* output pixel stride */, - w180.data(), w181.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op71); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #71" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op71, xnn_delete_operator); - - xnn_operator_t op72 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel 
stride */, - w182.data(), w183.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op72); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #72" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op72, xnn_delete_operator); - - xnn_operator_t op73 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op73); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #73" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op73, xnn_delete_operator); - - xnn_operator_t op74 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - w184.data(), w185.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op74); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #74" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op74, xnn_delete_operator); - - xnn_operator_t op75 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op75); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #75" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op75, xnn_delete_operator); - - xnn_operator_t op76 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op76); - if (status != 
xnn_status_success) { - std::cerr << "failed to create operation #76" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op76, xnn_delete_operator); - - xnn_operator_t op77 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 144 /* output_channels_per_group */, - 576 /* input pixel stride */, - 144 /* output pixel stride */, - w186.data(), w187.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op77); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #77" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op77, xnn_delete_operator); - - xnn_operator_t op78 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 576 /* output_channels_per_group */, - 144 /* input pixel stride */, - 576 /* output pixel stride */, - w188.data(), w189.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op78); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #78" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op78, xnn_delete_operator); - - xnn_operator_t op79 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - 
&op79); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #79" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op79, xnn_delete_operator); - - xnn_operator_t op80 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 96 /* output_channels_per_group */, - 576 /* input pixel stride */, - 96 /* output pixel stride */, - w190.data(), w191.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op80); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #80" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op80, xnn_delete_operator); - - xnn_operator_t op81 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op81); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #81" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op81, xnn_delete_operator); - - xnn_operator_t op82 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - w192.data(), w193.data(), - -std::numeric_limits::infinity() /* output min */, 
std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op82); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #82" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op82, xnn_delete_operator); - - xnn_operator_t op83 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op83); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #83" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op83, xnn_delete_operator); - - xnn_operator_t op84 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - w194.data(), w195.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op84); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #84" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op84, xnn_delete_operator); - - xnn_operator_t op85 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op85); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #85" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op85, xnn_delete_operator); - - xnn_operator_t op86 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op86); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #86" << std::endl; - return 
ExecutionPlan(); - } - operators.emplace_back(op86, xnn_delete_operator); - - xnn_operator_t op87 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 144 /* output_channels_per_group */, - 576 /* input pixel stride */, - 144 /* output pixel stride */, - w196.data(), w197.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op87); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #87" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op87, xnn_delete_operator); - - xnn_operator_t op88 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 576 /* output_channels_per_group */, - 144 /* input pixel stride */, - 576 /* output pixel stride */, - w198.data(), w199.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op88); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #88" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op88, xnn_delete_operator); - - xnn_operator_t op89 = nullptr; - status = xnn_create_multiply_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op89); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #89" << 
std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op89, xnn_delete_operator); - - xnn_operator_t op90 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 96 /* output_channels_per_group */, - 576 /* input pixel stride */, - 96 /* output pixel stride */, - w200.data(), w201.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op90); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #90" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op90, xnn_delete_operator); - - xnn_operator_t op91 = nullptr; - status = xnn_create_add_nd_f16( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op91); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #91" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op91, xnn_delete_operator); - - xnn_operator_t op92 = nullptr; - status = xnn_create_convolution2d_nchw_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - w202.data(), w203.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op92); - if 
(status != xnn_status_success) { - std::cerr << "failed to create operation #92" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op92, xnn_delete_operator); - - xnn_operator_t op93 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op93); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #93" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op93, xnn_delete_operator); - - xnn_operator_t op94 = nullptr; - status = xnn_create_global_average_pooling_ncw_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op94); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #94" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op94, xnn_delete_operator); - - xnn_operator_t op95 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 1024 /* output_channels_per_group */, - 576 /* input pixel stride */, - 1024 /* output pixel stride */, - w204.data(), w205.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op95); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #95" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op95, xnn_delete_operator); - - xnn_operator_t op96 = nullptr; - status = xnn_create_hardswish_nc_f16( - 0 /* flags */, - &op96); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #96" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op96, xnn_delete_operator); - - xnn_operator_t op97 = 
nullptr; - status = xnn_create_global_average_pooling_nwc_f16( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op97); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #97" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op97, xnn_delete_operator); - - xnn_operator_t op98 = nullptr; - status = xnn_create_convolution2d_nhwc_f16( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 1024 /* input channels per group */, - 1001 /* output_channels_per_group */, - 1024 /* input pixel stride */, - 1001 /* output pixel stride */, - w206.data(), w207.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op98); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #98" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op98, xnn_delete_operator); - - status = xnn_reshape_convolution2d_nchw_f16( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op1, - /*batch_size=*/12544, - 16 /* channels */, - 16 /* input stride */, - 16 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, 
/*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op3, - /*batch_size=*/1, 3136 /* width */, - 16 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op4, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op5, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 16, 56, 56 }; - const size_t b_shape[] = { 1, 16, 1, 1 }; - status = xnn_reshape_multiply_nd_f16( - op6, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op8, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, 
/*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op9, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op10, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op11, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op12, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op13, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 24, 28, 28 }; - const size_t b_shape[] = { 1, 24, 28, 28 }; - 
status = xnn_reshape_add_nd_f16( - op14, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op15, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op16, - /*batch_size=*/784, - 96 /* channels */, - 96 /* input stride */, - 96 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op17, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op18, - /*batch_size=*/196, - 96 /* channels */, - 96 /* input stride */, - 96 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op19, - /*batch_size=*/1, 196 /* width */, - 96 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op20, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - 
/*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op21, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 96, 14, 14 }; - const size_t b_shape[] = { 1, 96, 1, 1 }; - status = xnn_reshape_multiply_nd_f16( - op22, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op23, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op24, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op25, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op26, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, 
/*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op27, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op28, - /*batch_size=*/1, 196 /* width */, - 240 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op29, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op30, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #30" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 240, 14, 14 }; - const size_t b_shape[] = { 1, 240, 1, 1 }; - status = xnn_reshape_multiply_nd_f16( - op31, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op32, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - 
/*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #32" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 40, 14, 14 }; - const size_t b_shape[] = { 1, 40, 14, 14 }; - status = xnn_reshape_add_nd_f16( - op33, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #33" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op34, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op35, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op36, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op37, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op38, - /*batch_size=*/1, 196 /* width */, - 240 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << 
"failed to reshape operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op39, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op40, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #40" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 240, 14, 14 }; - const size_t b_shape[] = { 1, 240, 1, 1 }; - status = xnn_reshape_multiply_nd_f16( - op41, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #41" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op42, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #42" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 40, 14, 14 }; - const size_t b_shape[] = { 1, 40, 14, 14 }; - status = xnn_reshape_add_nd_f16( - op43, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op44, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op45, - /*batch_size=*/196, - 120 /* channels */, - 120 /* input stride */, - 120 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #45" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op46, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op47, - /*batch_size=*/196, - 120 /* channels */, - 120 /* input stride */, - 120 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op48, - /*batch_size=*/1, 196 /* width */, - 120 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op49, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #49" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op50, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #50" << 
std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 120, 14, 14 }; - const size_t b_shape[] = { 1, 120, 1, 1 }; - status = xnn_reshape_multiply_nd_f16( - op51, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op52, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op53, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op54, - /*batch_size=*/196, - 144 /* channels */, - 144 /* input stride */, - 144 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op55, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op56, - /*batch_size=*/196, - 144 /* channels */, - 144 /* input stride */, - 144 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #56" << std::endl; - 
return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op57, - /*batch_size=*/1, 196 /* width */, - 144 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op58, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op59, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #59" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 144, 14, 14 }; - const size_t b_shape[] = { 1, 144, 1, 1 }; - status = xnn_reshape_multiply_nd_f16( - op60, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op61, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #61" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 48, 14, 14 }; - const size_t b_shape[] = { 1, 48, 14, 14 }; - status = xnn_reshape_add_nd_f16( - op62, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #62" << std::endl; - return ExecutionPlan(); 
- } - - status = xnn_reshape_convolution2d_nchw_f16( - op63, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #63" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op64, - /*batch_size=*/196, - 288 /* channels */, - 288 /* input stride */, - 288 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #64" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op65, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #65" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op66, - /*batch_size=*/49, - 288 /* channels */, - 288 /* input stride */, - 288 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #66" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op67, - /*batch_size=*/1, 49 /* width */, - 288 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #67" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op68, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #68" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op69, - 
/*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #69" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 288, 7, 7 }; - const size_t b_shape[] = { 1, 288, 1, 1 }; - status = xnn_reshape_multiply_nd_f16( - op70, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #70" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op71, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #71" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op72, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #72" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op73, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #73" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op74, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #74" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op75, - 
/*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #75" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op76, - /*batch_size=*/1, 49 /* width */, - 576 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #76" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op77, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #77" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op78, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #78" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 576, 7, 7 }; - const size_t b_shape[] = { 1, 576, 1, 1 }; - status = xnn_reshape_multiply_nd_f16( - op79, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #79" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op80, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #80" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 96, 7, 7 }; - const size_t b_shape[] = { 1, 96, 7, 7 }; - status = 
xnn_reshape_add_nd_f16( - op81, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #81" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op82, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #82" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op83, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #83" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op84, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #84" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op85, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #85" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op86, - /*batch_size=*/1, 49 /* width */, - 576 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #86" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op87, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - 
/*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #87" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op88, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #88" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 576, 7, 7 }; - const size_t b_shape[] = { 1, 576, 1, 1 }; - status = xnn_reshape_multiply_nd_f16( - op89, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #89" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op90, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #90" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 96, 7, 7 }; - const size_t b_shape[] = { 1, 96, 7, 7 }; - status = xnn_reshape_add_nd_f16( - op91, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #91" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f16( - op92, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #92" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op93, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - 
/*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #93" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f16( - op94, - /*batch_size=*/1, 49 /* width */, - 576 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #94" << std::endl; - return ExecutionPlan(); - } - - size_t op95_workspace_size = 0; - size_t op95_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op95, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op95_workspace_size, &op95_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op95_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #95" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f16( - op96, - /*batch_size=*/1, - 1024 /* channels */, - 1024 /* input stride */, - 1024 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #96" << std::endl; - return ExecutionPlan(); - } - - size_t op97_workspace_size = 0; - size_t op97_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f16( - op97, - /*batch_size=*/1, 1 /* width */, - 1024 /* channels */, 1024 /* input stride */, 1024 /* output stride */, - &op97_workspace_size, &op97_workspace_alignment, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #97" << std::endl; - return ExecutionPlan(); - } - - size_t op98_workspace_size = 0; - size_t op98_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - op98, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op98_workspace_size, &op98_workspace_alignment, - 
/*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op98_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #98" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nchw_f16( - op0, - /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op1, - /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op2, - /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f16( - op3, - /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op4, - /*input=*/v4.data(), /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op5, - /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op6, - v3.data() /* a */, v6.data() /* b */, /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - 
op7, - /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op8, - /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op9, - /*input=*/v9.data(), /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op10, - /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op11, - /*input=*/v11.data(), /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op12, - /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op13, - /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op14, - v14.data() /* a */, v11.data() /* b */, /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op15, - /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << 
std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op16, - /*input=*/v16.data(), /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op17, - /*input=*/v17.data(), /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op18, - /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f16( - op19, - /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op20, - /*input=*/v20.data(), /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op21, - /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op22, - v19.data() /* a */, v22.data() /* b */, /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op23, - /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op24, - /*input=*/v24.data(), 
/*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op25, - /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op26, - /*input=*/v26.data(), /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op27, - /*input=*/v27.data(), /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f16( - op28, - /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op29, - /*input=*/v29.data(), /*output=*/v30.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op30, - /*input=*/v30.data(), /*output=*/v31.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #30" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op31, - v28.data() /* a */, v31.data() /* b */, /*output=*/v32.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op32, - /*input=*/v32.data(), /*output=*/v33.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #32" << std::endl; - return 
ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op33, - v33.data() /* a */, v24.data() /* b */, /*output=*/v34.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #33" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op34, - /*input=*/v34.data(), /*output=*/v35.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op35, - /*input=*/v35.data(), /*output=*/v36.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op36, - /*input=*/v36.data(), /*output=*/v37.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op37, - /*input=*/v37.data(), /*output=*/v38.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f16( - op38, - /*input=*/v38.data(), /*output=*/v39.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op39, - /*input=*/v39.data(), /*output=*/v40.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op40, - /*input=*/v40.data(), /*output=*/v41.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op41, - v38.data() /* a */, v41.data() /* b */, /*output=*/v42.data()); - if 
(status != xnn_status_success) { - std::cerr << "failed to setup operation #41" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op42, - /*input=*/v42.data(), /*output=*/v43.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op43, - v43.data() /* a */, v34.data() /* b */, /*output=*/v44.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op44, - /*input=*/v44.data(), /*output=*/v45.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op45, - /*input=*/v45.data(), /*output=*/v46.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #45" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op46, - /*input=*/v46.data(), /*output=*/v47.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op47, - /*input=*/v47.data(), /*output=*/v48.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f16( - op48, - /*input=*/v48.data(), /*output=*/v49.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op49, - /*input=*/v49.data(), /*output=*/v50.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #49" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nchw_f16( - op50, - /*input=*/v50.data(), /*output=*/v51.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #50" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op51, - v48.data() /* a */, v51.data() /* b */, /*output=*/v52.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op52, - /*input=*/v52.data(), /*output=*/v53.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op53, - /*input=*/v53.data(), /*output=*/v54.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op54, - /*input=*/v54.data(), /*output=*/v55.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op55, - /*input=*/v55.data(), /*output=*/v56.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op56, - /*input=*/v56.data(), /*output=*/v57.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #56" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f16( - op57, - /*input=*/v57.data(), /*output=*/v58.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op58, - /*input=*/v58.data(), /*output=*/v59.data()); - if (status != xnn_status_success) { - 
std::cerr << "failed to setup operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op59, - /*input=*/v59.data(), /*output=*/v60.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #59" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op60, - v57.data() /* a */, v60.data() /* b */, /*output=*/v61.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op61, - /*input=*/v61.data(), /*output=*/v62.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #61" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op62, - v62.data() /* a */, v53.data() /* b */, /*output=*/v63.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #62" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op63, - /*input=*/v63.data(), /*output=*/v64.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #63" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op64, - /*input=*/v64.data(), /*output=*/v65.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #64" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op65, - /*input=*/v65.data(), /*output=*/v66.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #65" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op66, - /*input=*/v66.data(), /*output=*/v67.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #66" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_global_average_pooling_ncw_f16( - op67, - /*input=*/v67.data(), /*output=*/v68.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #67" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op68, - /*input=*/v68.data(), /*output=*/v69.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #68" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op69, - /*input=*/v69.data(), /*output=*/v70.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #69" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op70, - v67.data() /* a */, v70.data() /* b */, /*output=*/v71.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #70" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op71, - /*input=*/v71.data(), /*output=*/v72.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #71" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op72, - /*input=*/v72.data(), /*output=*/v73.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #72" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op73, - /*input=*/v73.data(), /*output=*/v74.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #73" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op74, - /*input=*/v74.data(), /*output=*/v75.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #74" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op75, - /*input=*/v75.data(), /*output=*/v76.data()); - if (status != xnn_status_success) { - 
std::cerr << "failed to setup operation #75" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f16( - op76, - /*input=*/v76.data(), /*output=*/v77.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #76" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op77, - /*input=*/v77.data(), /*output=*/v78.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #77" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op78, - /*input=*/v78.data(), /*output=*/v79.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #78" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op79, - v76.data() /* a */, v79.data() /* b */, /*output=*/v80.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #79" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op80, - /*input=*/v80.data(), /*output=*/v81.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #80" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op81, - v81.data() /* a */, v72.data() /* b */, /*output=*/v82.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #81" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op82, - /*input=*/v82.data(), /*output=*/v83.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #82" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op83, - /*input=*/v83.data(), /*output=*/v84.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #83" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nchw_f16( - op84, - /*input=*/v84.data(), /*output=*/v85.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #84" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op85, - /*input=*/v85.data(), /*output=*/v86.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #85" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f16( - op86, - /*input=*/v86.data(), /*output=*/v87.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #86" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op87, - /*input=*/v87.data(), /*output=*/v88.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #87" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op88, - /*input=*/v88.data(), /*output=*/v89.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #88" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f16( - op89, - v86.data() /* a */, v89.data() /* b */, /*output=*/v90.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #89" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op90, - /*input=*/v90.data(), /*output=*/v91.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #90" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f16( - op91, - v91.data() /* a */, v82.data() /* b */, /*output=*/v92.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #91" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f16( - op92, - /*input=*/v92.data(), /*output=*/v93.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #92" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op93, - /*input=*/v93.data(), /*output=*/v94.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #93" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f16( - op94, - /*input=*/v94.data(), /*output=*/v95.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #94" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op95, - workspace.data(), - /*input=*/v95.data(), /*output=*/v96.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #95" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f16( - op96, - /*input=*/v96.data(), /*output=*/v97.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #96" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f16( - op97, - workspace.data(), - /*input=*/v97.data(), /*output=*/v98.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #97" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f16( - op98, - workspace.data(), - /*input=*/v98.data(), /*output=*/v99.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #98" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -} // namespace models diff --git a/models/fp32-mobilenet-v1.cc b/models/fp32-mobilenet-v1.cc deleted file mode 100644 index 265285d31dc..00000000000 --- a/models/fp32-mobilenet-v1.cc +++ /dev/null @@ -1,1536 +0,0 @@ -// 
Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! - -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan FP32MobileNetV1(pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static std::array w30; - alignas(16) static std::array w31; - alignas(16) static std::array w32; - alignas(16) static std::array w33; - alignas(16) static std::array w34; - alignas(16) static std::array w35; - alignas(16) static std::array w36; - alignas(16) static std::array w37; - alignas(16) static std::array w38; - alignas(16) static std::array w39; - alignas(16) static std::array w40; - alignas(16) static 
std::array w41; - alignas(16) static std::array w42; - alignas(16) static std::array w43; - alignas(16) static std::array w44; - alignas(16) static std::array w45; - alignas(16) static std::array w46; - alignas(16) static std::array w47; - alignas(16) static std::array w48; - alignas(16) static std::array w49; - alignas(16) static std::array w50; - alignas(16) static std::array w51; - alignas(16) static std::array w52; - alignas(16) static std::array w53; - alignas(16) static std::array w54; - alignas(16) static std::array w55; - alignas(16) static std::array w56; - alignas(16) static std::array w57; - alignas(16) static std::array w58; - alignas(16) static std::array w59; - alignas(16) static std::array w60; - alignas(16) static std::array w61; - alignas(16) static std::array w62; - alignas(16) static std::array w63; - alignas(16) static std::array w64; - alignas(16) static std::array w65; - alignas(16) static std::array w66; - alignas(16) static std::array w67; - alignas(16) static std::array w68; - alignas(16) static std::array w69; - alignas(16) static std::array w70; - alignas(16) static std::array w71; - alignas(16) static std::array w72; - alignas(16) static std::array w73; - alignas(16) static std::array w74; - alignas(16) static std::array w75; - alignas(16) static std::array w76; - alignas(16) static std::array w77; - alignas(16) static std::array w78; - alignas(16) static std::array w79; - alignas(16) static std::array w80; - alignas(16) static std::array w81; - alignas(16) static std::array w82; - alignas(16) static std::array w83; - alignas(16) static std::array w84; - alignas(16) static std::array w85; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); - std::generate(v0.begin(), v0.end(), std::ref(f32rng)); - std::generate(v1.begin(), v1.end(), std::ref(f32rng)); - std::generate(v2.begin(), v2.end(), std::ref(f32rng)); - 
std::generate(v3.begin(), v3.end(), std::ref(f32rng)); - std::generate(v4.begin(), v4.end(), std::ref(f32rng)); - std::generate(v5.begin(), v5.end(), std::ref(f32rng)); - std::generate(v6.begin(), v6.end(), std::ref(f32rng)); - std::generate(v7.begin(), v7.end(), std::ref(f32rng)); - std::generate(v8.begin(), v8.end(), std::ref(f32rng)); - std::generate(v9.begin(), v9.end(), std::ref(f32rng)); - std::generate(v10.begin(), v10.end(), std::ref(f32rng)); - std::generate(v11.begin(), v11.end(), std::ref(f32rng)); - std::generate(v12.begin(), v12.end(), std::ref(f32rng)); - std::generate(v13.begin(), v13.end(), std::ref(f32rng)); - std::generate(v14.begin(), v14.end(), std::ref(f32rng)); - std::generate(v15.begin(), v15.end(), std::ref(f32rng)); - std::generate(v16.begin(), v16.end(), std::ref(f32rng)); - std::generate(v17.begin(), v17.end(), std::ref(f32rng)); - std::generate(v18.begin(), v18.end(), std::ref(f32rng)); - std::generate(v19.begin(), v19.end(), std::ref(f32rng)); - std::generate(v20.begin(), v20.end(), std::ref(f32rng)); - std::generate(v21.begin(), v21.end(), std::ref(f32rng)); - std::generate(v22.begin(), v22.end(), std::ref(f32rng)); - std::generate(v23.begin(), v23.end(), std::ref(f32rng)); - std::generate(v24.begin(), v24.end(), std::ref(f32rng)); - std::generate(v25.begin(), v25.end(), std::ref(f32rng)); - std::generate(v26.begin(), v26.end(), std::ref(f32rng)); - std::generate(v27.begin(), v27.end(), std::ref(f32rng)); - std::generate(v28.begin(), v28.end(), std::ref(f32rng)); - std::generate(v29.begin(), v29.end(), std::ref(f32rng)); - std::generate(w30.begin(), w30.end(), std::ref(f32rng)); - std::generate(w31.begin(), w31.end(), std::ref(f32rng)); - std::generate(w32.begin(), w32.end(), std::ref(f32rng)); - std::generate(w33.begin(), w33.end(), std::ref(f32rng)); - std::generate(w34.begin(), w34.end(), std::ref(f32rng)); - std::generate(w35.begin(), w35.end(), std::ref(f32rng)); - std::generate(w36.begin(), w36.end(), std::ref(f32rng)); - 
std::generate(w37.begin(), w37.end(), std::ref(f32rng)); - std::generate(w38.begin(), w38.end(), std::ref(f32rng)); - std::generate(w39.begin(), w39.end(), std::ref(f32rng)); - std::generate(w40.begin(), w40.end(), std::ref(f32rng)); - std::generate(w41.begin(), w41.end(), std::ref(f32rng)); - std::generate(w42.begin(), w42.end(), std::ref(f32rng)); - std::generate(w43.begin(), w43.end(), std::ref(f32rng)); - std::generate(w44.begin(), w44.end(), std::ref(f32rng)); - std::generate(w45.begin(), w45.end(), std::ref(f32rng)); - std::generate(w46.begin(), w46.end(), std::ref(f32rng)); - std::generate(w47.begin(), w47.end(), std::ref(f32rng)); - std::generate(w48.begin(), w48.end(), std::ref(f32rng)); - std::generate(w49.begin(), w49.end(), std::ref(f32rng)); - std::generate(w50.begin(), w50.end(), std::ref(f32rng)); - std::generate(w51.begin(), w51.end(), std::ref(f32rng)); - std::generate(w52.begin(), w52.end(), std::ref(f32rng)); - std::generate(w53.begin(), w53.end(), std::ref(f32rng)); - std::generate(w54.begin(), w54.end(), std::ref(f32rng)); - std::generate(w55.begin(), w55.end(), std::ref(f32rng)); - std::generate(w56.begin(), w56.end(), std::ref(f32rng)); - std::generate(w57.begin(), w57.end(), std::ref(f32rng)); - std::generate(w58.begin(), w58.end(), std::ref(f32rng)); - std::generate(w59.begin(), w59.end(), std::ref(f32rng)); - std::generate(w60.begin(), w60.end(), std::ref(f32rng)); - std::generate(w61.begin(), w61.end(), std::ref(f32rng)); - std::generate(w62.begin(), w62.end(), std::ref(f32rng)); - std::generate(w63.begin(), w63.end(), std::ref(f32rng)); - std::generate(w64.begin(), w64.end(), std::ref(f32rng)); - std::generate(w65.begin(), w65.end(), std::ref(f32rng)); - std::generate(w66.begin(), w66.end(), std::ref(f32rng)); - std::generate(w67.begin(), w67.end(), std::ref(f32rng)); - std::generate(w68.begin(), w68.end(), std::ref(f32rng)); - std::generate(w69.begin(), w69.end(), std::ref(f32rng)); - std::generate(w70.begin(), w70.end(), 
std::ref(f32rng)); - std::generate(w71.begin(), w71.end(), std::ref(f32rng)); - std::generate(w72.begin(), w72.end(), std::ref(f32rng)); - std::generate(w73.begin(), w73.end(), std::ref(f32rng)); - std::generate(w74.begin(), w74.end(), std::ref(f32rng)); - std::generate(w75.begin(), w75.end(), std::ref(f32rng)); - std::generate(w76.begin(), w76.end(), std::ref(f32rng)); - std::generate(w77.begin(), w77.end(), std::ref(f32rng)); - std::generate(w78.begin(), w78.end(), std::ref(f32rng)); - std::generate(w79.begin(), w79.end(), std::ref(f32rng)); - std::generate(w80.begin(), w80.end(), std::ref(f32rng)); - std::generate(w81.begin(), w81.end(), std::ref(f32rng)); - std::generate(w82.begin(), w82.end(), std::ref(f32rng)); - std::generate(w83.begin(), w83.end(), std::ref(f32rng)); - std::generate(w84.begin(), w84.end(), std::ref(f32rng)); - std::generate(w85.begin(), w85.end(), std::ref(f32rng)); - - Operators operators; - xnn_status status; - xnn_code_cache* code_cache_ptr = nullptr; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/3, - /*group_output_channels=*/32, - /*input_channel_stride=*/3, - /*output_channel_stride=*/32, - /*kernel=*/w30.data(), /*bias=*/w31.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - 
/*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/32, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/32, - /*output_channel_stride=*/32, - /*kernel=*/w32.data(), /*bias=*/w33.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/32, - /*group_output_channels=*/64, - /*input_channel_stride=*/32, - /*output_channel_stride=*/64, - /*kernel=*/w34.data(), /*bias=*/w35.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/64, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/64, - 
/*output_channel_stride=*/64, - /*kernel=*/w36.data(), /*bias=*/w37.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/64, - /*group_output_channels=*/128, - /*input_channel_stride=*/64, - /*output_channel_stride=*/128, - /*kernel=*/w38.data(), /*bias=*/w39.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/128, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/128, - /*output_channel_stride=*/128, - /*kernel=*/w40.data(), /*bias=*/w41.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); 
- } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/128, - /*group_output_channels=*/128, - /*input_channel_stride=*/128, - /*output_channel_stride=*/128, - /*kernel=*/w42.data(), /*bias=*/w43.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/128, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/128, - /*output_channel_stride=*/128, - /*kernel=*/w44.data(), /*bias=*/w45.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, 
/*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/128, - /*group_output_channels=*/256, - /*input_channel_stride=*/128, - /*output_channel_stride=*/256, - /*kernel=*/w46.data(), /*bias=*/w47.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/256, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/256, - /*output_channel_stride=*/256, - /*kernel=*/w48.data(), /*bias=*/w49.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/256, - /*group_output_channels=*/256, - /*input_channel_stride=*/256, - /*output_channel_stride=*/256, - /*kernel=*/w50.data(), /*bias=*/w51.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - 
/*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/256, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/256, - /*output_channel_stride=*/256, - /*kernel=*/w52.data(), /*bias=*/w53.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/256, - /*group_output_channels=*/512, - /*input_channel_stride=*/256, - /*output_channel_stride=*/512, - /*kernel=*/w54.data(), /*bias=*/w55.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = 
xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/512, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*kernel=*/w56.data(), /*bias=*/w57.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/512, - /*group_output_channels=*/512, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*kernel=*/w58.data(), /*bias=*/w59.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/512, - 
/*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*kernel=*/w60.data(), /*bias=*/w61.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/512, - /*group_output_channels=*/512, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*kernel=*/w62.data(), /*bias=*/w63.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/512, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*kernel=*/w64.data(), /*bias=*/w65.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op17); - if (status != 
xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/512, - /*group_output_channels=*/512, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*kernel=*/w66.data(), /*bias=*/w67.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/512, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*kernel=*/w68.data(), /*bias=*/w69.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - 
/*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/512, - /*group_output_channels=*/512, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*kernel=*/w70.data(), /*bias=*/w71.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/512, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*kernel=*/w72.data(), /*bias=*/w73.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/512, - /*group_output_channels=*/512, - /*input_channel_stride=*/512, - 
/*output_channel_stride=*/512, - /*kernel=*/w74.data(), /*bias=*/w75.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/512, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*kernel=*/w76.data(), /*bias=*/w77.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/512, - /*group_output_channels=*/1024, - /*input_channel_stride=*/512, - /*output_channel_stride=*/1024, - /*kernel=*/w78.data(), /*bias=*/w79.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return 
ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1024, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/1024, - /*output_channel_stride=*/1024, - /*kernel=*/w80.data(), /*bias=*/w81.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/1024, - /*group_output_channels=*/1024, - /*input_channel_stride=*/1024, - /*output_channel_stride=*/1024, - /*kernel=*/w82.data(), /*bias=*/w83.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op27); - if (status != xnn_status_success) { - std::cerr << 
"failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/1024, - /*group_output_channels=*/1001, - /*input_channel_stride=*/1024, - /*output_channel_stride=*/1001, - /*kernel=*/w84.data(), /*bias=*/w85.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - size_t op0_workspace_size = 0; - size_t op0_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - &op0_workspace_size, &op0_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op0_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - size_t op1_workspace_size = 0; - size_t op1_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op1, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op1_workspace_size, &op1_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op1_workspace_size); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - size_t op2_workspace_size = 0; - size_t op2_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op2_workspace_size, &op2_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op2_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #2" << std::endl; - return ExecutionPlan(); - } - - size_t op3_workspace_size = 0; - size_t op3_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op3, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op3_workspace_size, &op3_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op3_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - size_t op4_workspace_size = 0; - size_t op4_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op4, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op4_workspace_size, &op4_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op4_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - size_t op5_workspace_size = 0; - size_t op5_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op5, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op5_workspace_size, &op5_workspace_alignment, - /*output_height_out=*/nullptr, 
/*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op5_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - size_t op6_workspace_size = 0; - size_t op6_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op6, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op6_workspace_size, &op6_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op6_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - size_t op7_workspace_size = 0; - size_t op7_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op7_workspace_size, &op7_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op7_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - size_t op8_workspace_size = 0; - size_t op8_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op8, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op8_workspace_size, &op8_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op8_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - size_t op9_workspace_size = 0; - size_t op9_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op9, - 
/*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op9_workspace_size, &op9_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op9_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - size_t op10_workspace_size = 0; - size_t op10_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op10, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op10_workspace_size, &op10_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op10_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - size_t op11_workspace_size = 0; - size_t op11_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op11, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op11_workspace_size, &op11_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op11_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - size_t op12_workspace_size = 0; - size_t op12_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op12, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op12_workspace_size, &op12_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op12_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - 
return ExecutionPlan(); - } - - size_t op13_workspace_size = 0; - size_t op13_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op13, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op13_workspace_size, &op13_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op13_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - size_t op14_workspace_size = 0; - size_t op14_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op14, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op14_workspace_size, &op14_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op14_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - size_t op15_workspace_size = 0; - size_t op15_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op15, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op15_workspace_size, &op15_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op15_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - size_t op16_workspace_size = 0; - size_t op16_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op16, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op16_workspace_size, &op16_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - 
max_workspace_size = std::max(max_workspace_size, op16_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - size_t op17_workspace_size = 0; - size_t op17_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op17, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op17_workspace_size, &op17_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op17_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - size_t op18_workspace_size = 0; - size_t op18_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op18, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op18_workspace_size, &op18_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op18_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - size_t op19_workspace_size = 0; - size_t op19_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op19, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op19_workspace_size, &op19_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op19_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - size_t op20_workspace_size = 0; - size_t op20_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op20, - /*batch_size=*/1, /*input_height=*/14, 
/*input_width=*/14, - &op20_workspace_size, &op20_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op20_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - size_t op21_workspace_size = 0; - size_t op21_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op21, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op21_workspace_size, &op21_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op21_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - size_t op22_workspace_size = 0; - size_t op22_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op22, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op22_workspace_size, &op22_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op22_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - size_t op23_workspace_size = 0; - size_t op23_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op23, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op23_workspace_size, &op23_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op23_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - size_t 
op24_workspace_size = 0; - size_t op24_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op24, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op24_workspace_size, &op24_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op24_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - size_t op25_workspace_size = 0; - size_t op25_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op25, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op25_workspace_size, &op25_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op25_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - size_t op26_workspace_size = 0; - size_t op26_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op26, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op26_workspace_size, &op26_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op26_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - size_t op27_workspace_size = 0; - size_t op27_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op27, - /*batch_size=*/1, 49 /* width */, - 1024 /* channels */, 1024 /* input stride */, 1024 /* output stride */, - &op27_workspace_size, &op27_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, 
op27_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - size_t op28_workspace_size = 0; - size_t op28_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op28, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op28_workspace_size, &op28_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op28_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nhwc_f32( - op0, - workspace.data(), /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op1, - workspace.data(), /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op2, - workspace.data(), /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op3, - workspace.data(), /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op4, - workspace.data(), /*input=*/v4.data(), /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nhwc_f32( - op5, - workspace.data(), /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op6, - workspace.data(), /*input=*/v6.data(), /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op7, - workspace.data(), /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op8, - workspace.data(), /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op9, - workspace.data(), /*input=*/v9.data(), /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op10, - workspace.data(), /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op11, - workspace.data(), /*input=*/v11.data(), /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op12, - workspace.data(), /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nhwc_f32( - op13, - workspace.data(), /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op14, - workspace.data(), /*input=*/v14.data(), /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op15, - workspace.data(), /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op16, - workspace.data(), /*input=*/v16.data(), /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op17, - workspace.data(), /*input=*/v17.data(), /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op18, - workspace.data(), /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op19, - workspace.data(), /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op20, - workspace.data(), /*input=*/v20.data(), /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nhwc_f32( - op21, - workspace.data(), /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op22, - workspace.data(), /*input=*/v22.data(), /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op23, - workspace.data(), /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op24, - workspace.data(), /*input=*/v24.data(), /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op25, - workspace.data(), /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op26, - workspace.data(), /*input=*/v26.data(), /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op27, - workspace.data(), - /*input=*/v27.data(), /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op28, - workspace.data(), /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - 
- XNN_PRAGMA_CLANG("clang diagnostic push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -} // namespace models diff --git a/models/fp32-mobilenet-v2.cc b/models/fp32-mobilenet-v2.cc deleted file mode 100644 index 9894d701480..00000000000 --- a/models/fp32-mobilenet-v2.cc +++ /dev/null @@ -1,3240 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! - -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan FP32MobileNetV2(pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - 
alignas(16) static std::array v30; - alignas(16) static std::array v31; - alignas(16) static std::array v32; - alignas(16) static std::array v33; - alignas(16) static std::array v34; - alignas(16) static std::array v35; - alignas(16) static std::array v36; - alignas(16) static std::array v37; - alignas(16) static std::array v38; - alignas(16) static std::array v39; - alignas(16) static std::array v40; - alignas(16) static std::array v41; - alignas(16) static std::array v42; - alignas(16) static std::array v43; - alignas(16) static std::array v44; - alignas(16) static std::array v45; - alignas(16) static std::array v46; - alignas(16) static std::array v47; - alignas(16) static std::array v48; - alignas(16) static std::array v49; - alignas(16) static std::array v50; - alignas(16) static std::array v51; - alignas(16) static std::array v52; - alignas(16) static std::array v53; - alignas(16) static std::array v54; - alignas(16) static std::array v55; - alignas(16) static std::array v56; - alignas(16) static std::array v57; - alignas(16) static std::array v58; - alignas(16) static std::array v59; - alignas(16) static std::array v60; - alignas(16) static std::array v61; - alignas(16) static std::array v62; - alignas(16) static std::array v63; - alignas(16) static std::array v64; - alignas(16) static std::array v65; - alignas(16) static std::array v66; - alignas(16) static std::array w67; - alignas(16) static std::array w68; - alignas(16) static std::array w69; - alignas(16) static std::array w70; - alignas(16) static std::array w71; - alignas(16) static std::array w72; - alignas(16) static std::array w73; - alignas(16) static std::array w74; - alignas(16) static std::array w75; - alignas(16) static std::array w76; - alignas(16) static std::array w77; - alignas(16) static std::array w78; - alignas(16) static std::array w79; - alignas(16) static std::array w80; - alignas(16) static std::array w81; - alignas(16) static std::array w82; - alignas(16) static std::array w83; - 
alignas(16) static std::array w84; - alignas(16) static std::array w85; - alignas(16) static std::array w86; - alignas(16) static std::array w87; - alignas(16) static std::array w88; - alignas(16) static std::array w89; - alignas(16) static std::array w90; - alignas(16) static std::array w91; - alignas(16) static std::array w92; - alignas(16) static std::array w93; - alignas(16) static std::array w94; - alignas(16) static std::array w95; - alignas(16) static std::array w96; - alignas(16) static std::array w97; - alignas(16) static std::array w98; - alignas(16) static std::array w99; - alignas(16) static std::array w100; - alignas(16) static std::array w101; - alignas(16) static std::array w102; - alignas(16) static std::array w103; - alignas(16) static std::array w104; - alignas(16) static std::array w105; - alignas(16) static std::array w106; - alignas(16) static std::array w107; - alignas(16) static std::array w108; - alignas(16) static std::array w109; - alignas(16) static std::array w110; - alignas(16) static std::array w111; - alignas(16) static std::array w112; - alignas(16) static std::array w113; - alignas(16) static std::array w114; - alignas(16) static std::array w115; - alignas(16) static std::array w116; - alignas(16) static std::array w117; - alignas(16) static std::array w118; - alignas(16) static std::array w119; - alignas(16) static std::array w120; - alignas(16) static std::array w121; - alignas(16) static std::array w122; - alignas(16) static std::array w123; - alignas(16) static std::array w124; - alignas(16) static std::array w125; - alignas(16) static std::array w126; - alignas(16) static std::array w127; - alignas(16) static std::array w128; - alignas(16) static std::array w129; - alignas(16) static std::array w130; - alignas(16) static std::array w131; - alignas(16) static std::array w132; - alignas(16) static std::array w133; - alignas(16) static std::array w134; - alignas(16) static std::array w135; - alignas(16) static std::array w136; - 
alignas(16) static std::array w137; - alignas(16) static std::array w138; - alignas(16) static std::array w139; - alignas(16) static std::array w140; - alignas(16) static std::array w141; - alignas(16) static std::array w142; - alignas(16) static std::array w143; - alignas(16) static std::array w144; - alignas(16) static std::array w145; - alignas(16) static std::array w146; - alignas(16) static std::array w147; - alignas(16) static std::array w148; - alignas(16) static std::array w149; - alignas(16) static std::array w150; - alignas(16) static std::array w151; - alignas(16) static std::array w152; - alignas(16) static std::array w153; - alignas(16) static std::array w154; - alignas(16) static std::array w155; - alignas(16) static std::array w156; - alignas(16) static std::array w157; - alignas(16) static std::array w158; - alignas(16) static std::array w159; - alignas(16) static std::array w160; - alignas(16) static std::array w161; - alignas(16) static std::array w162; - alignas(16) static std::array w163; - alignas(16) static std::array w164; - alignas(16) static std::array w165; - alignas(16) static std::array w166; - alignas(16) static std::array w167; - alignas(16) static std::array w168; - alignas(16) static std::array w169; - alignas(16) static std::array w170; - alignas(16) static std::array w171; - alignas(16) static std::array w172; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); - std::generate(v0.begin(), v0.end(), std::ref(f32rng)); - std::generate(v1.begin(), v1.end(), std::ref(f32rng)); - std::generate(v2.begin(), v2.end(), std::ref(f32rng)); - std::generate(v3.begin(), v3.end(), std::ref(f32rng)); - std::generate(v4.begin(), v4.end(), std::ref(f32rng)); - std::generate(v5.begin(), v5.end(), std::ref(f32rng)); - std::generate(v6.begin(), v6.end(), std::ref(f32rng)); - std::generate(v7.begin(), v7.end(), std::ref(f32rng)); - 
std::generate(v8.begin(), v8.end(), std::ref(f32rng)); - std::generate(v9.begin(), v9.end(), std::ref(f32rng)); - std::generate(v10.begin(), v10.end(), std::ref(f32rng)); - std::generate(v11.begin(), v11.end(), std::ref(f32rng)); - std::generate(v12.begin(), v12.end(), std::ref(f32rng)); - std::generate(v13.begin(), v13.end(), std::ref(f32rng)); - std::generate(v14.begin(), v14.end(), std::ref(f32rng)); - std::generate(v15.begin(), v15.end(), std::ref(f32rng)); - std::generate(v16.begin(), v16.end(), std::ref(f32rng)); - std::generate(v17.begin(), v17.end(), std::ref(f32rng)); - std::generate(v18.begin(), v18.end(), std::ref(f32rng)); - std::generate(v19.begin(), v19.end(), std::ref(f32rng)); - std::generate(v20.begin(), v20.end(), std::ref(f32rng)); - std::generate(v21.begin(), v21.end(), std::ref(f32rng)); - std::generate(v22.begin(), v22.end(), std::ref(f32rng)); - std::generate(v23.begin(), v23.end(), std::ref(f32rng)); - std::generate(v24.begin(), v24.end(), std::ref(f32rng)); - std::generate(v25.begin(), v25.end(), std::ref(f32rng)); - std::generate(v26.begin(), v26.end(), std::ref(f32rng)); - std::generate(v27.begin(), v27.end(), std::ref(f32rng)); - std::generate(v28.begin(), v28.end(), std::ref(f32rng)); - std::generate(v29.begin(), v29.end(), std::ref(f32rng)); - std::generate(v30.begin(), v30.end(), std::ref(f32rng)); - std::generate(v31.begin(), v31.end(), std::ref(f32rng)); - std::generate(v32.begin(), v32.end(), std::ref(f32rng)); - std::generate(v33.begin(), v33.end(), std::ref(f32rng)); - std::generate(v34.begin(), v34.end(), std::ref(f32rng)); - std::generate(v35.begin(), v35.end(), std::ref(f32rng)); - std::generate(v36.begin(), v36.end(), std::ref(f32rng)); - std::generate(v37.begin(), v37.end(), std::ref(f32rng)); - std::generate(v38.begin(), v38.end(), std::ref(f32rng)); - std::generate(v39.begin(), v39.end(), std::ref(f32rng)); - std::generate(v40.begin(), v40.end(), std::ref(f32rng)); - std::generate(v41.begin(), v41.end(), std::ref(f32rng)); 
- std::generate(v42.begin(), v42.end(), std::ref(f32rng)); - std::generate(v43.begin(), v43.end(), std::ref(f32rng)); - std::generate(v44.begin(), v44.end(), std::ref(f32rng)); - std::generate(v45.begin(), v45.end(), std::ref(f32rng)); - std::generate(v46.begin(), v46.end(), std::ref(f32rng)); - std::generate(v47.begin(), v47.end(), std::ref(f32rng)); - std::generate(v48.begin(), v48.end(), std::ref(f32rng)); - std::generate(v49.begin(), v49.end(), std::ref(f32rng)); - std::generate(v50.begin(), v50.end(), std::ref(f32rng)); - std::generate(v51.begin(), v51.end(), std::ref(f32rng)); - std::generate(v52.begin(), v52.end(), std::ref(f32rng)); - std::generate(v53.begin(), v53.end(), std::ref(f32rng)); - std::generate(v54.begin(), v54.end(), std::ref(f32rng)); - std::generate(v55.begin(), v55.end(), std::ref(f32rng)); - std::generate(v56.begin(), v56.end(), std::ref(f32rng)); - std::generate(v57.begin(), v57.end(), std::ref(f32rng)); - std::generate(v58.begin(), v58.end(), std::ref(f32rng)); - std::generate(v59.begin(), v59.end(), std::ref(f32rng)); - std::generate(v60.begin(), v60.end(), std::ref(f32rng)); - std::generate(v61.begin(), v61.end(), std::ref(f32rng)); - std::generate(v62.begin(), v62.end(), std::ref(f32rng)); - std::generate(v63.begin(), v63.end(), std::ref(f32rng)); - std::generate(v64.begin(), v64.end(), std::ref(f32rng)); - std::generate(v65.begin(), v65.end(), std::ref(f32rng)); - std::generate(v66.begin(), v66.end(), std::ref(f32rng)); - std::generate(w67.begin(), w67.end(), std::ref(f32rng)); - std::generate(w68.begin(), w68.end(), std::ref(f32rng)); - std::generate(w69.begin(), w69.end(), std::ref(f32rng)); - std::generate(w70.begin(), w70.end(), std::ref(f32rng)); - std::generate(w71.begin(), w71.end(), std::ref(f32rng)); - std::generate(w72.begin(), w72.end(), std::ref(f32rng)); - std::generate(w73.begin(), w73.end(), std::ref(f32rng)); - std::generate(w74.begin(), w74.end(), std::ref(f32rng)); - std::generate(w75.begin(), w75.end(), 
std::ref(f32rng)); - std::generate(w76.begin(), w76.end(), std::ref(f32rng)); - std::generate(w77.begin(), w77.end(), std::ref(f32rng)); - std::generate(w78.begin(), w78.end(), std::ref(f32rng)); - std::generate(w79.begin(), w79.end(), std::ref(f32rng)); - std::generate(w80.begin(), w80.end(), std::ref(f32rng)); - std::generate(w81.begin(), w81.end(), std::ref(f32rng)); - std::generate(w82.begin(), w82.end(), std::ref(f32rng)); - std::generate(w83.begin(), w83.end(), std::ref(f32rng)); - std::generate(w84.begin(), w84.end(), std::ref(f32rng)); - std::generate(w85.begin(), w85.end(), std::ref(f32rng)); - std::generate(w86.begin(), w86.end(), std::ref(f32rng)); - std::generate(w87.begin(), w87.end(), std::ref(f32rng)); - std::generate(w88.begin(), w88.end(), std::ref(f32rng)); - std::generate(w89.begin(), w89.end(), std::ref(f32rng)); - std::generate(w90.begin(), w90.end(), std::ref(f32rng)); - std::generate(w91.begin(), w91.end(), std::ref(f32rng)); - std::generate(w92.begin(), w92.end(), std::ref(f32rng)); - std::generate(w93.begin(), w93.end(), std::ref(f32rng)); - std::generate(w94.begin(), w94.end(), std::ref(f32rng)); - std::generate(w95.begin(), w95.end(), std::ref(f32rng)); - std::generate(w96.begin(), w96.end(), std::ref(f32rng)); - std::generate(w97.begin(), w97.end(), std::ref(f32rng)); - std::generate(w98.begin(), w98.end(), std::ref(f32rng)); - std::generate(w99.begin(), w99.end(), std::ref(f32rng)); - std::generate(w100.begin(), w100.end(), std::ref(f32rng)); - std::generate(w101.begin(), w101.end(), std::ref(f32rng)); - std::generate(w102.begin(), w102.end(), std::ref(f32rng)); - std::generate(w103.begin(), w103.end(), std::ref(f32rng)); - std::generate(w104.begin(), w104.end(), std::ref(f32rng)); - std::generate(w105.begin(), w105.end(), std::ref(f32rng)); - std::generate(w106.begin(), w106.end(), std::ref(f32rng)); - std::generate(w107.begin(), w107.end(), std::ref(f32rng)); - std::generate(w108.begin(), w108.end(), std::ref(f32rng)); - 
std::generate(w109.begin(), w109.end(), std::ref(f32rng)); - std::generate(w110.begin(), w110.end(), std::ref(f32rng)); - std::generate(w111.begin(), w111.end(), std::ref(f32rng)); - std::generate(w112.begin(), w112.end(), std::ref(f32rng)); - std::generate(w113.begin(), w113.end(), std::ref(f32rng)); - std::generate(w114.begin(), w114.end(), std::ref(f32rng)); - std::generate(w115.begin(), w115.end(), std::ref(f32rng)); - std::generate(w116.begin(), w116.end(), std::ref(f32rng)); - std::generate(w117.begin(), w117.end(), std::ref(f32rng)); - std::generate(w118.begin(), w118.end(), std::ref(f32rng)); - std::generate(w119.begin(), w119.end(), std::ref(f32rng)); - std::generate(w120.begin(), w120.end(), std::ref(f32rng)); - std::generate(w121.begin(), w121.end(), std::ref(f32rng)); - std::generate(w122.begin(), w122.end(), std::ref(f32rng)); - std::generate(w123.begin(), w123.end(), std::ref(f32rng)); - std::generate(w124.begin(), w124.end(), std::ref(f32rng)); - std::generate(w125.begin(), w125.end(), std::ref(f32rng)); - std::generate(w126.begin(), w126.end(), std::ref(f32rng)); - std::generate(w127.begin(), w127.end(), std::ref(f32rng)); - std::generate(w128.begin(), w128.end(), std::ref(f32rng)); - std::generate(w129.begin(), w129.end(), std::ref(f32rng)); - std::generate(w130.begin(), w130.end(), std::ref(f32rng)); - std::generate(w131.begin(), w131.end(), std::ref(f32rng)); - std::generate(w132.begin(), w132.end(), std::ref(f32rng)); - std::generate(w133.begin(), w133.end(), std::ref(f32rng)); - std::generate(w134.begin(), w134.end(), std::ref(f32rng)); - std::generate(w135.begin(), w135.end(), std::ref(f32rng)); - std::generate(w136.begin(), w136.end(), std::ref(f32rng)); - std::generate(w137.begin(), w137.end(), std::ref(f32rng)); - std::generate(w138.begin(), w138.end(), std::ref(f32rng)); - std::generate(w139.begin(), w139.end(), std::ref(f32rng)); - std::generate(w140.begin(), w140.end(), std::ref(f32rng)); - std::generate(w141.begin(), w141.end(), 
std::ref(f32rng)); - std::generate(w142.begin(), w142.end(), std::ref(f32rng)); - std::generate(w143.begin(), w143.end(), std::ref(f32rng)); - std::generate(w144.begin(), w144.end(), std::ref(f32rng)); - std::generate(w145.begin(), w145.end(), std::ref(f32rng)); - std::generate(w146.begin(), w146.end(), std::ref(f32rng)); - std::generate(w147.begin(), w147.end(), std::ref(f32rng)); - std::generate(w148.begin(), w148.end(), std::ref(f32rng)); - std::generate(w149.begin(), w149.end(), std::ref(f32rng)); - std::generate(w150.begin(), w150.end(), std::ref(f32rng)); - std::generate(w151.begin(), w151.end(), std::ref(f32rng)); - std::generate(w152.begin(), w152.end(), std::ref(f32rng)); - std::generate(w153.begin(), w153.end(), std::ref(f32rng)); - std::generate(w154.begin(), w154.end(), std::ref(f32rng)); - std::generate(w155.begin(), w155.end(), std::ref(f32rng)); - std::generate(w156.begin(), w156.end(), std::ref(f32rng)); - std::generate(w157.begin(), w157.end(), std::ref(f32rng)); - std::generate(w158.begin(), w158.end(), std::ref(f32rng)); - std::generate(w159.begin(), w159.end(), std::ref(f32rng)); - std::generate(w160.begin(), w160.end(), std::ref(f32rng)); - std::generate(w161.begin(), w161.end(), std::ref(f32rng)); - std::generate(w162.begin(), w162.end(), std::ref(f32rng)); - std::generate(w163.begin(), w163.end(), std::ref(f32rng)); - std::generate(w164.begin(), w164.end(), std::ref(f32rng)); - std::generate(w165.begin(), w165.end(), std::ref(f32rng)); - std::generate(w166.begin(), w166.end(), std::ref(f32rng)); - std::generate(w167.begin(), w167.end(), std::ref(f32rng)); - std::generate(w168.begin(), w168.end(), std::ref(f32rng)); - std::generate(w169.begin(), w169.end(), std::ref(f32rng)); - std::generate(w170.begin(), w170.end(), std::ref(f32rng)); - std::generate(w171.begin(), w171.end(), std::ref(f32rng)); - std::generate(w172.begin(), w172.end(), std::ref(f32rng)); - - Operators operators; - xnn_status status; - xnn_code_cache* code_cache_ptr = nullptr; 
- size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/3, - /*group_output_channels=*/32, - /*input_channel_stride=*/3, - /*output_channel_stride=*/32, - /*kernel=*/w67.data(), /*bias=*/w68.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/32, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/32, - /*output_channel_stride=*/32, - /*kernel=*/w69.data(), /*bias=*/w70.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - 
/*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/32, - /*group_output_channels=*/16, - /*input_channel_stride=*/32, - /*output_channel_stride=*/16, - /*kernel=*/w71.data(), /*bias=*/w72.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/16, - /*group_output_channels=*/96, - /*input_channel_stride=*/16, - /*output_channel_stride=*/96, - /*kernel=*/w73.data(), /*bias=*/w74.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/96, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/96, - /*output_channel_stride=*/96, - /*kernel=*/w75.data(), /*bias=*/w76.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - 
/*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/96, - /*group_output_channels=*/24, - /*input_channel_stride=*/96, - /*output_channel_stride=*/24, - /*kernel=*/w77.data(), /*bias=*/w78.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/24, - /*group_output_channels=*/144, - /*input_channel_stride=*/24, - /*output_channel_stride=*/144, - /*kernel=*/w79.data(), /*bias=*/w80.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - 
xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/144, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/144, - /*output_channel_stride=*/144, - /*kernel=*/w81.data(), /*bias=*/w82.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/144, - /*group_output_channels=*/24, - /*input_channel_stride=*/144, - /*output_channel_stride=*/24, - /*kernel=*/w83.data(), /*bias=*/w84.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation 
#9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/24, - /*group_output_channels=*/144, - /*input_channel_stride=*/24, - /*output_channel_stride=*/144, - /*kernel=*/w85.data(), /*bias=*/w86.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/144, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/144, - /*output_channel_stride=*/144, - /*kernel=*/w87.data(), /*bias=*/w88.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, 
/*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/144, - /*group_output_channels=*/32, - /*input_channel_stride=*/144, - /*output_channel_stride=*/32, - /*kernel=*/w89.data(), /*bias=*/w90.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/32, - /*group_output_channels=*/192, - /*input_channel_stride=*/32, - /*output_channel_stride=*/192, - /*kernel=*/w91.data(), /*bias=*/w92.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/192, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/192, - /*output_channel_stride=*/192, - 
/*kernel=*/w93.data(), /*bias=*/w94.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/192, - /*group_output_channels=*/32, - /*input_channel_stride=*/192, - /*output_channel_stride=*/32, - /*kernel=*/w95.data(), /*bias=*/w96.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - 
/*group_input_channels=*/32, - /*group_output_channels=*/192, - /*input_channel_stride=*/32, - /*output_channel_stride=*/192, - /*kernel=*/w97.data(), /*bias=*/w98.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/192, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/192, - /*output_channel_stride=*/192, - /*kernel=*/w99.data(), /*bias=*/w100.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/192, - /*group_output_channels=*/32, - /*input_channel_stride=*/192, - /*output_channel_stride=*/32, - /*kernel=*/w101.data(), /*bias=*/w102.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - 
/*weights_cache=*/nullptr, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/32, - /*group_output_channels=*/192, - /*input_channel_stride=*/32, - /*output_channel_stride=*/192, - /*kernel=*/w103.data(), /*bias=*/w104.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/192, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/192, - /*output_channel_stride=*/192, - /*kernel=*/w105.data(), /*bias=*/w106.data(), - /*output_min=*/0.0f, 
/*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/192, - /*group_output_channels=*/64, - /*input_channel_stride=*/192, - /*output_channel_stride=*/64, - /*kernel=*/w107.data(), /*bias=*/w108.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/64, - /*group_output_channels=*/384, - /*input_channel_stride=*/64, - /*output_channel_stride=*/384, - /*kernel=*/w109.data(), /*bias=*/w110.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - 
operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/384, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/384, - /*output_channel_stride=*/384, - /*kernel=*/w111.data(), /*bias=*/w112.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/384, - /*group_output_channels=*/64, - /*input_channel_stride=*/384, - /*output_channel_stride=*/64, - /*kernel=*/w113.data(), /*bias=*/w114.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op27); - if (status 
!= xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/64, - /*group_output_channels=*/384, - /*input_channel_stride=*/64, - /*output_channel_stride=*/384, - /*kernel=*/w115.data(), /*bias=*/w116.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - xnn_operator_t op29 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/384, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/384, - /*output_channel_stride=*/384, - /*kernel=*/w117.data(), /*bias=*/w118.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op29); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #29" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op29, xnn_delete_operator); - - xnn_operator_t op30 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - 
/*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/384, - /*group_output_channels=*/64, - /*input_channel_stride=*/384, - /*output_channel_stride=*/64, - /*kernel=*/w119.data(), /*bias=*/w120.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op30); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #30" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op30, xnn_delete_operator); - - xnn_operator_t op31 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op31); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #31" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op31, xnn_delete_operator); - - xnn_operator_t op32 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/64, - /*group_output_channels=*/384, - /*input_channel_stride=*/64, - /*output_channel_stride=*/384, - /*kernel=*/w121.data(), /*bias=*/w122.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op32); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #32" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op32, xnn_delete_operator); - - xnn_operator_t 
op33 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/384, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/384, - /*output_channel_stride=*/384, - /*kernel=*/w123.data(), /*bias=*/w124.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op33); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #33" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op33, xnn_delete_operator); - - xnn_operator_t op34 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/384, - /*group_output_channels=*/64, - /*input_channel_stride=*/384, - /*output_channel_stride=*/64, - /*kernel=*/w125.data(), /*bias=*/w126.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op34); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #34" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op34, xnn_delete_operator); - - xnn_operator_t op35 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op35); - if (status != xnn_status_success) { - std::cerr << "failed to create operation 
#35" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op35, xnn_delete_operator); - - xnn_operator_t op36 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/64, - /*group_output_channels=*/384, - /*input_channel_stride=*/64, - /*output_channel_stride=*/384, - /*kernel=*/w127.data(), /*bias=*/w128.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op36); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #36" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op36, xnn_delete_operator); - - xnn_operator_t op37 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/384, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/384, - /*output_channel_stride=*/384, - /*kernel=*/w129.data(), /*bias=*/w130.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op37); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #37" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op37, xnn_delete_operator); - - xnn_operator_t op38 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, 
/*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/384, - /*group_output_channels=*/96, - /*input_channel_stride=*/384, - /*output_channel_stride=*/96, - /*kernel=*/w131.data(), /*bias=*/w132.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op38); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #38" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op38, xnn_delete_operator); - - xnn_operator_t op39 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/96, - /*group_output_channels=*/576, - /*input_channel_stride=*/96, - /*output_channel_stride=*/576, - /*kernel=*/w133.data(), /*bias=*/w134.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op39); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #39" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op39, xnn_delete_operator); - - xnn_operator_t op40 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/576, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/576, - /*output_channel_stride=*/576, 
- /*kernel=*/w135.data(), /*bias=*/w136.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op40); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #40" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op40, xnn_delete_operator); - - xnn_operator_t op41 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/576, - /*group_output_channels=*/96, - /*input_channel_stride=*/576, - /*output_channel_stride=*/96, - /*kernel=*/w137.data(), /*bias=*/w138.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op41); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #41" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op41, xnn_delete_operator); - - xnn_operator_t op42 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op42); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #42" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op42, xnn_delete_operator); - - xnn_operator_t op43 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - 
/*group_input_channels=*/96, - /*group_output_channels=*/576, - /*input_channel_stride=*/96, - /*output_channel_stride=*/576, - /*kernel=*/w139.data(), /*bias=*/w140.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op43); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #43" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op43, xnn_delete_operator); - - xnn_operator_t op44 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/576, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/576, - /*output_channel_stride=*/576, - /*kernel=*/w141.data(), /*bias=*/w142.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op44); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #44" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op44, xnn_delete_operator); - - xnn_operator_t op45 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/576, - /*group_output_channels=*/96, - /*input_channel_stride=*/576, - /*output_channel_stride=*/96, - /*kernel=*/w143.data(), /*bias=*/w144.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - 
/*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op45); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #45" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op45, xnn_delete_operator); - - xnn_operator_t op46 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op46); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #46" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op46, xnn_delete_operator); - - xnn_operator_t op47 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/96, - /*group_output_channels=*/576, - /*input_channel_stride=*/96, - /*output_channel_stride=*/576, - /*kernel=*/w145.data(), /*bias=*/w146.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op47); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #47" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op47, xnn_delete_operator); - - xnn_operator_t op48 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/576, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/576, - /*output_channel_stride=*/576, - /*kernel=*/w147.data(), 
/*bias=*/w148.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op48); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #48" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op48, xnn_delete_operator); - - xnn_operator_t op49 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/576, - /*group_output_channels=*/160, - /*input_channel_stride=*/576, - /*output_channel_stride=*/160, - /*kernel=*/w149.data(), /*bias=*/w150.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op49); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #49" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op49, xnn_delete_operator); - - xnn_operator_t op50 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/160, - /*group_output_channels=*/960, - /*input_channel_stride=*/160, - /*output_channel_stride=*/960, - /*kernel=*/w151.data(), /*bias=*/w152.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op50); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #50" << std::endl; - 
return ExecutionPlan(); - } - operators.emplace_back(op50, xnn_delete_operator); - - xnn_operator_t op51 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/960, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/960, - /*output_channel_stride=*/960, - /*kernel=*/w153.data(), /*bias=*/w154.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op51); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #51" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op51, xnn_delete_operator); - - xnn_operator_t op52 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/960, - /*group_output_channels=*/160, - /*input_channel_stride=*/960, - /*output_channel_stride=*/160, - /*kernel=*/w155.data(), /*bias=*/w156.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op52); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #52" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op52, xnn_delete_operator); - - xnn_operator_t op53 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* 
flags */, - &op53); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #53" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op53, xnn_delete_operator); - - xnn_operator_t op54 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/160, - /*group_output_channels=*/960, - /*input_channel_stride=*/160, - /*output_channel_stride=*/960, - /*kernel=*/w157.data(), /*bias=*/w158.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op54); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #54" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op54, xnn_delete_operator); - - xnn_operator_t op55 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/960, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/960, - /*output_channel_stride=*/960, - /*kernel=*/w159.data(), /*bias=*/w160.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op55); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #55" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op55, xnn_delete_operator); - - xnn_operator_t op56 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, 
/*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/960, - /*group_output_channels=*/160, - /*input_channel_stride=*/960, - /*output_channel_stride=*/160, - /*kernel=*/w161.data(), /*bias=*/w162.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op56); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #56" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op56, xnn_delete_operator); - - xnn_operator_t op57 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op57); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #57" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op57, xnn_delete_operator); - - xnn_operator_t op58 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/160, - /*group_output_channels=*/960, - /*input_channel_stride=*/160, - /*output_channel_stride=*/960, - /*kernel=*/w163.data(), /*bias=*/w164.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op58); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #58" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op58, 
xnn_delete_operator); - - xnn_operator_t op59 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/960, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/960, - /*output_channel_stride=*/960, - /*kernel=*/w165.data(), /*bias=*/w166.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op59); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #59" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op59, xnn_delete_operator); - - xnn_operator_t op60 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/960, - /*group_output_channels=*/320, - /*input_channel_stride=*/960, - /*output_channel_stride=*/320, - /*kernel=*/w167.data(), /*bias=*/w168.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op60); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #60" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op60, xnn_delete_operator); - - xnn_operator_t op61 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - 
/*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/320, - /*group_output_channels=*/1280, - /*input_channel_stride=*/320, - /*output_channel_stride=*/1280, - /*kernel=*/w169.data(), /*bias=*/w170.data(), - /*output_min=*/0.0f, /*output_max=*/6.0f, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op61); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #61" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op61, xnn_delete_operator); - - xnn_operator_t op62 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op62); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #62" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op62, xnn_delete_operator); - - xnn_operator_t op63 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/1280, - /*group_output_channels=*/1001, - /*input_channel_stride=*/1280, - /*output_channel_stride=*/1001, - /*kernel=*/w171.data(), /*bias=*/w172.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op63); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #63" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op63, xnn_delete_operator); - - xnn_operator_t op64 = nullptr; - status = xnn_create_copy_nc_x32( - 0 /* flags */, - &op64); - if (status != 
xnn_status_success) { - std::cerr << "failed to create operation #64" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op64, xnn_delete_operator); - - xnn_operator_t op65 = nullptr; - status = xnn_create_softmax_nc_f32( - /*flags=*/0, - &op65); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #65" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op65, xnn_delete_operator); - - size_t op0_workspace_size = 0; - size_t op0_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - &op0_workspace_size, &op0_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op0_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - size_t op1_workspace_size = 0; - size_t op1_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op1, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op1_workspace_size, &op1_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op1_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - size_t op2_workspace_size = 0; - size_t op2_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op2_workspace_size, &op2_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op2_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape 
operation #2" << std::endl; - return ExecutionPlan(); - } - - size_t op3_workspace_size = 0; - size_t op3_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op3, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op3_workspace_size, &op3_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op3_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - size_t op4_workspace_size = 0; - size_t op4_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op4, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op4_workspace_size, &op4_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op4_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - size_t op5_workspace_size = 0; - size_t op5_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op5, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op5_workspace_size, &op5_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op5_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - size_t op6_workspace_size = 0; - size_t op6_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op6, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op6_workspace_size, &op6_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - 
max_workspace_size = std::max(max_workspace_size, op6_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - size_t op7_workspace_size = 0; - size_t op7_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op7_workspace_size, &op7_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op7_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - size_t op8_workspace_size = 0; - size_t op8_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op8, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op8_workspace_size, &op8_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op8_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 56, 56, 24 }; - const size_t b_shape[] = { 1, 56, 56, 24 }; - status = xnn_reshape_add_nd_f32( - op9, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - size_t op10_workspace_size = 0; - size_t op10_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op10, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op10_workspace_size, &op10_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, 
op10_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - size_t op11_workspace_size = 0; - size_t op11_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op11, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op11_workspace_size, &op11_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op11_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - size_t op12_workspace_size = 0; - size_t op12_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op12, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op12_workspace_size, &op12_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op12_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - size_t op13_workspace_size = 0; - size_t op13_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op13, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op13_workspace_size, &op13_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op13_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - size_t op14_workspace_size = 0; - size_t op14_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op14, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op14_workspace_size, 
&op14_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op14_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - size_t op15_workspace_size = 0; - size_t op15_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op15, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op15_workspace_size, &op15_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op15_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 32 }; - const size_t b_shape[] = { 1, 28, 28, 32 }; - status = xnn_reshape_add_nd_f32( - op16, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - size_t op17_workspace_size = 0; - size_t op17_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op17, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op17_workspace_size, &op17_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op17_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - size_t op18_workspace_size = 0; - size_t op18_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op18, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op18_workspace_size, &op18_workspace_alignment, - 
/*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op18_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - size_t op19_workspace_size = 0; - size_t op19_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op19, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op19_workspace_size, &op19_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op19_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 32 }; - const size_t b_shape[] = { 1, 28, 28, 32 }; - status = xnn_reshape_add_nd_f32( - op20, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - size_t op21_workspace_size = 0; - size_t op21_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op21, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op21_workspace_size, &op21_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op21_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - size_t op22_workspace_size = 0; - size_t op22_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op22, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op22_workspace_size, &op22_workspace_alignment, - /*output_height_out=*/nullptr, 
/*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op22_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - size_t op23_workspace_size = 0; - size_t op23_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op23, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op23_workspace_size, &op23_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op23_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - size_t op24_workspace_size = 0; - size_t op24_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op24, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op24_workspace_size, &op24_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op24_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - size_t op25_workspace_size = 0; - size_t op25_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op25, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op25_workspace_size, &op25_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op25_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - size_t op26_workspace_size = 0; - size_t op26_workspace_alignment = 0; - status = 
xnn_reshape_convolution2d_nhwc_f32( - op26, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op26_workspace_size, &op26_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op26_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 64 }; - const size_t b_shape[] = { 1, 14, 14, 64 }; - status = xnn_reshape_add_nd_f32( - op27, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - size_t op28_workspace_size = 0; - size_t op28_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op28, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op28_workspace_size, &op28_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op28_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - size_t op29_workspace_size = 0; - size_t op29_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op29, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op29_workspace_size, &op29_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op29_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #29" << std::endl; - return ExecutionPlan(); - } - - size_t op30_workspace_size = 0; - size_t op30_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op30, - 
/*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op30_workspace_size, &op30_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op30_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #30" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 64 }; - const size_t b_shape[] = { 1, 14, 14, 64 }; - status = xnn_reshape_add_nd_f32( - op31, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #31" << std::endl; - return ExecutionPlan(); - } - - size_t op32_workspace_size = 0; - size_t op32_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op32, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op32_workspace_size, &op32_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op32_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #32" << std::endl; - return ExecutionPlan(); - } - - size_t op33_workspace_size = 0; - size_t op33_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op33, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op33_workspace_size, &op33_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op33_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #33" << std::endl; - return ExecutionPlan(); - } - - size_t op34_workspace_size = 0; - size_t op34_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op34, - /*batch_size=*/1, /*input_height=*/14, 
/*input_width=*/14, - &op34_workspace_size, &op34_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op34_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #34" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 64 }; - const size_t b_shape[] = { 1, 14, 14, 64 }; - status = xnn_reshape_add_nd_f32( - op35, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #35" << std::endl; - return ExecutionPlan(); - } - - size_t op36_workspace_size = 0; - size_t op36_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op36, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op36_workspace_size, &op36_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op36_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #36" << std::endl; - return ExecutionPlan(); - } - - size_t op37_workspace_size = 0; - size_t op37_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op37, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op37_workspace_size, &op37_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op37_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #37" << std::endl; - return ExecutionPlan(); - } - - size_t op38_workspace_size = 0; - size_t op38_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op38, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op38_workspace_size, 
&op38_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op38_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #38" << std::endl; - return ExecutionPlan(); - } - - size_t op39_workspace_size = 0; - size_t op39_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op39, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op39_workspace_size, &op39_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op39_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #39" << std::endl; - return ExecutionPlan(); - } - - size_t op40_workspace_size = 0; - size_t op40_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op40, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op40_workspace_size, &op40_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op40_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #40" << std::endl; - return ExecutionPlan(); - } - - size_t op41_workspace_size = 0; - size_t op41_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op41, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op41_workspace_size, &op41_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op41_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #41" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 96 }; - 
const size_t b_shape[] = { 1, 14, 14, 96 }; - status = xnn_reshape_add_nd_f32( - op42, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #42" << std::endl; - return ExecutionPlan(); - } - - size_t op43_workspace_size = 0; - size_t op43_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op43, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op43_workspace_size, &op43_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op43_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #43" << std::endl; - return ExecutionPlan(); - } - - size_t op44_workspace_size = 0; - size_t op44_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op44, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op44_workspace_size, &op44_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op44_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #44" << std::endl; - return ExecutionPlan(); - } - - size_t op45_workspace_size = 0; - size_t op45_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op45, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op45_workspace_size, &op45_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op45_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #45" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 96 }; - const size_t b_shape[] = { 1, 14, 14, 96 }; - 
status = xnn_reshape_add_nd_f32( - op46, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #46" << std::endl; - return ExecutionPlan(); - } - - size_t op47_workspace_size = 0; - size_t op47_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op47, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op47_workspace_size, &op47_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op47_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #47" << std::endl; - return ExecutionPlan(); - } - - size_t op48_workspace_size = 0; - size_t op48_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op48, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op48_workspace_size, &op48_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op48_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #48" << std::endl; - return ExecutionPlan(); - } - - size_t op49_workspace_size = 0; - size_t op49_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op49, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op49_workspace_size, &op49_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op49_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #49" << std::endl; - return ExecutionPlan(); - } - - size_t op50_workspace_size = 0; - size_t op50_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op50, - /*batch_size=*/1, 
/*input_height=*/7, /*input_width=*/7, - &op50_workspace_size, &op50_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op50_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #50" << std::endl; - return ExecutionPlan(); - } - - size_t op51_workspace_size = 0; - size_t op51_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op51, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op51_workspace_size, &op51_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op51_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #51" << std::endl; - return ExecutionPlan(); - } - - size_t op52_workspace_size = 0; - size_t op52_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op52, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op52_workspace_size, &op52_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op52_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #52" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 160 }; - const size_t b_shape[] = { 1, 7, 7, 160 }; - status = xnn_reshape_add_nd_f32( - op53, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #53" << std::endl; - return ExecutionPlan(); - } - - size_t op54_workspace_size = 0; - size_t op54_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op54, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - 
&op54_workspace_size, &op54_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op54_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #54" << std::endl; - return ExecutionPlan(); - } - - size_t op55_workspace_size = 0; - size_t op55_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op55, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op55_workspace_size, &op55_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op55_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #55" << std::endl; - return ExecutionPlan(); - } - - size_t op56_workspace_size = 0; - size_t op56_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op56, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op56_workspace_size, &op56_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op56_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #56" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 160 }; - const size_t b_shape[] = { 1, 7, 7, 160 }; - status = xnn_reshape_add_nd_f32( - op57, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #57" << std::endl; - return ExecutionPlan(); - } - - size_t op58_workspace_size = 0; - size_t op58_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op58, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op58_workspace_size, &op58_workspace_alignment, - 
/*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op58_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #58" << std::endl; - return ExecutionPlan(); - } - - size_t op59_workspace_size = 0; - size_t op59_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op59, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op59_workspace_size, &op59_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op59_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #59" << std::endl; - return ExecutionPlan(); - } - - size_t op60_workspace_size = 0; - size_t op60_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op60, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op60_workspace_size, &op60_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op60_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #60" << std::endl; - return ExecutionPlan(); - } - - size_t op61_workspace_size = 0; - size_t op61_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op61, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op61_workspace_size, &op61_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op61_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #61" << std::endl; - return ExecutionPlan(); - } - - size_t op62_workspace_size = 0; - size_t op62_workspace_alignment = 0; - status = 
xnn_reshape_global_average_pooling_nwc_f32( - op62, - /*batch_size=*/1, 49 /* width */, - 1280 /* channels */, 1280 /* input stride */, 1280 /* output stride */, - &op62_workspace_size, &op62_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op62_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #62" << std::endl; - return ExecutionPlan(); - } - - size_t op63_workspace_size = 0; - size_t op63_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op63, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op63_workspace_size, &op63_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op63_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #63" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x32( - op64, - /*batch_size=*/1001, - 1 /* channels */, - 1 /* input stride */, - 1 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #64" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_softmax_nc_f32( - op65, - /*channels=*/1001, - /*input_stride=*/1001, - /*output_stride=*/1001, - /*batch_size=*/1, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #65" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nhwc_f32( - op0, - workspace.data(), /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op1, - workspace.data(), /*input=*/v1.data(), /*output=*/v2.data()); - 
if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op2, - workspace.data(), /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op3, - workspace.data(), /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op4, - workspace.data(), /*input=*/v4.data(), /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op5, - workspace.data(), /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op6, - workspace.data(), /*input=*/v6.data(), /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op7, - workspace.data(), /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op8, - workspace.data(), /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op9, - v9.data() /* a */, v6.data() /* b */, /*output=*/v10.data()); - if (status != xnn_status_success) { - 
std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op10, - workspace.data(), /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op11, - workspace.data(), /*input=*/v11.data(), /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op12, - workspace.data(), /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op13, - workspace.data(), /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op14, - workspace.data(), /*input=*/v14.data(), /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op15, - workspace.data(), /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op16, - v16.data() /* a */, v13.data() /* b */, /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op17, - workspace.data(), /*input=*/v17.data(), /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << 
"failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op18, - workspace.data(), /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op19, - workspace.data(), /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op20, - v20.data() /* a */, v17.data() /* b */, /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op21, - workspace.data(), /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op22, - workspace.data(), /*input=*/v22.data(), /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op23, - workspace.data(), /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op24, - workspace.data(), /*input=*/v24.data(), /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op25, - workspace.data(), /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to 
setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op26, - workspace.data(), /*input=*/v26.data(), /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op27, - v27.data() /* a */, v24.data() /* b */, /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op28, - workspace.data(), /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op29, - workspace.data(), /*input=*/v29.data(), /*output=*/v30.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op30, - workspace.data(), /*input=*/v30.data(), /*output=*/v31.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #30" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op31, - v31.data() /* a */, v28.data() /* b */, /*output=*/v32.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op32, - workspace.data(), /*input=*/v32.data(), /*output=*/v33.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #32" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op33, - workspace.data(), /*input=*/v33.data(), /*output=*/v34.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #33" << 
std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op34, - workspace.data(), /*input=*/v34.data(), /*output=*/v35.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op35, - v35.data() /* a */, v32.data() /* b */, /*output=*/v36.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op36, - workspace.data(), /*input=*/v36.data(), /*output=*/v37.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op37, - workspace.data(), /*input=*/v37.data(), /*output=*/v38.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op38, - workspace.data(), /*input=*/v38.data(), /*output=*/v39.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op39, - workspace.data(), /*input=*/v39.data(), /*output=*/v40.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op40, - workspace.data(), /*input=*/v40.data(), /*output=*/v41.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op41, - workspace.data(), /*input=*/v41.data(), /*output=*/v42.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #41" << std::endl; 
- return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op42, - v42.data() /* a */, v39.data() /* b */, /*output=*/v43.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op43, - workspace.data(), /*input=*/v43.data(), /*output=*/v44.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op44, - workspace.data(), /*input=*/v44.data(), /*output=*/v45.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op45, - workspace.data(), /*input=*/v45.data(), /*output=*/v46.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #45" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op46, - v46.data() /* a */, v43.data() /* b */, /*output=*/v47.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op47, - workspace.data(), /*input=*/v47.data(), /*output=*/v48.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op48, - workspace.data(), /*input=*/v48.data(), /*output=*/v49.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op49, - workspace.data(), /*input=*/v49.data(), /*output=*/v50.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #49" << std::endl; - return 
ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op50, - workspace.data(), /*input=*/v50.data(), /*output=*/v51.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #50" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op51, - workspace.data(), /*input=*/v51.data(), /*output=*/v52.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op52, - workspace.data(), /*input=*/v52.data(), /*output=*/v53.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op53, - v53.data() /* a */, v50.data() /* b */, /*output=*/v54.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op54, - workspace.data(), /*input=*/v54.data(), /*output=*/v55.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op55, - workspace.data(), /*input=*/v55.data(), /*output=*/v56.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op56, - workspace.data(), /*input=*/v56.data(), /*output=*/v57.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #56" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op57, - v57.data() /* a */, v54.data() /* b */, /*output=*/v58.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #57" << std::endl; - return ExecutionPlan(); - } - 
- status = xnn_setup_convolution2d_nhwc_f32( - op58, - workspace.data(), /*input=*/v58.data(), /*output=*/v59.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op59, - workspace.data(), /*input=*/v59.data(), /*output=*/v60.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #59" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op60, - workspace.data(), /*input=*/v60.data(), /*output=*/v61.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op61, - workspace.data(), /*input=*/v61.data(), /*output=*/v62.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #61" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op62, - workspace.data(), - /*input=*/v62.data(), /*output=*/v63.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #62" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op63, - workspace.data(), /*input=*/v63.data(), /*output=*/v64.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #63" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x32( - op64, - /*input=*/v64.data(), /*output=*/v65.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #64" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_softmax_nc_f32( - op65, - /*input=*/v65.data(), /*output=*/v66.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #65" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") 
- XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -} // namespace models diff --git a/models/fp32-mobilenet-v3-large.cc b/models/fp32-mobilenet-v3-large.cc deleted file mode 100644 index 62dea46ea6c..00000000000 --- a/models/fp32-mobilenet-v3-large.cc +++ /dev/null @@ -1,5080 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! - -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan FP32MobileNetV3Large(pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static 
std::array v30; - alignas(16) static std::array v31; - alignas(16) static std::array v32; - alignas(16) static std::array v33; - alignas(16) static std::array v34; - alignas(16) static std::array v35; - alignas(16) static std::array v36; - alignas(16) static std::array v37; - alignas(16) static std::array v38; - alignas(16) static std::array v39; - alignas(16) static std::array v40; - alignas(16) static std::array v41; - alignas(16) static std::array v42; - alignas(16) static std::array v43; - alignas(16) static std::array v44; - alignas(16) static std::array v45; - alignas(16) static std::array v46; - alignas(16) static std::array v47; - alignas(16) static std::array v48; - alignas(16) static std::array v49; - alignas(16) static std::array v50; - alignas(16) static std::array v51; - alignas(16) static std::array v52; - alignas(16) static std::array v53; - alignas(16) static std::array v54; - alignas(16) static std::array v55; - alignas(16) static std::array v56; - alignas(16) static std::array v57; - alignas(16) static std::array v58; - alignas(16) static std::array v59; - alignas(16) static std::array v60; - alignas(16) static std::array v61; - alignas(16) static std::array v62; - alignas(16) static std::array v63; - alignas(16) static std::array v64; - alignas(16) static std::array v65; - alignas(16) static std::array v66; - alignas(16) static std::array v67; - alignas(16) static std::array v68; - alignas(16) static std::array v69; - alignas(16) static std::array v70; - alignas(16) static std::array v71; - alignas(16) static std::array v72; - alignas(16) static std::array v73; - alignas(16) static std::array v74; - alignas(16) static std::array v75; - alignas(16) static std::array v76; - alignas(16) static std::array v77; - alignas(16) static std::array v78; - alignas(16) static std::array v79; - alignas(16) static std::array v80; - alignas(16) static std::array v81; - alignas(16) static std::array v82; - alignas(16) static std::array v83; - alignas(16) static 
std::array v84; - alignas(16) static std::array v85; - alignas(16) static std::array v86; - alignas(16) static std::array v87; - alignas(16) static std::array v88; - alignas(16) static std::array v89; - alignas(16) static std::array v90; - alignas(16) static std::array v91; - alignas(16) static std::array v92; - alignas(16) static std::array v93; - alignas(16) static std::array v94; - alignas(16) static std::array v95; - alignas(16) static std::array v96; - alignas(16) static std::array v97; - alignas(16) static std::array v98; - alignas(16) static std::array v99; - alignas(16) static std::array v100; - alignas(16) static std::array v101; - alignas(16) static std::array v102; - alignas(16) static std::array v103; - alignas(16) static std::array v104; - alignas(16) static std::array v105; - alignas(16) static std::array v106; - alignas(16) static std::array v107; - alignas(16) static std::array v108; - alignas(16) static std::array v109; - alignas(16) static std::array v110; - alignas(16) static std::array v111; - alignas(16) static std::array v112; - alignas(16) static std::array v113; - alignas(16) static std::array v114; - alignas(16) static std::array v115; - alignas(16) static std::array w116; - alignas(16) static std::array w117; - alignas(16) static std::array w118; - alignas(16) static std::array w119; - alignas(16) static std::array w120; - alignas(16) static std::array w121; - alignas(16) static std::array w122; - alignas(16) static std::array w123; - alignas(16) static std::array w124; - alignas(16) static std::array w125; - alignas(16) static std::array w126; - alignas(16) static std::array w127; - alignas(16) static std::array w128; - alignas(16) static std::array w129; - alignas(16) static std::array w130; - alignas(16) static std::array w131; - alignas(16) static std::array w132; - alignas(16) static std::array w133; - alignas(16) static std::array w134; - alignas(16) static std::array w135; - alignas(16) static std::array w136; - alignas(16) static 
std::array w137; - alignas(16) static std::array w138; - alignas(16) static std::array w139; - alignas(16) static std::array w140; - alignas(16) static std::array w141; - alignas(16) static std::array w142; - alignas(16) static std::array w143; - alignas(16) static std::array w144; - alignas(16) static std::array w145; - alignas(16) static std::array w146; - alignas(16) static std::array w147; - alignas(16) static std::array w148; - alignas(16) static std::array w149; - alignas(16) static std::array w150; - alignas(16) static std::array w151; - alignas(16) static std::array w152; - alignas(16) static std::array w153; - alignas(16) static std::array w154; - alignas(16) static std::array w155; - alignas(16) static std::array w156; - alignas(16) static std::array w157; - alignas(16) static std::array w158; - alignas(16) static std::array w159; - alignas(16) static std::array w160; - alignas(16) static std::array w161; - alignas(16) static std::array w162; - alignas(16) static std::array w163; - alignas(16) static std::array w164; - alignas(16) static std::array w165; - alignas(16) static std::array w166; - alignas(16) static std::array w167; - alignas(16) static std::array w168; - alignas(16) static std::array w169; - alignas(16) static std::array w170; - alignas(16) static std::array w171; - alignas(16) static std::array w172; - alignas(16) static std::array w173; - alignas(16) static std::array w174; - alignas(16) static std::array w175; - alignas(16) static std::array w176; - alignas(16) static std::array w177; - alignas(16) static std::array w178; - alignas(16) static std::array w179; - alignas(16) static std::array w180; - alignas(16) static std::array w181; - alignas(16) static std::array w182; - alignas(16) static std::array w183; - alignas(16) static std::array w184; - alignas(16) static std::array w185; - alignas(16) static std::array w186; - alignas(16) static std::array w187; - alignas(16) static std::array w188; - alignas(16) static std::array w189; - 
alignas(16) static std::array w190; - alignas(16) static std::array w191; - alignas(16) static std::array w192; - alignas(16) static std::array w193; - alignas(16) static std::array w194; - alignas(16) static std::array w195; - alignas(16) static std::array w196; - alignas(16) static std::array w197; - alignas(16) static std::array w198; - alignas(16) static std::array w199; - alignas(16) static std::array w200; - alignas(16) static std::array w201; - alignas(16) static std::array w202; - alignas(16) static std::array w203; - alignas(16) static std::array w204; - alignas(16) static std::array w205; - alignas(16) static std::array w206; - alignas(16) static std::array w207; - alignas(16) static std::array w208; - alignas(16) static std::array w209; - alignas(16) static std::array w210; - alignas(16) static std::array w211; - alignas(16) static std::array w212; - alignas(16) static std::array w213; - alignas(16) static std::array w214; - alignas(16) static std::array w215; - alignas(16) static std::array w216; - alignas(16) static std::array w217; - alignas(16) static std::array w218; - alignas(16) static std::array w219; - alignas(16) static std::array w220; - alignas(16) static std::array w221; - alignas(16) static std::array w222; - alignas(16) static std::array w223; - alignas(16) static std::array w224; - alignas(16) static std::array w225; - alignas(16) static std::array w226; - alignas(16) static std::array w227; - alignas(16) static std::array w228; - alignas(16) static std::array w229; - alignas(16) static std::array w230; - alignas(16) static std::array w231; - alignas(16) static std::array w232; - alignas(16) static std::array w233; - alignas(16) static std::array w234; - alignas(16) static std::array w235; - alignas(16) static std::array w236; - alignas(16) static std::array w237; - alignas(16) static std::array w238; - alignas(16) static std::array w239; - alignas(16) static std::array w240; - alignas(16) static std::array w241; - alignas(16) static 
std::array w242; - alignas(16) static std::array w243; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); - std::generate(v0.begin(), v0.end(), std::ref(f32rng)); - std::generate(v1.begin(), v1.end(), std::ref(f32rng)); - std::generate(v2.begin(), v2.end(), std::ref(f32rng)); - std::generate(v3.begin(), v3.end(), std::ref(f32rng)); - std::generate(v4.begin(), v4.end(), std::ref(f32rng)); - std::generate(v5.begin(), v5.end(), std::ref(f32rng)); - std::generate(v6.begin(), v6.end(), std::ref(f32rng)); - std::generate(v7.begin(), v7.end(), std::ref(f32rng)); - std::generate(v8.begin(), v8.end(), std::ref(f32rng)); - std::generate(v9.begin(), v9.end(), std::ref(f32rng)); - std::generate(v10.begin(), v10.end(), std::ref(f32rng)); - std::generate(v11.begin(), v11.end(), std::ref(f32rng)); - std::generate(v12.begin(), v12.end(), std::ref(f32rng)); - std::generate(v13.begin(), v13.end(), std::ref(f32rng)); - std::generate(v14.begin(), v14.end(), std::ref(f32rng)); - std::generate(v15.begin(), v15.end(), std::ref(f32rng)); - std::generate(v16.begin(), v16.end(), std::ref(f32rng)); - std::generate(v17.begin(), v17.end(), std::ref(f32rng)); - std::generate(v18.begin(), v18.end(), std::ref(f32rng)); - std::generate(v19.begin(), v19.end(), std::ref(f32rng)); - std::generate(v20.begin(), v20.end(), std::ref(f32rng)); - std::generate(v21.begin(), v21.end(), std::ref(f32rng)); - std::generate(v22.begin(), v22.end(), std::ref(f32rng)); - std::generate(v23.begin(), v23.end(), std::ref(f32rng)); - std::generate(v24.begin(), v24.end(), std::ref(f32rng)); - std::generate(v25.begin(), v25.end(), std::ref(f32rng)); - std::generate(v26.begin(), v26.end(), std::ref(f32rng)); - std::generate(v27.begin(), v27.end(), std::ref(f32rng)); - std::generate(v28.begin(), v28.end(), std::ref(f32rng)); - std::generate(v29.begin(), v29.end(), std::ref(f32rng)); - 
std::generate(v30.begin(), v30.end(), std::ref(f32rng)); - std::generate(v31.begin(), v31.end(), std::ref(f32rng)); - std::generate(v32.begin(), v32.end(), std::ref(f32rng)); - std::generate(v33.begin(), v33.end(), std::ref(f32rng)); - std::generate(v34.begin(), v34.end(), std::ref(f32rng)); - std::generate(v35.begin(), v35.end(), std::ref(f32rng)); - std::generate(v36.begin(), v36.end(), std::ref(f32rng)); - std::generate(v37.begin(), v37.end(), std::ref(f32rng)); - std::generate(v38.begin(), v38.end(), std::ref(f32rng)); - std::generate(v39.begin(), v39.end(), std::ref(f32rng)); - std::generate(v40.begin(), v40.end(), std::ref(f32rng)); - std::generate(v41.begin(), v41.end(), std::ref(f32rng)); - std::generate(v42.begin(), v42.end(), std::ref(f32rng)); - std::generate(v43.begin(), v43.end(), std::ref(f32rng)); - std::generate(v44.begin(), v44.end(), std::ref(f32rng)); - std::generate(v45.begin(), v45.end(), std::ref(f32rng)); - std::generate(v46.begin(), v46.end(), std::ref(f32rng)); - std::generate(v47.begin(), v47.end(), std::ref(f32rng)); - std::generate(v48.begin(), v48.end(), std::ref(f32rng)); - std::generate(v49.begin(), v49.end(), std::ref(f32rng)); - std::generate(v50.begin(), v50.end(), std::ref(f32rng)); - std::generate(v51.begin(), v51.end(), std::ref(f32rng)); - std::generate(v52.begin(), v52.end(), std::ref(f32rng)); - std::generate(v53.begin(), v53.end(), std::ref(f32rng)); - std::generate(v54.begin(), v54.end(), std::ref(f32rng)); - std::generate(v55.begin(), v55.end(), std::ref(f32rng)); - std::generate(v56.begin(), v56.end(), std::ref(f32rng)); - std::generate(v57.begin(), v57.end(), std::ref(f32rng)); - std::generate(v58.begin(), v58.end(), std::ref(f32rng)); - std::generate(v59.begin(), v59.end(), std::ref(f32rng)); - std::generate(v60.begin(), v60.end(), std::ref(f32rng)); - std::generate(v61.begin(), v61.end(), std::ref(f32rng)); - std::generate(v62.begin(), v62.end(), std::ref(f32rng)); - std::generate(v63.begin(), v63.end(), 
std::ref(f32rng)); - std::generate(v64.begin(), v64.end(), std::ref(f32rng)); - std::generate(v65.begin(), v65.end(), std::ref(f32rng)); - std::generate(v66.begin(), v66.end(), std::ref(f32rng)); - std::generate(v67.begin(), v67.end(), std::ref(f32rng)); - std::generate(v68.begin(), v68.end(), std::ref(f32rng)); - std::generate(v69.begin(), v69.end(), std::ref(f32rng)); - std::generate(v70.begin(), v70.end(), std::ref(f32rng)); - std::generate(v71.begin(), v71.end(), std::ref(f32rng)); - std::generate(v72.begin(), v72.end(), std::ref(f32rng)); - std::generate(v73.begin(), v73.end(), std::ref(f32rng)); - std::generate(v74.begin(), v74.end(), std::ref(f32rng)); - std::generate(v75.begin(), v75.end(), std::ref(f32rng)); - std::generate(v76.begin(), v76.end(), std::ref(f32rng)); - std::generate(v77.begin(), v77.end(), std::ref(f32rng)); - std::generate(v78.begin(), v78.end(), std::ref(f32rng)); - std::generate(v79.begin(), v79.end(), std::ref(f32rng)); - std::generate(v80.begin(), v80.end(), std::ref(f32rng)); - std::generate(v81.begin(), v81.end(), std::ref(f32rng)); - std::generate(v82.begin(), v82.end(), std::ref(f32rng)); - std::generate(v83.begin(), v83.end(), std::ref(f32rng)); - std::generate(v84.begin(), v84.end(), std::ref(f32rng)); - std::generate(v85.begin(), v85.end(), std::ref(f32rng)); - std::generate(v86.begin(), v86.end(), std::ref(f32rng)); - std::generate(v87.begin(), v87.end(), std::ref(f32rng)); - std::generate(v88.begin(), v88.end(), std::ref(f32rng)); - std::generate(v89.begin(), v89.end(), std::ref(f32rng)); - std::generate(v90.begin(), v90.end(), std::ref(f32rng)); - std::generate(v91.begin(), v91.end(), std::ref(f32rng)); - std::generate(v92.begin(), v92.end(), std::ref(f32rng)); - std::generate(v93.begin(), v93.end(), std::ref(f32rng)); - std::generate(v94.begin(), v94.end(), std::ref(f32rng)); - std::generate(v95.begin(), v95.end(), std::ref(f32rng)); - std::generate(v96.begin(), v96.end(), std::ref(f32rng)); - std::generate(v97.begin(), 
v97.end(), std::ref(f32rng)); - std::generate(v98.begin(), v98.end(), std::ref(f32rng)); - std::generate(v99.begin(), v99.end(), std::ref(f32rng)); - std::generate(v100.begin(), v100.end(), std::ref(f32rng)); - std::generate(v101.begin(), v101.end(), std::ref(f32rng)); - std::generate(v102.begin(), v102.end(), std::ref(f32rng)); - std::generate(v103.begin(), v103.end(), std::ref(f32rng)); - std::generate(v104.begin(), v104.end(), std::ref(f32rng)); - std::generate(v105.begin(), v105.end(), std::ref(f32rng)); - std::generate(v106.begin(), v106.end(), std::ref(f32rng)); - std::generate(v107.begin(), v107.end(), std::ref(f32rng)); - std::generate(v108.begin(), v108.end(), std::ref(f32rng)); - std::generate(v109.begin(), v109.end(), std::ref(f32rng)); - std::generate(v110.begin(), v110.end(), std::ref(f32rng)); - std::generate(v111.begin(), v111.end(), std::ref(f32rng)); - std::generate(v112.begin(), v112.end(), std::ref(f32rng)); - std::generate(v113.begin(), v113.end(), std::ref(f32rng)); - std::generate(v114.begin(), v114.end(), std::ref(f32rng)); - std::generate(v115.begin(), v115.end(), std::ref(f32rng)); - std::generate(w116.begin(), w116.end(), std::ref(f32rng)); - std::generate(w117.begin(), w117.end(), std::ref(f32rng)); - std::generate(w118.begin(), w118.end(), std::ref(f32rng)); - std::generate(w119.begin(), w119.end(), std::ref(f32rng)); - std::generate(w120.begin(), w120.end(), std::ref(f32rng)); - std::generate(w121.begin(), w121.end(), std::ref(f32rng)); - std::generate(w122.begin(), w122.end(), std::ref(f32rng)); - std::generate(w123.begin(), w123.end(), std::ref(f32rng)); - std::generate(w124.begin(), w124.end(), std::ref(f32rng)); - std::generate(w125.begin(), w125.end(), std::ref(f32rng)); - std::generate(w126.begin(), w126.end(), std::ref(f32rng)); - std::generate(w127.begin(), w127.end(), std::ref(f32rng)); - std::generate(w128.begin(), w128.end(), std::ref(f32rng)); - std::generate(w129.begin(), w129.end(), std::ref(f32rng)); - 
std::generate(w130.begin(), w130.end(), std::ref(f32rng)); - std::generate(w131.begin(), w131.end(), std::ref(f32rng)); - std::generate(w132.begin(), w132.end(), std::ref(f32rng)); - std::generate(w133.begin(), w133.end(), std::ref(f32rng)); - std::generate(w134.begin(), w134.end(), std::ref(f32rng)); - std::generate(w135.begin(), w135.end(), std::ref(f32rng)); - std::generate(w136.begin(), w136.end(), std::ref(f32rng)); - std::generate(w137.begin(), w137.end(), std::ref(f32rng)); - std::generate(w138.begin(), w138.end(), std::ref(f32rng)); - std::generate(w139.begin(), w139.end(), std::ref(f32rng)); - std::generate(w140.begin(), w140.end(), std::ref(f32rng)); - std::generate(w141.begin(), w141.end(), std::ref(f32rng)); - std::generate(w142.begin(), w142.end(), std::ref(f32rng)); - std::generate(w143.begin(), w143.end(), std::ref(f32rng)); - std::generate(w144.begin(), w144.end(), std::ref(f32rng)); - std::generate(w145.begin(), w145.end(), std::ref(f32rng)); - std::generate(w146.begin(), w146.end(), std::ref(f32rng)); - std::generate(w147.begin(), w147.end(), std::ref(f32rng)); - std::generate(w148.begin(), w148.end(), std::ref(f32rng)); - std::generate(w149.begin(), w149.end(), std::ref(f32rng)); - std::generate(w150.begin(), w150.end(), std::ref(f32rng)); - std::generate(w151.begin(), w151.end(), std::ref(f32rng)); - std::generate(w152.begin(), w152.end(), std::ref(f32rng)); - std::generate(w153.begin(), w153.end(), std::ref(f32rng)); - std::generate(w154.begin(), w154.end(), std::ref(f32rng)); - std::generate(w155.begin(), w155.end(), std::ref(f32rng)); - std::generate(w156.begin(), w156.end(), std::ref(f32rng)); - std::generate(w157.begin(), w157.end(), std::ref(f32rng)); - std::generate(w158.begin(), w158.end(), std::ref(f32rng)); - std::generate(w159.begin(), w159.end(), std::ref(f32rng)); - std::generate(w160.begin(), w160.end(), std::ref(f32rng)); - std::generate(w161.begin(), w161.end(), std::ref(f32rng)); - std::generate(w162.begin(), w162.end(), 
std::ref(f32rng)); - std::generate(w163.begin(), w163.end(), std::ref(f32rng)); - std::generate(w164.begin(), w164.end(), std::ref(f32rng)); - std::generate(w165.begin(), w165.end(), std::ref(f32rng)); - std::generate(w166.begin(), w166.end(), std::ref(f32rng)); - std::generate(w167.begin(), w167.end(), std::ref(f32rng)); - std::generate(w168.begin(), w168.end(), std::ref(f32rng)); - std::generate(w169.begin(), w169.end(), std::ref(f32rng)); - std::generate(w170.begin(), w170.end(), std::ref(f32rng)); - std::generate(w171.begin(), w171.end(), std::ref(f32rng)); - std::generate(w172.begin(), w172.end(), std::ref(f32rng)); - std::generate(w173.begin(), w173.end(), std::ref(f32rng)); - std::generate(w174.begin(), w174.end(), std::ref(f32rng)); - std::generate(w175.begin(), w175.end(), std::ref(f32rng)); - std::generate(w176.begin(), w176.end(), std::ref(f32rng)); - std::generate(w177.begin(), w177.end(), std::ref(f32rng)); - std::generate(w178.begin(), w178.end(), std::ref(f32rng)); - std::generate(w179.begin(), w179.end(), std::ref(f32rng)); - std::generate(w180.begin(), w180.end(), std::ref(f32rng)); - std::generate(w181.begin(), w181.end(), std::ref(f32rng)); - std::generate(w182.begin(), w182.end(), std::ref(f32rng)); - std::generate(w183.begin(), w183.end(), std::ref(f32rng)); - std::generate(w184.begin(), w184.end(), std::ref(f32rng)); - std::generate(w185.begin(), w185.end(), std::ref(f32rng)); - std::generate(w186.begin(), w186.end(), std::ref(f32rng)); - std::generate(w187.begin(), w187.end(), std::ref(f32rng)); - std::generate(w188.begin(), w188.end(), std::ref(f32rng)); - std::generate(w189.begin(), w189.end(), std::ref(f32rng)); - std::generate(w190.begin(), w190.end(), std::ref(f32rng)); - std::generate(w191.begin(), w191.end(), std::ref(f32rng)); - std::generate(w192.begin(), w192.end(), std::ref(f32rng)); - std::generate(w193.begin(), w193.end(), std::ref(f32rng)); - std::generate(w194.begin(), w194.end(), std::ref(f32rng)); - 
std::generate(w195.begin(), w195.end(), std::ref(f32rng)); - std::generate(w196.begin(), w196.end(), std::ref(f32rng)); - std::generate(w197.begin(), w197.end(), std::ref(f32rng)); - std::generate(w198.begin(), w198.end(), std::ref(f32rng)); - std::generate(w199.begin(), w199.end(), std::ref(f32rng)); - std::generate(w200.begin(), w200.end(), std::ref(f32rng)); - std::generate(w201.begin(), w201.end(), std::ref(f32rng)); - std::generate(w202.begin(), w202.end(), std::ref(f32rng)); - std::generate(w203.begin(), w203.end(), std::ref(f32rng)); - std::generate(w204.begin(), w204.end(), std::ref(f32rng)); - std::generate(w205.begin(), w205.end(), std::ref(f32rng)); - std::generate(w206.begin(), w206.end(), std::ref(f32rng)); - std::generate(w207.begin(), w207.end(), std::ref(f32rng)); - std::generate(w208.begin(), w208.end(), std::ref(f32rng)); - std::generate(w209.begin(), w209.end(), std::ref(f32rng)); - std::generate(w210.begin(), w210.end(), std::ref(f32rng)); - std::generate(w211.begin(), w211.end(), std::ref(f32rng)); - std::generate(w212.begin(), w212.end(), std::ref(f32rng)); - std::generate(w213.begin(), w213.end(), std::ref(f32rng)); - std::generate(w214.begin(), w214.end(), std::ref(f32rng)); - std::generate(w215.begin(), w215.end(), std::ref(f32rng)); - std::generate(w216.begin(), w216.end(), std::ref(f32rng)); - std::generate(w217.begin(), w217.end(), std::ref(f32rng)); - std::generate(w218.begin(), w218.end(), std::ref(f32rng)); - std::generate(w219.begin(), w219.end(), std::ref(f32rng)); - std::generate(w220.begin(), w220.end(), std::ref(f32rng)); - std::generate(w221.begin(), w221.end(), std::ref(f32rng)); - std::generate(w222.begin(), w222.end(), std::ref(f32rng)); - std::generate(w223.begin(), w223.end(), std::ref(f32rng)); - std::generate(w224.begin(), w224.end(), std::ref(f32rng)); - std::generate(w225.begin(), w225.end(), std::ref(f32rng)); - std::generate(w226.begin(), w226.end(), std::ref(f32rng)); - std::generate(w227.begin(), w227.end(), 
std::ref(f32rng)); - std::generate(w228.begin(), w228.end(), std::ref(f32rng)); - std::generate(w229.begin(), w229.end(), std::ref(f32rng)); - std::generate(w230.begin(), w230.end(), std::ref(f32rng)); - std::generate(w231.begin(), w231.end(), std::ref(f32rng)); - std::generate(w232.begin(), w232.end(), std::ref(f32rng)); - std::generate(w233.begin(), w233.end(), std::ref(f32rng)); - std::generate(w234.begin(), w234.end(), std::ref(f32rng)); - std::generate(w235.begin(), w235.end(), std::ref(f32rng)); - std::generate(w236.begin(), w236.end(), std::ref(f32rng)); - std::generate(w237.begin(), w237.end(), std::ref(f32rng)); - std::generate(w238.begin(), w238.end(), std::ref(f32rng)); - std::generate(w239.begin(), w239.end(), std::ref(f32rng)); - std::generate(w240.begin(), w240.end(), std::ref(f32rng)); - std::generate(w241.begin(), w241.end(), std::ref(f32rng)); - std::generate(w242.begin(), w242.end(), std::ref(f32rng)); - std::generate(w243.begin(), w243.end(), std::ref(f32rng)); - - Operators operators; - xnn_status status; - xnn_code_cache* code_cache_ptr = nullptr; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/3, - /*group_output_channels=*/16, - /*input_channel_stride=*/3, - /*output_channel_stride=*/16, - /*kernel=*/w116.data(), /*bias=*/w117.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, 
xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/16, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/16, - /*output_channel_stride=*/16, - /*kernel=*/w118.data(), /*bias=*/w119.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/16, - /*group_output_channels=*/16, - /*input_channel_stride=*/16, - /*output_channel_stride=*/16, - /*kernel=*/w120.data(), /*bias=*/w121.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } 
- operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/16, - /*group_output_channels=*/64, - /*input_channel_stride=*/16, - /*output_channel_stride=*/64, - /*kernel=*/w122.data(), /*bias=*/w123.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/64, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/64, - /*output_channel_stride=*/64, - /*kernel=*/w124.data(), /*bias=*/w125.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op6); - if (status != 
xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/64, - /*group_output_channels=*/24, - /*input_channel_stride=*/64, - /*output_channel_stride=*/24, - /*kernel=*/w126.data(), /*bias=*/w127.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/24, - /*group_output_channels=*/72, - /*input_channel_stride=*/24, - /*output_channel_stride=*/72, - /*kernel=*/w128.data(), /*bias=*/w129.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - 
/*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/72, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/72, - /*output_channel_stride=*/72, - /*kernel=*/w130.data(), /*bias=*/w131.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/72, - /*group_output_channels=*/24, - /*input_channel_stride=*/72, - /*output_channel_stride=*/24, - /*kernel=*/w132.data(), /*bias=*/w133.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } 
- operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/24, - /*group_output_channels=*/72, - /*input_channel_stride=*/24, - /*output_channel_stride=*/72, - /*kernel=*/w134.data(), /*bias=*/w135.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/1, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/72, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/72, - /*output_channel_stride=*/72, - /*kernel=*/w136.data(), /*bias=*/w137.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op14); - if (status != 
xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/72, - /*group_output_channels=*/24, - /*input_channel_stride=*/72, - /*output_channel_stride=*/24, - /*kernel=*/w138.data(), /*bias=*/w139.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/24, - /*group_output_channels=*/72, - /*input_channel_stride=*/24, - /*output_channel_stride=*/72, - /*kernel=*/w140.data(), /*bias=*/w141.data(), - /*output_min=*/0.0f, /*output_max=*/+0x1.00014Fp+0, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output 
min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/72, - /*group_output_channels=*/40, - /*input_channel_stride=*/72, - /*output_channel_stride=*/40, - /*kernel=*/w142.data(), /*bias=*/w143.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/40, - /*group_output_channels=*/120, - /*input_channel_stride=*/40, - /*output_channel_stride=*/120, - /*kernel=*/w144.data(), /*bias=*/w145.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - 
operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/2, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/2, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/120, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/120, - /*output_channel_stride=*/120, - /*kernel=*/w146.data(), /*bias=*/w147.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/120, - /*group_output_channels=*/32, - /*input_channel_stride=*/120, - /*output_channel_stride=*/32, - /*kernel=*/w148.data(), /*bias=*/w149.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op22); - if (status != 
xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/32, - /*group_output_channels=*/120, - /*input_channel_stride=*/32, - /*output_channel_stride=*/120, - /*kernel=*/w150.data(), /*bias=*/w151.data(), - /*output_min=*/0.0f, /*output_max=*/+0x1.00014Fp+0, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/120, - /*group_output_channels=*/40, - /*input_channel_stride=*/120, - /*output_channel_stride=*/40, - /*kernel=*/w152.data(), /*bias=*/w153.data(), - /*output_min=*/-std::numeric_limits::infinity(), 
/*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/40, - /*group_output_channels=*/120, - /*input_channel_stride=*/40, - /*output_channel_stride=*/120, - /*kernel=*/w154.data(), /*bias=*/w155.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/2, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/2, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/120, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - 
/*input_channel_stride=*/120, - /*output_channel_stride=*/120, - /*kernel=*/w156.data(), /*bias=*/w157.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - xnn_operator_t op29 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op29); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #29" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op29, xnn_delete_operator); - - xnn_operator_t op30 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/120, - /*group_output_channels=*/32, - /*input_channel_stride=*/120, - /*output_channel_stride=*/32, - /*kernel=*/w158.data(), /*bias=*/w159.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op30); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #30" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op30, xnn_delete_operator); - - xnn_operator_t op31 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, 
/*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/32, - /*group_output_channels=*/120, - /*input_channel_stride=*/32, - /*output_channel_stride=*/120, - /*kernel=*/w160.data(), /*bias=*/w161.data(), - /*output_min=*/0.0f, /*output_max=*/+0x1.00014Fp+0, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op31); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #31" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op31, xnn_delete_operator); - - xnn_operator_t op32 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op32); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #32" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op32, xnn_delete_operator); - - xnn_operator_t op33 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/120, - /*group_output_channels=*/40, - /*input_channel_stride=*/120, - /*output_channel_stride=*/40, - /*kernel=*/w162.data(), /*bias=*/w163.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op33); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #33" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op33, xnn_delete_operator); - - xnn_operator_t op34 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* 
flags */, - &op34); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #34" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op34, xnn_delete_operator); - - xnn_operator_t op35 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/40, - /*group_output_channels=*/240, - /*input_channel_stride=*/40, - /*output_channel_stride=*/240, - /*kernel=*/w164.data(), /*bias=*/w165.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op35); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #35" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op35, xnn_delete_operator); - - xnn_operator_t op36 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op36); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #36" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op36, xnn_delete_operator); - - xnn_operator_t op37 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/240, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/240, - /*output_channel_stride=*/240, - /*kernel=*/w166.data(), /*bias=*/w167.data(), - /*output_min=*/-std::numeric_limits::infinity(), 
/*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op37); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #37" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op37, xnn_delete_operator); - - xnn_operator_t op38 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op38); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #38" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op38, xnn_delete_operator); - - xnn_operator_t op39 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/240, - /*group_output_channels=*/80, - /*input_channel_stride=*/240, - /*output_channel_stride=*/80, - /*kernel=*/w168.data(), /*bias=*/w169.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op39); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #39" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op39, xnn_delete_operator); - - xnn_operator_t op40 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/80, - /*group_output_channels=*/200, - /*input_channel_stride=*/80, - /*output_channel_stride=*/200, - 
/*kernel=*/w170.data(), /*bias=*/w171.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op40); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #40" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op40, xnn_delete_operator); - - xnn_operator_t op41 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op41); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #41" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op41, xnn_delete_operator); - - xnn_operator_t op42 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/200, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/200, - /*output_channel_stride=*/200, - /*kernel=*/w172.data(), /*bias=*/w173.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op42); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #42" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op42, xnn_delete_operator); - - xnn_operator_t op43 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op43); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #43" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op43, xnn_delete_operator); - - xnn_operator_t op44 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, 
/*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/200, - /*group_output_channels=*/80, - /*input_channel_stride=*/200, - /*output_channel_stride=*/80, - /*kernel=*/w174.data(), /*bias=*/w175.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op44); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #44" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op44, xnn_delete_operator); - - xnn_operator_t op45 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op45); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #45" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op45, xnn_delete_operator); - - xnn_operator_t op46 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/80, - /*group_output_channels=*/184, - /*input_channel_stride=*/80, - /*output_channel_stride=*/184, - /*kernel=*/w176.data(), /*bias=*/w177.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op46); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #46" << std::endl; - return 
ExecutionPlan(); - } - operators.emplace_back(op46, xnn_delete_operator); - - xnn_operator_t op47 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op47); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #47" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op47, xnn_delete_operator); - - xnn_operator_t op48 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/184, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/184, - /*output_channel_stride=*/184, - /*kernel=*/w178.data(), /*bias=*/w179.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op48); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #48" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op48, xnn_delete_operator); - - xnn_operator_t op49 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op49); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #49" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op49, xnn_delete_operator); - - xnn_operator_t op50 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/184, - /*group_output_channels=*/80, - /*input_channel_stride=*/184, - 
/*output_channel_stride=*/80, - /*kernel=*/w180.data(), /*bias=*/w181.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op50); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #50" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op50, xnn_delete_operator); - - xnn_operator_t op51 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op51); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #51" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op51, xnn_delete_operator); - - xnn_operator_t op52 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/80, - /*group_output_channels=*/184, - /*input_channel_stride=*/80, - /*output_channel_stride=*/184, - /*kernel=*/w182.data(), /*bias=*/w183.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op52); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #52" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op52, xnn_delete_operator); - - xnn_operator_t op53 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op53); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #53" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op53, 
xnn_delete_operator); - - xnn_operator_t op54 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/184, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/184, - /*output_channel_stride=*/184, - /*kernel=*/w184.data(), /*bias=*/w185.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op54); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #54" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op54, xnn_delete_operator); - - xnn_operator_t op55 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op55); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #55" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op55, xnn_delete_operator); - - xnn_operator_t op56 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/184, - /*group_output_channels=*/80, - /*input_channel_stride=*/184, - /*output_channel_stride=*/80, - /*kernel=*/w186.data(), /*bias=*/w187.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op56); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #56" 
<< std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op56, xnn_delete_operator); - - xnn_operator_t op57 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op57); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #57" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op57, xnn_delete_operator); - - xnn_operator_t op58 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/80, - /*group_output_channels=*/480, - /*input_channel_stride=*/80, - /*output_channel_stride=*/480, - /*kernel=*/w188.data(), /*bias=*/w189.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op58); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #58" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op58, xnn_delete_operator); - - xnn_operator_t op59 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op59); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #59" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op59, xnn_delete_operator); - - xnn_operator_t op60 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - 
/*groups=*/480, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/480, - /*output_channel_stride=*/480, - /*kernel=*/w190.data(), /*bias=*/w191.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op60); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #60" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op60, xnn_delete_operator); - - xnn_operator_t op61 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op61); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #61" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op61, xnn_delete_operator); - - xnn_operator_t op62 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op62); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #62" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op62, xnn_delete_operator); - - xnn_operator_t op63 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/480, - /*group_output_channels=*/120, - /*input_channel_stride=*/480, - /*output_channel_stride=*/120, - /*kernel=*/w192.data(), /*bias=*/w193.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op63); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #63" << std::endl; - 
return ExecutionPlan(); - } - operators.emplace_back(op63, xnn_delete_operator); - - xnn_operator_t op64 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/120, - /*group_output_channels=*/480, - /*input_channel_stride=*/120, - /*output_channel_stride=*/480, - /*kernel=*/w194.data(), /*bias=*/w195.data(), - /*output_min=*/0.0f, /*output_max=*/+0x1.00014Fp+0, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op64); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #64" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op64, xnn_delete_operator); - - xnn_operator_t op65 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op65); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #65" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op65, xnn_delete_operator); - - xnn_operator_t op66 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/480, - /*group_output_channels=*/112, - /*input_channel_stride=*/480, - /*output_channel_stride=*/112, - /*kernel=*/w196.data(), /*bias=*/w197.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - 
/*weights_cache=*/nullptr, - &op66); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #66" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op66, xnn_delete_operator); - - xnn_operator_t op67 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/112, - /*group_output_channels=*/672, - /*input_channel_stride=*/112, - /*output_channel_stride=*/672, - /*kernel=*/w198.data(), /*bias=*/w199.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op67); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #67" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op67, xnn_delete_operator); - - xnn_operator_t op68 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op68); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #68" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op68, xnn_delete_operator); - - xnn_operator_t op69 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/672, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/672, - /*output_channel_stride=*/672, - /*kernel=*/w200.data(), /*bias=*/w201.data(), - /*output_min=*/-std::numeric_limits::infinity(), 
/*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op69); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #69" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op69, xnn_delete_operator); - - xnn_operator_t op70 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op70); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #70" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op70, xnn_delete_operator); - - xnn_operator_t op71 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op71); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #71" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op71, xnn_delete_operator); - - xnn_operator_t op72 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/672, - /*group_output_channels=*/168, - /*input_channel_stride=*/672, - /*output_channel_stride=*/168, - /*kernel=*/w202.data(), /*bias=*/w203.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op72); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #72" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op72, xnn_delete_operator); - - xnn_operator_t op73 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - 
/*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/168, - /*group_output_channels=*/672, - /*input_channel_stride=*/168, - /*output_channel_stride=*/672, - /*kernel=*/w204.data(), /*bias=*/w205.data(), - /*output_min=*/0.0f, /*output_max=*/+0x1.00014Fp+0, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op73); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #73" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op73, xnn_delete_operator); - - xnn_operator_t op74 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op74); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #74" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op74, xnn_delete_operator); - - xnn_operator_t op75 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/672, - /*group_output_channels=*/112, - /*input_channel_stride=*/672, - /*output_channel_stride=*/112, - /*kernel=*/w206.data(), /*bias=*/w207.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op75); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #75" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op75, xnn_delete_operator); 
- - xnn_operator_t op76 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op76); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #76" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op76, xnn_delete_operator); - - xnn_operator_t op77 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/112, - /*group_output_channels=*/672, - /*input_channel_stride=*/112, - /*output_channel_stride=*/672, - /*kernel=*/w208.data(), /*bias=*/w209.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op77); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #77" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op77, xnn_delete_operator); - - xnn_operator_t op78 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op78); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #78" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op78, xnn_delete_operator); - - xnn_operator_t op79 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/1, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/672, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - 
/*input_channel_stride=*/672, - /*output_channel_stride=*/672, - /*kernel=*/w210.data(), /*bias=*/w211.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op79); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #79" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op79, xnn_delete_operator); - - xnn_operator_t op80 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op80); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #80" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op80, xnn_delete_operator); - - xnn_operator_t op81 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op81); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #81" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op81, xnn_delete_operator); - - xnn_operator_t op82 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/672, - /*group_output_channels=*/168, - /*input_channel_stride=*/672, - /*output_channel_stride=*/168, - /*kernel=*/w212.data(), /*bias=*/w213.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op82); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #82" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op82, 
xnn_delete_operator); - - xnn_operator_t op83 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/168, - /*group_output_channels=*/672, - /*input_channel_stride=*/168, - /*output_channel_stride=*/672, - /*kernel=*/w214.data(), /*bias=*/w215.data(), - /*output_min=*/0.0f, /*output_max=*/+0x1.00014Fp+0, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op83); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #83" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op83, xnn_delete_operator); - - xnn_operator_t op84 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op84); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #84" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op84, xnn_delete_operator); - - xnn_operator_t op85 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/672, - /*group_output_channels=*/160, - /*input_channel_stride=*/672, - /*output_channel_stride=*/160, - /*kernel=*/w216.data(), /*bias=*/w217.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op85); - if (status != 
xnn_status_success) { - std::cerr << "failed to create operation #85" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op85, xnn_delete_operator); - - xnn_operator_t op86 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/160, - /*group_output_channels=*/960, - /*input_channel_stride=*/160, - /*output_channel_stride=*/960, - /*kernel=*/w218.data(), /*bias=*/w219.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op86); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #86" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op86, xnn_delete_operator); - - xnn_operator_t op87 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op87); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #87" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op87, xnn_delete_operator); - - xnn_operator_t op88 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/2, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/2, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/960, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/960, - /*output_channel_stride=*/960, - /*kernel=*/w220.data(), /*bias=*/w221.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - 
/*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op88); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #88" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op88, xnn_delete_operator); - - xnn_operator_t op89 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op89); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #89" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op89, xnn_delete_operator); - - xnn_operator_t op90 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op90); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #90" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op90, xnn_delete_operator); - - xnn_operator_t op91 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/960, - /*group_output_channels=*/240, - /*input_channel_stride=*/960, - /*output_channel_stride=*/240, - /*kernel=*/w222.data(), /*bias=*/w223.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op91); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #91" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op91, xnn_delete_operator); - - xnn_operator_t op92 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, 
/*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/240, - /*group_output_channels=*/960, - /*input_channel_stride=*/240, - /*output_channel_stride=*/960, - /*kernel=*/w224.data(), /*bias=*/w225.data(), - /*output_min=*/0.0f, /*output_max=*/+0x1.00014Fp+0, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op92); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #92" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op92, xnn_delete_operator); - - xnn_operator_t op93 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op93); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #93" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op93, xnn_delete_operator); - - xnn_operator_t op94 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/960, - /*group_output_channels=*/160, - /*input_channel_stride=*/960, - /*output_channel_stride=*/160, - /*kernel=*/w226.data(), /*bias=*/w227.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op94); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #94" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op94, xnn_delete_operator); - - xnn_operator_t op95 = nullptr; - status = xnn_create_add_nd_f32( - 
-std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op95); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #95" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op95, xnn_delete_operator); - - xnn_operator_t op96 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/160, - /*group_output_channels=*/960, - /*input_channel_stride=*/160, - /*output_channel_stride=*/960, - /*kernel=*/w228.data(), /*bias=*/w229.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op96); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #96" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op96, xnn_delete_operator); - - xnn_operator_t op97 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op97); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #97" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op97, xnn_delete_operator); - - xnn_operator_t op98 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/2, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/2, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/960, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/960, - /*output_channel_stride=*/960, - /*kernel=*/w230.data(), 
/*bias=*/w231.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op98); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #98" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op98, xnn_delete_operator); - - xnn_operator_t op99 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op99); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #99" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op99, xnn_delete_operator); - - xnn_operator_t op100 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op100); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #100" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op100, xnn_delete_operator); - - xnn_operator_t op101 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/960, - /*group_output_channels=*/240, - /*input_channel_stride=*/960, - /*output_channel_stride=*/240, - /*kernel=*/w232.data(), /*bias=*/w233.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op101); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #101" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op101, xnn_delete_operator); - - xnn_operator_t op102 = nullptr; - status = 
xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/240, - /*group_output_channels=*/960, - /*input_channel_stride=*/240, - /*output_channel_stride=*/960, - /*kernel=*/w234.data(), /*bias=*/w235.data(), - /*output_min=*/0.0f, /*output_max=*/+0x1.00014Fp+0, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op102); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #102" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op102, xnn_delete_operator); - - xnn_operator_t op103 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op103); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #103" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op103, xnn_delete_operator); - - xnn_operator_t op104 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/960, - /*group_output_channels=*/160, - /*input_channel_stride=*/960, - /*output_channel_stride=*/160, - /*kernel=*/w236.data(), /*bias=*/w237.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op104); - if (status != xnn_status_success) { - std::cerr << "failed to create operation 
#104" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op104, xnn_delete_operator); - - xnn_operator_t op105 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op105); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #105" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op105, xnn_delete_operator); - - xnn_operator_t op106 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/160, - /*group_output_channels=*/960, - /*input_channel_stride=*/160, - /*output_channel_stride=*/960, - /*kernel=*/w238.data(), /*bias=*/w239.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op106); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #106" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op106, xnn_delete_operator); - - xnn_operator_t op107 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op107); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #107" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op107, xnn_delete_operator); - - xnn_operator_t op108 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op108); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #108" << std::endl; - return ExecutionPlan(); - } - 
operators.emplace_back(op108, xnn_delete_operator); - - xnn_operator_t op109 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/960, - /*group_output_channels=*/1280, - /*input_channel_stride=*/960, - /*output_channel_stride=*/1280, - /*kernel=*/w240.data(), /*bias=*/w241.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op109); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #109" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op109, xnn_delete_operator); - - xnn_operator_t op110 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op110); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #110" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op110, xnn_delete_operator); - - xnn_operator_t op111 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op111); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #111" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op111, xnn_delete_operator); - - xnn_operator_t op112 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/1280, 
- /*group_output_channels=*/1001, - /*input_channel_stride=*/1280, - /*output_channel_stride=*/1001, - /*kernel=*/w242.data(), /*bias=*/w243.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op112); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #112" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op112, xnn_delete_operator); - - xnn_operator_t op113 = nullptr; - status = xnn_create_copy_nc_x32( - 0 /* flags */, - &op113); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #113" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op113, xnn_delete_operator); - - xnn_operator_t op114 = nullptr; - status = xnn_create_softmax_nc_f32( - /*flags=*/0, - &op114); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #114" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op114, xnn_delete_operator); - - size_t op0_workspace_size = 0; - size_t op0_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - &op0_workspace_size, &op0_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op0_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op1, - /*batch_size=*/12544, - 16 /* channels */, - 16 /* input stride */, - 16 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - size_t op2_workspace_size = 0; - size_t op2_workspace_alignment 
= 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op2_workspace_size, &op2_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op2_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #2" << std::endl; - return ExecutionPlan(); - } - - size_t op3_workspace_size = 0; - size_t op3_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op3, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op3_workspace_size, &op3_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op3_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 112, 112, 16 }; - const size_t b_shape[] = { 1, 112, 112, 16 }; - status = xnn_reshape_add_nd_f32( - op4, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - size_t op5_workspace_size = 0; - size_t op5_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op5, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op5_workspace_size, &op5_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op5_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - size_t op6_workspace_size = 0; - size_t op6_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op6, - 
/*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op6_workspace_size, &op6_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op6_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - size_t op7_workspace_size = 0; - size_t op7_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op7_workspace_size, &op7_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op7_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - size_t op8_workspace_size = 0; - size_t op8_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op8, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op8_workspace_size, &op8_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op8_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - size_t op9_workspace_size = 0; - size_t op9_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op9, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op9_workspace_size, &op9_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op9_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return 
ExecutionPlan(); - } - - size_t op10_workspace_size = 0; - size_t op10_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op10, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op10_workspace_size, &op10_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op10_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 56, 56, 24 }; - const size_t b_shape[] = { 1, 56, 56, 24 }; - status = xnn_reshape_add_nd_f32( - op11, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - size_t op12_workspace_size = 0; - size_t op12_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op12, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op12_workspace_size, &op12_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op12_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - size_t op13_workspace_size = 0; - size_t op13_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op13, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op13_workspace_size, &op13_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op13_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - size_t 
op14_workspace_size = 0; - size_t op14_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op14, - /*batch_size=*/1, 784 /* width */, - 72 /* channels */, 72 /* input stride */, 72 /* output stride */, - &op14_workspace_size, &op14_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op14_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - size_t op15_workspace_size = 0; - size_t op15_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op15, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op15_workspace_size, &op15_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op15_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - size_t op16_workspace_size = 0; - size_t op16_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op16, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op16_workspace_size, &op16_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op16_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 72 }; - const size_t b_shape[] = { 1, 1, 1, 72 }; - status = xnn_reshape_multiply_nd_f32( - op17, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - size_t op18_workspace_size = 0; - size_t op18_workspace_alignment 
= 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op18, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op18_workspace_size, &op18_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op18_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - size_t op19_workspace_size = 0; - size_t op19_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op19, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op19_workspace_size, &op19_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op19_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - size_t op20_workspace_size = 0; - size_t op20_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op20, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op20_workspace_size, &op20_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op20_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - size_t op21_workspace_size = 0; - size_t op21_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op21, - /*batch_size=*/1, 784 /* width */, - 120 /* channels */, 120 /* input stride */, 120 /* output stride */, - &op21_workspace_size, &op21_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op21_workspace_size); - if (status != xnn_status_success) { - 
std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - size_t op22_workspace_size = 0; - size_t op22_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op22, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op22_workspace_size, &op22_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op22_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - size_t op23_workspace_size = 0; - size_t op23_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op23, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op23_workspace_size, &op23_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op23_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 120 }; - const size_t b_shape[] = { 1, 1, 1, 120 }; - status = xnn_reshape_multiply_nd_f32( - op24, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - size_t op25_workspace_size = 0; - size_t op25_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op25, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op25_workspace_size, &op25_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op25_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" 
<< std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 40 }; - const size_t b_shape[] = { 1, 28, 28, 40 }; - status = xnn_reshape_add_nd_f32( - op26, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - size_t op27_workspace_size = 0; - size_t op27_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op27, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op27_workspace_size, &op27_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op27_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - size_t op28_workspace_size = 0; - size_t op28_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op28, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op28_workspace_size, &op28_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op28_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - size_t op29_workspace_size = 0; - size_t op29_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op29, - /*batch_size=*/1, 784 /* width */, - 120 /* channels */, 120 /* input stride */, 120 /* output stride */, - &op29_workspace_size, &op29_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op29_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #29" << std::endl; - return ExecutionPlan(); - } - - size_t 
op30_workspace_size = 0; - size_t op30_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op30, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op30_workspace_size, &op30_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op30_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #30" << std::endl; - return ExecutionPlan(); - } - - size_t op31_workspace_size = 0; - size_t op31_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op31, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op31_workspace_size, &op31_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op31_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #31" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 120 }; - const size_t b_shape[] = { 1, 1, 1, 120 }; - status = xnn_reshape_multiply_nd_f32( - op32, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #32" << std::endl; - return ExecutionPlan(); - } - - size_t op33_workspace_size = 0; - size_t op33_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op33, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op33_workspace_size, &op33_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op33_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #33" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 40 }; - 
const size_t b_shape[] = { 1, 28, 28, 40 }; - status = xnn_reshape_add_nd_f32( - op34, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #34" << std::endl; - return ExecutionPlan(); - } - - size_t op35_workspace_size = 0; - size_t op35_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op35, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op35_workspace_size, &op35_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op35_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op36, - /*batch_size=*/784, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #36" << std::endl; - return ExecutionPlan(); - } - - size_t op37_workspace_size = 0; - size_t op37_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op37, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op37_workspace_size, &op37_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op37_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op38, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #38" << std::endl; - return ExecutionPlan(); - } - - size_t 
op39_workspace_size = 0; - size_t op39_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op39, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op39_workspace_size, &op39_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op39_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #39" << std::endl; - return ExecutionPlan(); - } - - size_t op40_workspace_size = 0; - size_t op40_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op40, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op40_workspace_size, &op40_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op40_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op41, - /*batch_size=*/196, - 200 /* channels */, - 200 /* input stride */, - 200 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #41" << std::endl; - return ExecutionPlan(); - } - - size_t op42_workspace_size = 0; - size_t op42_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op42, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op42_workspace_size, &op42_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op42_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op43, - /*batch_size=*/196, - 200 /* 
channels */, - 200 /* input stride */, - 200 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #43" << std::endl; - return ExecutionPlan(); - } - - size_t op44_workspace_size = 0; - size_t op44_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op44, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op44_workspace_size, &op44_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op44_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #44" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 80 }; - const size_t b_shape[] = { 1, 14, 14, 80 }; - status = xnn_reshape_add_nd_f32( - op45, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #45" << std::endl; - return ExecutionPlan(); - } - - size_t op46_workspace_size = 0; - size_t op46_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op46, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op46_workspace_size, &op46_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op46_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op47, - /*batch_size=*/196, - 184 /* channels */, - 184 /* input stride */, - 184 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #47" << std::endl; - return ExecutionPlan(); - } - - size_t op48_workspace_size = 0; - size_t 
op48_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op48, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op48_workspace_size, &op48_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op48_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op49, - /*batch_size=*/196, - 184 /* channels */, - 184 /* input stride */, - 184 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #49" << std::endl; - return ExecutionPlan(); - } - - size_t op50_workspace_size = 0; - size_t op50_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op50, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op50_workspace_size, &op50_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op50_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #50" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 80 }; - const size_t b_shape[] = { 1, 14, 14, 80 }; - status = xnn_reshape_add_nd_f32( - op51, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #51" << std::endl; - return ExecutionPlan(); - } - - size_t op52_workspace_size = 0; - size_t op52_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op52, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op52_workspace_size, &op52_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - 
/*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op52_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op53, - /*batch_size=*/196, - 184 /* channels */, - 184 /* input stride */, - 184 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #53" << std::endl; - return ExecutionPlan(); - } - - size_t op54_workspace_size = 0; - size_t op54_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op54, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op54_workspace_size, &op54_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op54_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op55, - /*batch_size=*/196, - 184 /* channels */, - 184 /* input stride */, - 184 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #55" << std::endl; - return ExecutionPlan(); - } - - size_t op56_workspace_size = 0; - size_t op56_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op56, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op56_workspace_size, &op56_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op56_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #56" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 80 }; - const size_t 
b_shape[] = { 1, 14, 14, 80 }; - status = xnn_reshape_add_nd_f32( - op57, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #57" << std::endl; - return ExecutionPlan(); - } - - size_t op58_workspace_size = 0; - size_t op58_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op58, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op58_workspace_size, &op58_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op58_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op59, - /*batch_size=*/196, - 480 /* channels */, - 480 /* input stride */, - 480 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #59" << std::endl; - return ExecutionPlan(); - } - - size_t op60_workspace_size = 0; - size_t op60_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op60, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op60_workspace_size, &op60_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op60_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op61, - /*batch_size=*/196, - 480 /* channels */, - 480 /* input stride */, - 480 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #61" << std::endl; - return ExecutionPlan(); - } - - size_t op62_workspace_size = 0; 
- size_t op62_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op62, - /*batch_size=*/1, 196 /* width */, - 480 /* channels */, 480 /* input stride */, 480 /* output stride */, - &op62_workspace_size, &op62_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op62_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #62" << std::endl; - return ExecutionPlan(); - } - - size_t op63_workspace_size = 0; - size_t op63_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op63, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op63_workspace_size, &op63_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op63_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #63" << std::endl; - return ExecutionPlan(); - } - - size_t op64_workspace_size = 0; - size_t op64_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op64, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op64_workspace_size, &op64_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op64_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #64" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 480 }; - const size_t b_shape[] = { 1, 1, 1, 480 }; - status = xnn_reshape_multiply_nd_f32( - op65, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #65" << std::endl; - return ExecutionPlan(); - } - - size_t op66_workspace_size = 0; - size_t op66_workspace_alignment = 0; - status = 
xnn_reshape_convolution2d_nhwc_f32( - op66, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op66_workspace_size, &op66_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op66_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #66" << std::endl; - return ExecutionPlan(); - } - - size_t op67_workspace_size = 0; - size_t op67_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op67, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op67_workspace_size, &op67_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op67_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #67" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op68, - /*batch_size=*/196, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #68" << std::endl; - return ExecutionPlan(); - } - - size_t op69_workspace_size = 0; - size_t op69_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op69, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op69_workspace_size, &op69_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op69_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #69" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op70, - /*batch_size=*/196, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - 
/*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #70" << std::endl; - return ExecutionPlan(); - } - - size_t op71_workspace_size = 0; - size_t op71_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op71, - /*batch_size=*/1, 196 /* width */, - 672 /* channels */, 672 /* input stride */, 672 /* output stride */, - &op71_workspace_size, &op71_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op71_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #71" << std::endl; - return ExecutionPlan(); - } - - size_t op72_workspace_size = 0; - size_t op72_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op72, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op72_workspace_size, &op72_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op72_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #72" << std::endl; - return ExecutionPlan(); - } - - size_t op73_workspace_size = 0; - size_t op73_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op73, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op73_workspace_size, &op73_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op73_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #73" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 672 }; - const size_t b_shape[] = { 1, 1, 1, 672 }; - status = xnn_reshape_multiply_nd_f32( - op74, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #74" << std::endl; - return ExecutionPlan(); - } - - size_t op75_workspace_size = 0; - size_t op75_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op75, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op75_workspace_size, &op75_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op75_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #75" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 112 }; - const size_t b_shape[] = { 1, 14, 14, 112 }; - status = xnn_reshape_add_nd_f32( - op76, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #76" << std::endl; - return ExecutionPlan(); - } - - size_t op77_workspace_size = 0; - size_t op77_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op77, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op77_workspace_size, &op77_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op77_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #77" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op78, - /*batch_size=*/196, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #78" << std::endl; - return ExecutionPlan(); - } - - size_t op79_workspace_size = 0; - size_t op79_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op79, - /*batch_size=*/1, 
/*input_height=*/14, /*input_width=*/14, - &op79_workspace_size, &op79_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op79_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #79" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op80, - /*batch_size=*/49, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #80" << std::endl; - return ExecutionPlan(); - } - - size_t op81_workspace_size = 0; - size_t op81_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op81, - /*batch_size=*/1, 49 /* width */, - 672 /* channels */, 672 /* input stride */, 672 /* output stride */, - &op81_workspace_size, &op81_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op81_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #81" << std::endl; - return ExecutionPlan(); - } - - size_t op82_workspace_size = 0; - size_t op82_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op82, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op82_workspace_size, &op82_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op82_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #82" << std::endl; - return ExecutionPlan(); - } - - size_t op83_workspace_size = 0; - size_t op83_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op83, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op83_workspace_size, 
&op83_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op83_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #83" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 672 }; - const size_t b_shape[] = { 1, 1, 1, 672 }; - status = xnn_reshape_multiply_nd_f32( - op84, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #84" << std::endl; - return ExecutionPlan(); - } - - size_t op85_workspace_size = 0; - size_t op85_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op85, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op85_workspace_size, &op85_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op85_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #85" << std::endl; - return ExecutionPlan(); - } - - size_t op86_workspace_size = 0; - size_t op86_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op86, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op86_workspace_size, &op86_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op86_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #86" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op87, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #87" 
<< std::endl; - return ExecutionPlan(); - } - - size_t op88_workspace_size = 0; - size_t op88_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op88, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op88_workspace_size, &op88_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op88_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #88" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op89, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #89" << std::endl; - return ExecutionPlan(); - } - - size_t op90_workspace_size = 0; - size_t op90_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op90, - /*batch_size=*/1, 49 /* width */, - 960 /* channels */, 960 /* input stride */, 960 /* output stride */, - &op90_workspace_size, &op90_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op90_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #90" << std::endl; - return ExecutionPlan(); - } - - size_t op91_workspace_size = 0; - size_t op91_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op91, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op91_workspace_size, &op91_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op91_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #91" << std::endl; - return ExecutionPlan(); - } - - size_t op92_workspace_size = 0; - 
size_t op92_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op92, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op92_workspace_size, &op92_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op92_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #92" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 960 }; - const size_t b_shape[] = { 1, 1, 1, 960 }; - status = xnn_reshape_multiply_nd_f32( - op93, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #93" << std::endl; - return ExecutionPlan(); - } - - size_t op94_workspace_size = 0; - size_t op94_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op94, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op94_workspace_size, &op94_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op94_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #94" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 160 }; - const size_t b_shape[] = { 1, 7, 7, 160 }; - status = xnn_reshape_add_nd_f32( - op95, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #95" << std::endl; - return ExecutionPlan(); - } - - size_t op96_workspace_size = 0; - size_t op96_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op96, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op96_workspace_size, &op96_workspace_alignment, - /*output_height_out=*/nullptr, 
/*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op96_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #96" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op97, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #97" << std::endl; - return ExecutionPlan(); - } - - size_t op98_workspace_size = 0; - size_t op98_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op98, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op98_workspace_size, &op98_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op98_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #98" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op99, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #99" << std::endl; - return ExecutionPlan(); - } - - size_t op100_workspace_size = 0; - size_t op100_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op100, - /*batch_size=*/1, 49 /* width */, - 960 /* channels */, 960 /* input stride */, 960 /* output stride */, - &op100_workspace_size, &op100_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op100_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #100" << std::endl; - return ExecutionPlan(); - } - - size_t op101_workspace_size = 0; - 
size_t op101_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op101, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op101_workspace_size, &op101_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op101_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #101" << std::endl; - return ExecutionPlan(); - } - - size_t op102_workspace_size = 0; - size_t op102_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op102, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op102_workspace_size, &op102_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op102_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #102" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 960 }; - const size_t b_shape[] = { 1, 1, 1, 960 }; - status = xnn_reshape_multiply_nd_f32( - op103, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #103" << std::endl; - return ExecutionPlan(); - } - - size_t op104_workspace_size = 0; - size_t op104_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op104, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op104_workspace_size, &op104_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op104_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #104" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 160 }; - const size_t 
b_shape[] = { 1, 7, 7, 160 }; - status = xnn_reshape_add_nd_f32( - op105, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #105" << std::endl; - return ExecutionPlan(); - } - - size_t op106_workspace_size = 0; - size_t op106_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op106, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op106_workspace_size, &op106_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op106_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #106" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op107, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #107" << std::endl; - return ExecutionPlan(); - } - - size_t op108_workspace_size = 0; - size_t op108_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op108, - /*batch_size=*/1, 49 /* width */, - 960 /* channels */, 960 /* input stride */, 960 /* output stride */, - &op108_workspace_size, &op108_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op108_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #108" << std::endl; - return ExecutionPlan(); - } - - size_t op109_workspace_size = 0; - size_t op109_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op109, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op109_workspace_size, &op109_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - 
max_workspace_size = std::max(max_workspace_size, op109_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #109" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op110, - /*batch_size=*/1, - 1280 /* channels */, - 1280 /* input stride */, - 1280 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #110" << std::endl; - return ExecutionPlan(); - } - - size_t op111_workspace_size = 0; - size_t op111_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op111, - /*batch_size=*/1, 1 /* width */, - 1280 /* channels */, 1280 /* input stride */, 1280 /* output stride */, - &op111_workspace_size, &op111_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op111_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #111" << std::endl; - return ExecutionPlan(); - } - - size_t op112_workspace_size = 0; - size_t op112_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op112, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op112_workspace_size, &op112_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op112_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #112" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x32( - op113, - /*batch_size=*/1001, - 1 /* channels */, - 1 /* input stride */, - 1 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #113" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_softmax_nc_f32( - op114, - /*channels=*/1001, - /*input_stride=*/1001, 
- /*output_stride=*/1001, - /*batch_size=*/1, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #114" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nhwc_f32( - op0, - workspace.data(), /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op1, - /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op2, - workspace.data(), /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op3, - workspace.data(), /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op4, - v4.data() /* a */, v2.data() /* b */, /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op5, - workspace.data(), /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op6, - workspace.data(), /*input=*/v6.data(), /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op7, - 
workspace.data(), /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op8, - workspace.data(), /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op9, - workspace.data(), /*input=*/v9.data(), /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op10, - workspace.data(), /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op11, - v11.data() /* a */, v8.data() /* b */, /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op12, - workspace.data(), /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op13, - workspace.data(), /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op14, - workspace.data(), - /*input=*/v14.data(), /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op15, - 
workspace.data(), /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op16, - workspace.data(), /*input=*/v16.data(), /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op17, - v14.data() /* a */, v17.data() /* b */, /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op18, - workspace.data(), /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op19, - workspace.data(), /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op20, - workspace.data(), /*input=*/v20.data(), /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op21, - workspace.data(), - /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op22, - workspace.data(), /*input=*/v22.data(), /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - 
op23, - workspace.data(), /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op24, - v21.data() /* a */, v24.data() /* b */, /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op25, - workspace.data(), /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op26, - v26.data() /* a */, v19.data() /* b */, /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op27, - workspace.data(), /*input=*/v27.data(), /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op28, - workspace.data(), /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op29, - workspace.data(), - /*input=*/v29.data(), /*output=*/v30.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op30, - workspace.data(), /*input=*/v30.data(), /*output=*/v31.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #30" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op31, - 
workspace.data(), /*input=*/v31.data(), /*output=*/v32.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op32, - v29.data() /* a */, v32.data() /* b */, /*output=*/v33.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #32" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op33, - workspace.data(), /*input=*/v33.data(), /*output=*/v34.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #33" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op34, - v34.data() /* a */, v27.data() /* b */, /*output=*/v35.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op35, - workspace.data(), /*input=*/v35.data(), /*output=*/v36.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op36, - /*input=*/v36.data(), /*output=*/v37.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op37, - workspace.data(), /*input=*/v37.data(), /*output=*/v38.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op38, - /*input=*/v38.data(), /*output=*/v39.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op39, - workspace.data(), /*input=*/v39.data(), /*output=*/v40.data()); - 
if (status != xnn_status_success) { - std::cerr << "failed to setup operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op40, - workspace.data(), /*input=*/v40.data(), /*output=*/v41.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op41, - /*input=*/v41.data(), /*output=*/v42.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #41" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op42, - workspace.data(), /*input=*/v42.data(), /*output=*/v43.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op43, - /*input=*/v43.data(), /*output=*/v44.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op44, - workspace.data(), /*input=*/v44.data(), /*output=*/v45.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op45, - v45.data() /* a */, v40.data() /* b */, /*output=*/v46.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #45" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op46, - workspace.data(), /*input=*/v46.data(), /*output=*/v47.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op47, - /*input=*/v47.data(), /*output=*/v48.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #47" 
<< std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op48, - workspace.data(), /*input=*/v48.data(), /*output=*/v49.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op49, - /*input=*/v49.data(), /*output=*/v50.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #49" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op50, - workspace.data(), /*input=*/v50.data(), /*output=*/v51.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #50" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op51, - v51.data() /* a */, v46.data() /* b */, /*output=*/v52.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op52, - workspace.data(), /*input=*/v52.data(), /*output=*/v53.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op53, - /*input=*/v53.data(), /*output=*/v54.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op54, - workspace.data(), /*input=*/v54.data(), /*output=*/v55.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op55, - /*input=*/v55.data(), /*output=*/v56.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #55" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nhwc_f32( - op56, - workspace.data(), /*input=*/v56.data(), /*output=*/v57.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #56" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op57, - v57.data() /* a */, v52.data() /* b */, /*output=*/v58.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op58, - workspace.data(), /*input=*/v58.data(), /*output=*/v59.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op59, - /*input=*/v59.data(), /*output=*/v60.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #59" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op60, - workspace.data(), /*input=*/v60.data(), /*output=*/v61.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op61, - /*input=*/v61.data(), /*output=*/v62.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #61" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op62, - workspace.data(), - /*input=*/v62.data(), /*output=*/v63.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #62" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op63, - workspace.data(), /*input=*/v63.data(), /*output=*/v64.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #63" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op64, - 
workspace.data(), /*input=*/v64.data(), /*output=*/v65.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #64" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op65, - v62.data() /* a */, v65.data() /* b */, /*output=*/v66.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #65" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op66, - workspace.data(), /*input=*/v66.data(), /*output=*/v67.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #66" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op67, - workspace.data(), /*input=*/v67.data(), /*output=*/v68.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #67" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op68, - /*input=*/v68.data(), /*output=*/v69.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #68" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op69, - workspace.data(), /*input=*/v69.data(), /*output=*/v70.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #69" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op70, - /*input=*/v70.data(), /*output=*/v71.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #70" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op71, - workspace.data(), - /*input=*/v71.data(), /*output=*/v72.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #71" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op72, - workspace.data(), /*input=*/v72.data(), 
/*output=*/v73.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #72" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op73, - workspace.data(), /*input=*/v73.data(), /*output=*/v74.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #73" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op74, - v71.data() /* a */, v74.data() /* b */, /*output=*/v75.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #74" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op75, - workspace.data(), /*input=*/v75.data(), /*output=*/v76.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #75" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op76, - v76.data() /* a */, v67.data() /* b */, /*output=*/v77.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #76" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op77, - workspace.data(), /*input=*/v77.data(), /*output=*/v78.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #77" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op78, - /*input=*/v78.data(), /*output=*/v79.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #78" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op79, - workspace.data(), /*input=*/v79.data(), /*output=*/v80.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #79" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op80, - /*input=*/v80.data(), /*output=*/v81.data()); - if (status != xnn_status_success) { - 
std::cerr << "failed to setup operation #80" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op81, - workspace.data(), - /*input=*/v81.data(), /*output=*/v82.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #81" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op82, - workspace.data(), /*input=*/v82.data(), /*output=*/v83.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #82" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op83, - workspace.data(), /*input=*/v83.data(), /*output=*/v84.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #83" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op84, - v81.data() /* a */, v84.data() /* b */, /*output=*/v85.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #84" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op85, - workspace.data(), /*input=*/v85.data(), /*output=*/v86.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #85" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op86, - workspace.data(), /*input=*/v86.data(), /*output=*/v87.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #86" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op87, - /*input=*/v87.data(), /*output=*/v88.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #87" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op88, - workspace.data(), /*input=*/v88.data(), /*output=*/v89.data()); - if (status != xnn_status_success) { - std::cerr << "failed 
to setup operation #88" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op89, - /*input=*/v89.data(), /*output=*/v90.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #89" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op90, - workspace.data(), - /*input=*/v90.data(), /*output=*/v91.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #90" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op91, - workspace.data(), /*input=*/v91.data(), /*output=*/v92.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #91" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op92, - workspace.data(), /*input=*/v92.data(), /*output=*/v93.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #92" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op93, - v90.data() /* a */, v93.data() /* b */, /*output=*/v94.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #93" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op94, - workspace.data(), /*input=*/v94.data(), /*output=*/v95.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #94" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op95, - v95.data() /* a */, v86.data() /* b */, /*output=*/v96.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #95" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op96, - workspace.data(), /*input=*/v96.data(), /*output=*/v97.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #96" << 
std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op97, - /*input=*/v97.data(), /*output=*/v98.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #97" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op98, - workspace.data(), /*input=*/v98.data(), /*output=*/v99.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #98" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op99, - /*input=*/v99.data(), /*output=*/v100.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #99" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op100, - workspace.data(), - /*input=*/v100.data(), /*output=*/v101.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #100" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op101, - workspace.data(), /*input=*/v101.data(), /*output=*/v102.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #101" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op102, - workspace.data(), /*input=*/v102.data(), /*output=*/v103.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #102" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op103, - v100.data() /* a */, v103.data() /* b */, /*output=*/v104.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #103" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op104, - workspace.data(), /*input=*/v104.data(), /*output=*/v105.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #104" << std::endl; - return 
ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op105, - v105.data() /* a */, v96.data() /* b */, /*output=*/v106.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #105" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op106, - workspace.data(), /*input=*/v106.data(), /*output=*/v107.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #106" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op107, - /*input=*/v107.data(), /*output=*/v108.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #107" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op108, - workspace.data(), - /*input=*/v108.data(), /*output=*/v109.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #108" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op109, - workspace.data(), /*input=*/v109.data(), /*output=*/v110.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #109" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op110, - /*input=*/v110.data(), /*output=*/v111.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #110" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op111, - workspace.data(), - /*input=*/v111.data(), /*output=*/v112.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #111" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op112, - workspace.data(), /*input=*/v112.data(), /*output=*/v113.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #112" << std::endl; - return 
ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x32( - op113, - /*input=*/v113.data(), /*output=*/v114.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #113" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_softmax_nc_f32( - op114, - /*input=*/v114.data(), /*output=*/v115.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #114" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -} // namespace models diff --git a/models/fp32-mobilenet-v3-small.cc b/models/fp32-mobilenet-v3-small.cc deleted file mode 100644 index 9597c0404d8..00000000000 --- a/models/fp32-mobilenet-v3-small.cc +++ /dev/null @@ -1,4430 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
- -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan FP32MobileNetV3Small(pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static std::array v30; - alignas(16) static std::array v31; - alignas(16) static std::array v32; - alignas(16) static std::array v33; - alignas(16) static std::array v34; - alignas(16) static std::array v35; - alignas(16) static std::array v36; - alignas(16) static std::array v37; - alignas(16) static std::array v38; - alignas(16) static std::array v39; - alignas(16) static std::array v40; - alignas(16) static std::array v41; - alignas(16) static std::array v42; - alignas(16) static std::array v43; - alignas(16) static std::array v44; - alignas(16) static std::array v45; - alignas(16) static std::array v46; - 
alignas(16) static std::array v47; - alignas(16) static std::array v48; - alignas(16) static std::array v49; - alignas(16) static std::array v50; - alignas(16) static std::array v51; - alignas(16) static std::array v52; - alignas(16) static std::array v53; - alignas(16) static std::array v54; - alignas(16) static std::array v55; - alignas(16) static std::array v56; - alignas(16) static std::array v57; - alignas(16) static std::array v58; - alignas(16) static std::array v59; - alignas(16) static std::array v60; - alignas(16) static std::array v61; - alignas(16) static std::array v62; - alignas(16) static std::array v63; - alignas(16) static std::array v64; - alignas(16) static std::array v65; - alignas(16) static std::array v66; - alignas(16) static std::array v67; - alignas(16) static std::array v68; - alignas(16) static std::array v69; - alignas(16) static std::array v70; - alignas(16) static std::array v71; - alignas(16) static std::array v72; - alignas(16) static std::array v73; - alignas(16) static std::array v74; - alignas(16) static std::array v75; - alignas(16) static std::array v76; - alignas(16) static std::array v77; - alignas(16) static std::array v78; - alignas(16) static std::array v79; - alignas(16) static std::array v80; - alignas(16) static std::array v81; - alignas(16) static std::array v82; - alignas(16) static std::array v83; - alignas(16) static std::array v84; - alignas(16) static std::array v85; - alignas(16) static std::array v86; - alignas(16) static std::array v87; - alignas(16) static std::array v88; - alignas(16) static std::array v89; - alignas(16) static std::array v90; - alignas(16) static std::array v91; - alignas(16) static std::array v92; - alignas(16) static std::array v93; - alignas(16) static std::array v94; - alignas(16) static std::array v95; - alignas(16) static std::array v96; - alignas(16) static std::array v97; - alignas(16) static std::array v98; - alignas(16) static std::array v99; - alignas(16) static std::array v100; - 
alignas(16) static std::array v101; - alignas(16) static std::array w102; - alignas(16) static std::array w103; - alignas(16) static std::array w104; - alignas(16) static std::array w105; - alignas(16) static std::array w106; - alignas(16) static std::array w107; - alignas(16) static std::array w108; - alignas(16) static std::array w109; - alignas(16) static std::array w110; - alignas(16) static std::array w111; - alignas(16) static std::array w112; - alignas(16) static std::array w113; - alignas(16) static std::array w114; - alignas(16) static std::array w115; - alignas(16) static std::array w116; - alignas(16) static std::array w117; - alignas(16) static std::array w118; - alignas(16) static std::array w119; - alignas(16) static std::array w120; - alignas(16) static std::array w121; - alignas(16) static std::array w122; - alignas(16) static std::array w123; - alignas(16) static std::array w124; - alignas(16) static std::array w125; - alignas(16) static std::array w126; - alignas(16) static std::array w127; - alignas(16) static std::array w128; - alignas(16) static std::array w129; - alignas(16) static std::array w130; - alignas(16) static std::array w131; - alignas(16) static std::array w132; - alignas(16) static std::array w133; - alignas(16) static std::array w134; - alignas(16) static std::array w135; - alignas(16) static std::array w136; - alignas(16) static std::array w137; - alignas(16) static std::array w138; - alignas(16) static std::array w139; - alignas(16) static std::array w140; - alignas(16) static std::array w141; - alignas(16) static std::array w142; - alignas(16) static std::array w143; - alignas(16) static std::array w144; - alignas(16) static std::array w145; - alignas(16) static std::array w146; - alignas(16) static std::array w147; - alignas(16) static std::array w148; - alignas(16) static std::array w149; - alignas(16) static std::array w150; - alignas(16) static std::array w151; - alignas(16) static std::array w152; - alignas(16) static 
std::array w153; - alignas(16) static std::array w154; - alignas(16) static std::array w155; - alignas(16) static std::array w156; - alignas(16) static std::array w157; - alignas(16) static std::array w158; - alignas(16) static std::array w159; - alignas(16) static std::array w160; - alignas(16) static std::array w161; - alignas(16) static std::array w162; - alignas(16) static std::array w163; - alignas(16) static std::array w164; - alignas(16) static std::array w165; - alignas(16) static std::array w166; - alignas(16) static std::array w167; - alignas(16) static std::array w168; - alignas(16) static std::array w169; - alignas(16) static std::array w170; - alignas(16) static std::array w171; - alignas(16) static std::array w172; - alignas(16) static std::array w173; - alignas(16) static std::array w174; - alignas(16) static std::array w175; - alignas(16) static std::array w176; - alignas(16) static std::array w177; - alignas(16) static std::array w178; - alignas(16) static std::array w179; - alignas(16) static std::array w180; - alignas(16) static std::array w181; - alignas(16) static std::array w182; - alignas(16) static std::array w183; - alignas(16) static std::array w184; - alignas(16) static std::array w185; - alignas(16) static std::array w186; - alignas(16) static std::array w187; - alignas(16) static std::array w188; - alignas(16) static std::array w189; - alignas(16) static std::array w190; - alignas(16) static std::array w191; - alignas(16) static std::array w192; - alignas(16) static std::array w193; - alignas(16) static std::array w194; - alignas(16) static std::array w195; - alignas(16) static std::array w196; - alignas(16) static std::array w197; - alignas(16) static std::array w198; - alignas(16) static std::array w199; - alignas(16) static std::array w200; - alignas(16) static std::array w201; - alignas(16) static std::array w202; - alignas(16) static std::array w203; - alignas(16) static std::array w204; - alignas(16) static std::array w205; - 
alignas(16) static std::array w206; - alignas(16) static std::array w207; - alignas(16) static std::array w208; - alignas(16) static std::array w209; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); - std::generate(v0.begin(), v0.end(), std::ref(f32rng)); - std::generate(v1.begin(), v1.end(), std::ref(f32rng)); - std::generate(v2.begin(), v2.end(), std::ref(f32rng)); - std::generate(v3.begin(), v3.end(), std::ref(f32rng)); - std::generate(v4.begin(), v4.end(), std::ref(f32rng)); - std::generate(v5.begin(), v5.end(), std::ref(f32rng)); - std::generate(v6.begin(), v6.end(), std::ref(f32rng)); - std::generate(v7.begin(), v7.end(), std::ref(f32rng)); - std::generate(v8.begin(), v8.end(), std::ref(f32rng)); - std::generate(v9.begin(), v9.end(), std::ref(f32rng)); - std::generate(v10.begin(), v10.end(), std::ref(f32rng)); - std::generate(v11.begin(), v11.end(), std::ref(f32rng)); - std::generate(v12.begin(), v12.end(), std::ref(f32rng)); - std::generate(v13.begin(), v13.end(), std::ref(f32rng)); - std::generate(v14.begin(), v14.end(), std::ref(f32rng)); - std::generate(v15.begin(), v15.end(), std::ref(f32rng)); - std::generate(v16.begin(), v16.end(), std::ref(f32rng)); - std::generate(v17.begin(), v17.end(), std::ref(f32rng)); - std::generate(v18.begin(), v18.end(), std::ref(f32rng)); - std::generate(v19.begin(), v19.end(), std::ref(f32rng)); - std::generate(v20.begin(), v20.end(), std::ref(f32rng)); - std::generate(v21.begin(), v21.end(), std::ref(f32rng)); - std::generate(v22.begin(), v22.end(), std::ref(f32rng)); - std::generate(v23.begin(), v23.end(), std::ref(f32rng)); - std::generate(v24.begin(), v24.end(), std::ref(f32rng)); - std::generate(v25.begin(), v25.end(), std::ref(f32rng)); - std::generate(v26.begin(), v26.end(), std::ref(f32rng)); - std::generate(v27.begin(), v27.end(), std::ref(f32rng)); - std::generate(v28.begin(), v28.end(), 
std::ref(f32rng)); - std::generate(v29.begin(), v29.end(), std::ref(f32rng)); - std::generate(v30.begin(), v30.end(), std::ref(f32rng)); - std::generate(v31.begin(), v31.end(), std::ref(f32rng)); - std::generate(v32.begin(), v32.end(), std::ref(f32rng)); - std::generate(v33.begin(), v33.end(), std::ref(f32rng)); - std::generate(v34.begin(), v34.end(), std::ref(f32rng)); - std::generate(v35.begin(), v35.end(), std::ref(f32rng)); - std::generate(v36.begin(), v36.end(), std::ref(f32rng)); - std::generate(v37.begin(), v37.end(), std::ref(f32rng)); - std::generate(v38.begin(), v38.end(), std::ref(f32rng)); - std::generate(v39.begin(), v39.end(), std::ref(f32rng)); - std::generate(v40.begin(), v40.end(), std::ref(f32rng)); - std::generate(v41.begin(), v41.end(), std::ref(f32rng)); - std::generate(v42.begin(), v42.end(), std::ref(f32rng)); - std::generate(v43.begin(), v43.end(), std::ref(f32rng)); - std::generate(v44.begin(), v44.end(), std::ref(f32rng)); - std::generate(v45.begin(), v45.end(), std::ref(f32rng)); - std::generate(v46.begin(), v46.end(), std::ref(f32rng)); - std::generate(v47.begin(), v47.end(), std::ref(f32rng)); - std::generate(v48.begin(), v48.end(), std::ref(f32rng)); - std::generate(v49.begin(), v49.end(), std::ref(f32rng)); - std::generate(v50.begin(), v50.end(), std::ref(f32rng)); - std::generate(v51.begin(), v51.end(), std::ref(f32rng)); - std::generate(v52.begin(), v52.end(), std::ref(f32rng)); - std::generate(v53.begin(), v53.end(), std::ref(f32rng)); - std::generate(v54.begin(), v54.end(), std::ref(f32rng)); - std::generate(v55.begin(), v55.end(), std::ref(f32rng)); - std::generate(v56.begin(), v56.end(), std::ref(f32rng)); - std::generate(v57.begin(), v57.end(), std::ref(f32rng)); - std::generate(v58.begin(), v58.end(), std::ref(f32rng)); - std::generate(v59.begin(), v59.end(), std::ref(f32rng)); - std::generate(v60.begin(), v60.end(), std::ref(f32rng)); - std::generate(v61.begin(), v61.end(), std::ref(f32rng)); - std::generate(v62.begin(), 
v62.end(), std::ref(f32rng)); - std::generate(v63.begin(), v63.end(), std::ref(f32rng)); - std::generate(v64.begin(), v64.end(), std::ref(f32rng)); - std::generate(v65.begin(), v65.end(), std::ref(f32rng)); - std::generate(v66.begin(), v66.end(), std::ref(f32rng)); - std::generate(v67.begin(), v67.end(), std::ref(f32rng)); - std::generate(v68.begin(), v68.end(), std::ref(f32rng)); - std::generate(v69.begin(), v69.end(), std::ref(f32rng)); - std::generate(v70.begin(), v70.end(), std::ref(f32rng)); - std::generate(v71.begin(), v71.end(), std::ref(f32rng)); - std::generate(v72.begin(), v72.end(), std::ref(f32rng)); - std::generate(v73.begin(), v73.end(), std::ref(f32rng)); - std::generate(v74.begin(), v74.end(), std::ref(f32rng)); - std::generate(v75.begin(), v75.end(), std::ref(f32rng)); - std::generate(v76.begin(), v76.end(), std::ref(f32rng)); - std::generate(v77.begin(), v77.end(), std::ref(f32rng)); - std::generate(v78.begin(), v78.end(), std::ref(f32rng)); - std::generate(v79.begin(), v79.end(), std::ref(f32rng)); - std::generate(v80.begin(), v80.end(), std::ref(f32rng)); - std::generate(v81.begin(), v81.end(), std::ref(f32rng)); - std::generate(v82.begin(), v82.end(), std::ref(f32rng)); - std::generate(v83.begin(), v83.end(), std::ref(f32rng)); - std::generate(v84.begin(), v84.end(), std::ref(f32rng)); - std::generate(v85.begin(), v85.end(), std::ref(f32rng)); - std::generate(v86.begin(), v86.end(), std::ref(f32rng)); - std::generate(v87.begin(), v87.end(), std::ref(f32rng)); - std::generate(v88.begin(), v88.end(), std::ref(f32rng)); - std::generate(v89.begin(), v89.end(), std::ref(f32rng)); - std::generate(v90.begin(), v90.end(), std::ref(f32rng)); - std::generate(v91.begin(), v91.end(), std::ref(f32rng)); - std::generate(v92.begin(), v92.end(), std::ref(f32rng)); - std::generate(v93.begin(), v93.end(), std::ref(f32rng)); - std::generate(v94.begin(), v94.end(), std::ref(f32rng)); - std::generate(v95.begin(), v95.end(), std::ref(f32rng)); - 
std::generate(v96.begin(), v96.end(), std::ref(f32rng)); - std::generate(v97.begin(), v97.end(), std::ref(f32rng)); - std::generate(v98.begin(), v98.end(), std::ref(f32rng)); - std::generate(v99.begin(), v99.end(), std::ref(f32rng)); - std::generate(v100.begin(), v100.end(), std::ref(f32rng)); - std::generate(v101.begin(), v101.end(), std::ref(f32rng)); - std::generate(w102.begin(), w102.end(), std::ref(f32rng)); - std::generate(w103.begin(), w103.end(), std::ref(f32rng)); - std::generate(w104.begin(), w104.end(), std::ref(f32rng)); - std::generate(w105.begin(), w105.end(), std::ref(f32rng)); - std::generate(w106.begin(), w106.end(), std::ref(f32rng)); - std::generate(w107.begin(), w107.end(), std::ref(f32rng)); - std::generate(w108.begin(), w108.end(), std::ref(f32rng)); - std::generate(w109.begin(), w109.end(), std::ref(f32rng)); - std::generate(w110.begin(), w110.end(), std::ref(f32rng)); - std::generate(w111.begin(), w111.end(), std::ref(f32rng)); - std::generate(w112.begin(), w112.end(), std::ref(f32rng)); - std::generate(w113.begin(), w113.end(), std::ref(f32rng)); - std::generate(w114.begin(), w114.end(), std::ref(f32rng)); - std::generate(w115.begin(), w115.end(), std::ref(f32rng)); - std::generate(w116.begin(), w116.end(), std::ref(f32rng)); - std::generate(w117.begin(), w117.end(), std::ref(f32rng)); - std::generate(w118.begin(), w118.end(), std::ref(f32rng)); - std::generate(w119.begin(), w119.end(), std::ref(f32rng)); - std::generate(w120.begin(), w120.end(), std::ref(f32rng)); - std::generate(w121.begin(), w121.end(), std::ref(f32rng)); - std::generate(w122.begin(), w122.end(), std::ref(f32rng)); - std::generate(w123.begin(), w123.end(), std::ref(f32rng)); - std::generate(w124.begin(), w124.end(), std::ref(f32rng)); - std::generate(w125.begin(), w125.end(), std::ref(f32rng)); - std::generate(w126.begin(), w126.end(), std::ref(f32rng)); - std::generate(w127.begin(), w127.end(), std::ref(f32rng)); - std::generate(w128.begin(), w128.end(), 
std::ref(f32rng)); - std::generate(w129.begin(), w129.end(), std::ref(f32rng)); - std::generate(w130.begin(), w130.end(), std::ref(f32rng)); - std::generate(w131.begin(), w131.end(), std::ref(f32rng)); - std::generate(w132.begin(), w132.end(), std::ref(f32rng)); - std::generate(w133.begin(), w133.end(), std::ref(f32rng)); - std::generate(w134.begin(), w134.end(), std::ref(f32rng)); - std::generate(w135.begin(), w135.end(), std::ref(f32rng)); - std::generate(w136.begin(), w136.end(), std::ref(f32rng)); - std::generate(w137.begin(), w137.end(), std::ref(f32rng)); - std::generate(w138.begin(), w138.end(), std::ref(f32rng)); - std::generate(w139.begin(), w139.end(), std::ref(f32rng)); - std::generate(w140.begin(), w140.end(), std::ref(f32rng)); - std::generate(w141.begin(), w141.end(), std::ref(f32rng)); - std::generate(w142.begin(), w142.end(), std::ref(f32rng)); - std::generate(w143.begin(), w143.end(), std::ref(f32rng)); - std::generate(w144.begin(), w144.end(), std::ref(f32rng)); - std::generate(w145.begin(), w145.end(), std::ref(f32rng)); - std::generate(w146.begin(), w146.end(), std::ref(f32rng)); - std::generate(w147.begin(), w147.end(), std::ref(f32rng)); - std::generate(w148.begin(), w148.end(), std::ref(f32rng)); - std::generate(w149.begin(), w149.end(), std::ref(f32rng)); - std::generate(w150.begin(), w150.end(), std::ref(f32rng)); - std::generate(w151.begin(), w151.end(), std::ref(f32rng)); - std::generate(w152.begin(), w152.end(), std::ref(f32rng)); - std::generate(w153.begin(), w153.end(), std::ref(f32rng)); - std::generate(w154.begin(), w154.end(), std::ref(f32rng)); - std::generate(w155.begin(), w155.end(), std::ref(f32rng)); - std::generate(w156.begin(), w156.end(), std::ref(f32rng)); - std::generate(w157.begin(), w157.end(), std::ref(f32rng)); - std::generate(w158.begin(), w158.end(), std::ref(f32rng)); - std::generate(w159.begin(), w159.end(), std::ref(f32rng)); - std::generate(w160.begin(), w160.end(), std::ref(f32rng)); - 
std::generate(w161.begin(), w161.end(), std::ref(f32rng)); - std::generate(w162.begin(), w162.end(), std::ref(f32rng)); - std::generate(w163.begin(), w163.end(), std::ref(f32rng)); - std::generate(w164.begin(), w164.end(), std::ref(f32rng)); - std::generate(w165.begin(), w165.end(), std::ref(f32rng)); - std::generate(w166.begin(), w166.end(), std::ref(f32rng)); - std::generate(w167.begin(), w167.end(), std::ref(f32rng)); - std::generate(w168.begin(), w168.end(), std::ref(f32rng)); - std::generate(w169.begin(), w169.end(), std::ref(f32rng)); - std::generate(w170.begin(), w170.end(), std::ref(f32rng)); - std::generate(w171.begin(), w171.end(), std::ref(f32rng)); - std::generate(w172.begin(), w172.end(), std::ref(f32rng)); - std::generate(w173.begin(), w173.end(), std::ref(f32rng)); - std::generate(w174.begin(), w174.end(), std::ref(f32rng)); - std::generate(w175.begin(), w175.end(), std::ref(f32rng)); - std::generate(w176.begin(), w176.end(), std::ref(f32rng)); - std::generate(w177.begin(), w177.end(), std::ref(f32rng)); - std::generate(w178.begin(), w178.end(), std::ref(f32rng)); - std::generate(w179.begin(), w179.end(), std::ref(f32rng)); - std::generate(w180.begin(), w180.end(), std::ref(f32rng)); - std::generate(w181.begin(), w181.end(), std::ref(f32rng)); - std::generate(w182.begin(), w182.end(), std::ref(f32rng)); - std::generate(w183.begin(), w183.end(), std::ref(f32rng)); - std::generate(w184.begin(), w184.end(), std::ref(f32rng)); - std::generate(w185.begin(), w185.end(), std::ref(f32rng)); - std::generate(w186.begin(), w186.end(), std::ref(f32rng)); - std::generate(w187.begin(), w187.end(), std::ref(f32rng)); - std::generate(w188.begin(), w188.end(), std::ref(f32rng)); - std::generate(w189.begin(), w189.end(), std::ref(f32rng)); - std::generate(w190.begin(), w190.end(), std::ref(f32rng)); - std::generate(w191.begin(), w191.end(), std::ref(f32rng)); - std::generate(w192.begin(), w192.end(), std::ref(f32rng)); - std::generate(w193.begin(), w193.end(), 
std::ref(f32rng)); - std::generate(w194.begin(), w194.end(), std::ref(f32rng)); - std::generate(w195.begin(), w195.end(), std::ref(f32rng)); - std::generate(w196.begin(), w196.end(), std::ref(f32rng)); - std::generate(w197.begin(), w197.end(), std::ref(f32rng)); - std::generate(w198.begin(), w198.end(), std::ref(f32rng)); - std::generate(w199.begin(), w199.end(), std::ref(f32rng)); - std::generate(w200.begin(), w200.end(), std::ref(f32rng)); - std::generate(w201.begin(), w201.end(), std::ref(f32rng)); - std::generate(w202.begin(), w202.end(), std::ref(f32rng)); - std::generate(w203.begin(), w203.end(), std::ref(f32rng)); - std::generate(w204.begin(), w204.end(), std::ref(f32rng)); - std::generate(w205.begin(), w205.end(), std::ref(f32rng)); - std::generate(w206.begin(), w206.end(), std::ref(f32rng)); - std::generate(w207.begin(), w207.end(), std::ref(f32rng)); - std::generate(w208.begin(), w208.end(), std::ref(f32rng)); - std::generate(w209.begin(), w209.end(), std::ref(f32rng)); - - Operators operators; - xnn_status status; - xnn_code_cache* code_cache_ptr = nullptr; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/3, - /*group_output_channels=*/16, - /*input_channel_stride=*/3, - /*output_channel_stride=*/16, - /*kernel=*/w102.data(), /*bias=*/w103.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, 
xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/16, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/16, - /*output_channel_stride=*/16, - /*kernel=*/w104.data(), /*bias=*/w105.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/16, - /*group_output_channels=*/8, - /*input_channel_stride=*/16, - /*output_channel_stride=*/8, 
- /*kernel=*/w106.data(), /*bias=*/w107.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/8, - /*group_output_channels=*/16, - /*input_channel_stride=*/8, - /*output_channel_stride=*/16, - /*kernel=*/w108.data(), /*bias=*/w109.data(), - /*output_min=*/0.0f, /*output_max=*/+0x1.00014Fp+0, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/16, - 
/*group_output_channels=*/16, - /*input_channel_stride=*/16, - /*output_channel_stride=*/16, - /*kernel=*/w110.data(), /*bias=*/w111.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/16, - /*group_output_channels=*/72, - /*input_channel_stride=*/16, - /*output_channel_stride=*/72, - /*kernel=*/w112.data(), /*bias=*/w113.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/72, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/72, - /*output_channel_stride=*/72, - /*kernel=*/w114.data(), /*bias=*/w115.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - 
/*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/72, - /*group_output_channels=*/24, - /*input_channel_stride=*/72, - /*output_channel_stride=*/24, - /*kernel=*/w116.data(), /*bias=*/w117.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/24, - /*group_output_channels=*/88, - /*input_channel_stride=*/24, - /*output_channel_stride=*/88, - /*kernel=*/w118.data(), /*bias=*/w119.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, 
xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/88, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/88, - /*output_channel_stride=*/88, - /*kernel=*/w120.data(), /*bias=*/w121.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/88, - /*group_output_channels=*/24, - /*input_channel_stride=*/88, - /*output_channel_stride=*/24, - /*kernel=*/w122.data(), /*bias=*/w123.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op14); - if (status != 
xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/24, - /*group_output_channels=*/96, - /*input_channel_stride=*/24, - /*output_channel_stride=*/96, - /*kernel=*/w124.data(), /*bias=*/w125.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/1, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/96, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/96, - /*output_channel_stride=*/96, - /*kernel=*/w126.data(), /*bias=*/w127.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - 
/*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/96, - /*group_output_channels=*/24, - /*input_channel_stride=*/96, - /*output_channel_stride=*/24, - /*kernel=*/w128.data(), /*bias=*/w129.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, 
/*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/24, - /*group_output_channels=*/96, - /*input_channel_stride=*/24, - /*output_channel_stride=*/96, - /*kernel=*/w130.data(), /*bias=*/w131.data(), - /*output_min=*/0.0f, /*output_max=*/+0x1.00014Fp+0, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/96, - /*group_output_channels=*/40, - /*input_channel_stride=*/96, - /*output_channel_stride=*/40, - /*kernel=*/w132.data(), /*bias=*/w133.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - 
/*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/40, - /*group_output_channels=*/240, - /*input_channel_stride=*/40, - /*output_channel_stride=*/240, - /*kernel=*/w134.data(), /*bias=*/w135.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/2, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/2, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/240, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/240, - /*output_channel_stride=*/240, - /*kernel=*/w136.data(), /*bias=*/w137.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - 
xnn_operator_t op27 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - xnn_operator_t op29 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/240, - /*group_output_channels=*/64, - /*input_channel_stride=*/240, - /*output_channel_stride=*/64, - /*kernel=*/w138.data(), /*bias=*/w139.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op29); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #29" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op29, xnn_delete_operator); - - xnn_operator_t op30 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/64, - /*group_output_channels=*/240, - /*input_channel_stride=*/64, - /*output_channel_stride=*/240, - 
/*kernel=*/w140.data(), /*bias=*/w141.data(), - /*output_min=*/0.0f, /*output_max=*/+0x1.00014Fp+0, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op30); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #30" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op30, xnn_delete_operator); - - xnn_operator_t op31 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op31); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #31" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op31, xnn_delete_operator); - - xnn_operator_t op32 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/240, - /*group_output_channels=*/40, - /*input_channel_stride=*/240, - /*output_channel_stride=*/40, - /*kernel=*/w142.data(), /*bias=*/w143.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op32); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #32" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op32, xnn_delete_operator); - - xnn_operator_t op33 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op33); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #33" << std::endl; - return ExecutionPlan(); - } - 
operators.emplace_back(op33, xnn_delete_operator); - - xnn_operator_t op34 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/40, - /*group_output_channels=*/240, - /*input_channel_stride=*/40, - /*output_channel_stride=*/240, - /*kernel=*/w144.data(), /*bias=*/w145.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op34); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #34" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op34, xnn_delete_operator); - - xnn_operator_t op35 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op35); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #35" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op35, xnn_delete_operator); - - xnn_operator_t op36 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/2, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/2, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/240, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/240, - /*output_channel_stride=*/240, - /*kernel=*/w146.data(), /*bias=*/w147.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op36); - if (status != xnn_status_success) { - std::cerr << 
"failed to create operation #36" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op36, xnn_delete_operator); - - xnn_operator_t op37 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op37); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #37" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op37, xnn_delete_operator); - - xnn_operator_t op38 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op38); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #38" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op38, xnn_delete_operator); - - xnn_operator_t op39 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/240, - /*group_output_channels=*/64, - /*input_channel_stride=*/240, - /*output_channel_stride=*/64, - /*kernel=*/w148.data(), /*bias=*/w149.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op39); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #39" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op39, xnn_delete_operator); - - xnn_operator_t op40 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, 
- /*group_input_channels=*/64, - /*group_output_channels=*/240, - /*input_channel_stride=*/64, - /*output_channel_stride=*/240, - /*kernel=*/w150.data(), /*bias=*/w151.data(), - /*output_min=*/0.0f, /*output_max=*/+0x1.00014Fp+0, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op40); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #40" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op40, xnn_delete_operator); - - xnn_operator_t op41 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op41); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #41" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op41, xnn_delete_operator); - - xnn_operator_t op42 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/240, - /*group_output_channels=*/40, - /*input_channel_stride=*/240, - /*output_channel_stride=*/40, - /*kernel=*/w152.data(), /*bias=*/w153.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op42); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #42" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op42, xnn_delete_operator); - - xnn_operator_t op43 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op43); - if (status != 
xnn_status_success) { - std::cerr << "failed to create operation #43" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op43, xnn_delete_operator); - - xnn_operator_t op44 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/40, - /*group_output_channels=*/120, - /*input_channel_stride=*/40, - /*output_channel_stride=*/120, - /*kernel=*/w154.data(), /*bias=*/w155.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op44); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #44" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op44, xnn_delete_operator); - - xnn_operator_t op45 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op45); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #45" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op45, xnn_delete_operator); - - xnn_operator_t op46 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/2, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/2, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/120, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/120, - /*output_channel_stride=*/120, - /*kernel=*/w156.data(), /*bias=*/w157.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - 
/*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op46); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #46" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op46, xnn_delete_operator); - - xnn_operator_t op47 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op47); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #47" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op47, xnn_delete_operator); - - xnn_operator_t op48 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op48); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #48" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op48, xnn_delete_operator); - - xnn_operator_t op49 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/120, - /*group_output_channels=*/32, - /*input_channel_stride=*/120, - /*output_channel_stride=*/32, - /*kernel=*/w158.data(), /*bias=*/w159.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op49); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #49" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op49, xnn_delete_operator); - - xnn_operator_t op50 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, 
/*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/32, - /*group_output_channels=*/120, - /*input_channel_stride=*/32, - /*output_channel_stride=*/120, - /*kernel=*/w160.data(), /*bias=*/w161.data(), - /*output_min=*/0.0f, /*output_max=*/+0x1.00014Fp+0, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op50); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #50" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op50, xnn_delete_operator); - - xnn_operator_t op51 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op51); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #51" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op51, xnn_delete_operator); - - xnn_operator_t op52 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/120, - /*group_output_channels=*/48, - /*input_channel_stride=*/120, - /*output_channel_stride=*/48, - /*kernel=*/w162.data(), /*bias=*/w163.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op52); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #52" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op52, xnn_delete_operator); - - xnn_operator_t op53 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( 
- /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/48, - /*group_output_channels=*/144, - /*input_channel_stride=*/48, - /*output_channel_stride=*/144, - /*kernel=*/w164.data(), /*bias=*/w165.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op53); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #53" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op53, xnn_delete_operator); - - xnn_operator_t op54 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op54); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #54" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op54, xnn_delete_operator); - - xnn_operator_t op55 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/2, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/2, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/144, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/144, - /*output_channel_stride=*/144, - /*kernel=*/w166.data(), /*bias=*/w167.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op55); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #55" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op55, xnn_delete_operator); - - 
xnn_operator_t op56 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op56); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #56" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op56, xnn_delete_operator); - - xnn_operator_t op57 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op57); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #57" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op57, xnn_delete_operator); - - xnn_operator_t op58 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/144, - /*group_output_channels=*/40, - /*input_channel_stride=*/144, - /*output_channel_stride=*/40, - /*kernel=*/w168.data(), /*bias=*/w169.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op58); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #58" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op58, xnn_delete_operator); - - xnn_operator_t op59 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/40, - /*group_output_channels=*/144, - /*input_channel_stride=*/40, - /*output_channel_stride=*/144, - 
/*kernel=*/w170.data(), /*bias=*/w171.data(), - /*output_min=*/0.0f, /*output_max=*/+0x1.00014Fp+0, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op59); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #59" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op59, xnn_delete_operator); - - xnn_operator_t op60 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op60); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #60" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op60, xnn_delete_operator); - - xnn_operator_t op61 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/144, - /*group_output_channels=*/48, - /*input_channel_stride=*/144, - /*output_channel_stride=*/48, - /*kernel=*/w172.data(), /*bias=*/w173.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op61); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #61" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op61, xnn_delete_operator); - - xnn_operator_t op62 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op62); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #62" << std::endl; - return ExecutionPlan(); - } - 
operators.emplace_back(op62, xnn_delete_operator); - - xnn_operator_t op63 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/48, - /*group_output_channels=*/288, - /*input_channel_stride=*/48, - /*output_channel_stride=*/288, - /*kernel=*/w174.data(), /*bias=*/w175.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op63); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #63" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op63, xnn_delete_operator); - - xnn_operator_t op64 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op64); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #64" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op64, xnn_delete_operator); - - xnn_operator_t op65 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/1, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/1, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/288, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/288, - /*output_channel_stride=*/288, - /*kernel=*/w176.data(), /*bias=*/w177.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op65); - if (status != xnn_status_success) { - std::cerr << 
"failed to create operation #65" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op65, xnn_delete_operator); - - xnn_operator_t op66 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op66); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #66" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op66, xnn_delete_operator); - - xnn_operator_t op67 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op67); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #67" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op67, xnn_delete_operator); - - xnn_operator_t op68 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/288, - /*group_output_channels=*/72, - /*input_channel_stride=*/288, - /*output_channel_stride=*/72, - /*kernel=*/w178.data(), /*bias=*/w179.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op68); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #68" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op68, xnn_delete_operator); - - xnn_operator_t op69 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, 
- /*group_input_channels=*/72, - /*group_output_channels=*/288, - /*input_channel_stride=*/72, - /*output_channel_stride=*/288, - /*kernel=*/w180.data(), /*bias=*/w181.data(), - /*output_min=*/0.0f, /*output_max=*/+0x1.00014Fp+0, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op69); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #69" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op69, xnn_delete_operator); - - xnn_operator_t op70 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op70); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #70" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op70, xnn_delete_operator); - - xnn_operator_t op71 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/288, - /*group_output_channels=*/96, - /*input_channel_stride=*/288, - /*output_channel_stride=*/96, - /*kernel=*/w182.data(), /*bias=*/w183.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op71); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #71" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op71, xnn_delete_operator); - - xnn_operator_t op72 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, 
/*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/96, - /*group_output_channels=*/576, - /*input_channel_stride=*/96, - /*output_channel_stride=*/576, - /*kernel=*/w184.data(), /*bias=*/w185.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op72); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #72" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op72, xnn_delete_operator); - - xnn_operator_t op73 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op73); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #73" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op73, xnn_delete_operator); - - xnn_operator_t op74 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/2, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/2, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/576, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/576, - /*output_channel_stride=*/576, - /*kernel=*/w186.data(), /*bias=*/w187.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op74); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #74" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op74, xnn_delete_operator); - - xnn_operator_t op75 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op75); - if (status != xnn_status_success) 
{ - std::cerr << "failed to create operation #75" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op75, xnn_delete_operator); - - xnn_operator_t op76 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op76); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #76" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op76, xnn_delete_operator); - - xnn_operator_t op77 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/576, - /*group_output_channels=*/144, - /*input_channel_stride=*/576, - /*output_channel_stride=*/144, - /*kernel=*/w188.data(), /*bias=*/w189.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op77); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #77" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op77, xnn_delete_operator); - - xnn_operator_t op78 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/144, - /*group_output_channels=*/576, - /*input_channel_stride=*/144, - /*output_channel_stride=*/576, - /*kernel=*/w190.data(), /*bias=*/w191.data(), - /*output_min=*/0.0f, /*output_max=*/+0x1.00014Fp+0, - /*flags=*/0, - 
/*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op78); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #78" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op78, xnn_delete_operator); - - xnn_operator_t op79 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op79); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #79" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op79, xnn_delete_operator); - - xnn_operator_t op80 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/576, - /*group_output_channels=*/96, - /*input_channel_stride=*/576, - /*output_channel_stride=*/96, - /*kernel=*/w192.data(), /*bias=*/w193.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op80); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #80" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op80, xnn_delete_operator); - - xnn_operator_t op81 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op81); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #81" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op81, xnn_delete_operator); - - xnn_operator_t op82 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - 
/*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/96, - /*group_output_channels=*/576, - /*input_channel_stride=*/96, - /*output_channel_stride=*/576, - /*kernel=*/w194.data(), /*bias=*/w195.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op82); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #82" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op82, xnn_delete_operator); - - xnn_operator_t op83 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op83); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #83" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op83, xnn_delete_operator); - - xnn_operator_t op84 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/2, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/2, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/576, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/576, - /*output_channel_stride=*/576, - /*kernel=*/w196.data(), /*bias=*/w197.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op84); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #84" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op84, xnn_delete_operator); - - 
xnn_operator_t op85 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op85); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #85" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op85, xnn_delete_operator); - - xnn_operator_t op86 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op86); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #86" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op86, xnn_delete_operator); - - xnn_operator_t op87 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/576, - /*group_output_channels=*/144, - /*input_channel_stride=*/576, - /*output_channel_stride=*/144, - /*kernel=*/w198.data(), /*bias=*/w199.data(), - /*output_min=*/0.0f, /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op87); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #87" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op87, xnn_delete_operator); - - xnn_operator_t op88 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/144, - /*group_output_channels=*/576, - /*input_channel_stride=*/144, - /*output_channel_stride=*/576, - 
/*kernel=*/w200.data(), /*bias=*/w201.data(), - /*output_min=*/0.0f, /*output_max=*/+0x1.00014Fp+0, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op88); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #88" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op88, xnn_delete_operator); - - xnn_operator_t op89 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op89); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #89" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op89, xnn_delete_operator); - - xnn_operator_t op90 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/576, - /*group_output_channels=*/96, - /*input_channel_stride=*/576, - /*output_channel_stride=*/96, - /*kernel=*/w202.data(), /*bias=*/w203.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op90); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #90" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op90, xnn_delete_operator); - - xnn_operator_t op91 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op91); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #91" << std::endl; - return ExecutionPlan(); - } - 
operators.emplace_back(op91, xnn_delete_operator); - - xnn_operator_t op92 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/96, - /*group_output_channels=*/576, - /*input_channel_stride=*/96, - /*output_channel_stride=*/576, - /*kernel=*/w204.data(), /*bias=*/w205.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op92); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #92" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op92, xnn_delete_operator); - - xnn_operator_t op93 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op93); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #93" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op93, xnn_delete_operator); - - xnn_operator_t op94 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op94); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #94" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op94, xnn_delete_operator); - - xnn_operator_t op95 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/576, - 
/*group_output_channels=*/1024, - /*input_channel_stride=*/576, - /*output_channel_stride=*/1024, - /*kernel=*/w206.data(), /*bias=*/w207.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op95); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #95" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op95, xnn_delete_operator); - - xnn_operator_t op96 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op96); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #96" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op96, xnn_delete_operator); - - xnn_operator_t op97 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op97); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #97" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op97, xnn_delete_operator); - - xnn_operator_t op98 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/1024, - /*group_output_channels=*/1001, - /*input_channel_stride=*/1024, - /*output_channel_stride=*/1001, - /*kernel=*/w208.data(), /*bias=*/w209.data(), - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op98); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #98" << std::endl; - return 
ExecutionPlan(); - } - operators.emplace_back(op98, xnn_delete_operator); - - xnn_operator_t op99 = nullptr; - status = xnn_create_copy_nc_x32( - 0 /* flags */, - &op99); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #99" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op99, xnn_delete_operator); - - xnn_operator_t op100 = nullptr; - status = xnn_create_softmax_nc_f32( - /*flags=*/0, - &op100); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #100" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op100, xnn_delete_operator); - - size_t op0_workspace_size = 0; - size_t op0_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - &op0_workspace_size, &op0_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op0_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op1, - /*batch_size=*/12544, - 16 /* channels */, - 16 /* input stride */, - 16 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - size_t op2_workspace_size = 0; - size_t op2_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op2_workspace_size, &op2_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op2_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #2" << std::endl; - return 
ExecutionPlan(); - } - - size_t op3_workspace_size = 0; - size_t op3_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op3, - /*batch_size=*/1, 3136 /* width */, - 16 /* channels */, 16 /* input stride */, 16 /* output stride */, - &op3_workspace_size, &op3_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op3_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - size_t op4_workspace_size = 0; - size_t op4_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op4, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op4_workspace_size, &op4_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op4_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - size_t op5_workspace_size = 0; - size_t op5_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op5, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op5_workspace_size, &op5_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op5_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 56, 56, 16 }; - const size_t b_shape[] = { 1, 1, 1, 16 }; - status = xnn_reshape_multiply_nd_f32( - op6, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - size_t op7_workspace_size = 0; - size_t 
op7_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op7_workspace_size, &op7_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op7_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - size_t op8_workspace_size = 0; - size_t op8_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op8, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op8_workspace_size, &op8_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op8_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - size_t op9_workspace_size = 0; - size_t op9_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op9, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op9_workspace_size, &op9_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op9_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - size_t op10_workspace_size = 0; - size_t op10_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op10, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op10_workspace_size, &op10_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op10_workspace_size); - if (status != xnn_status_success) 
{ - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - size_t op11_workspace_size = 0; - size_t op11_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op11, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op11_workspace_size, &op11_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op11_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - size_t op12_workspace_size = 0; - size_t op12_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op12, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op12_workspace_size, &op12_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op12_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - size_t op13_workspace_size = 0; - size_t op13_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op13, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op13_workspace_size, &op13_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op13_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 24 }; - const size_t b_shape[] = { 1, 28, 28, 24 }; - status = xnn_reshape_add_nd_f32( - op14, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation 
#14" << std::endl; - return ExecutionPlan(); - } - - size_t op15_workspace_size = 0; - size_t op15_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op15, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op15_workspace_size, &op15_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op15_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op16, - /*batch_size=*/784, - 96 /* channels */, - 96 /* input stride */, - 96 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - size_t op17_workspace_size = 0; - size_t op17_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op17, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op17_workspace_size, &op17_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op17_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op18, - /*batch_size=*/196, - 96 /* channels */, - 96 /* input stride */, - 96 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - size_t op19_workspace_size = 0; - size_t op19_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op19, - /*batch_size=*/1, 196 /* width */, - 96 /* channels */, 96 /* input stride */, 96 /* output stride */, - 
&op19_workspace_size, &op19_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op19_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - size_t op20_workspace_size = 0; - size_t op20_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op20, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op20_workspace_size, &op20_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op20_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - size_t op21_workspace_size = 0; - size_t op21_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op21, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op21_workspace_size, &op21_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op21_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 96 }; - const size_t b_shape[] = { 1, 1, 1, 96 }; - status = xnn_reshape_multiply_nd_f32( - op22, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - size_t op23_workspace_size = 0; - size_t op23_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op23, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op23_workspace_size, &op23_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, 
- /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op23_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - size_t op24_workspace_size = 0; - size_t op24_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op24, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op24_workspace_size, &op24_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op24_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op25, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - size_t op26_workspace_size = 0; - size_t op26_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op26, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op26_workspace_size, &op26_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op26_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op27, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - size_t op28_workspace_size = 0; - size_t 
op28_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op28, - /*batch_size=*/1, 196 /* width */, - 240 /* channels */, 240 /* input stride */, 240 /* output stride */, - &op28_workspace_size, &op28_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op28_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - size_t op29_workspace_size = 0; - size_t op29_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op29, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op29_workspace_size, &op29_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op29_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #29" << std::endl; - return ExecutionPlan(); - } - - size_t op30_workspace_size = 0; - size_t op30_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op30, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op30_workspace_size, &op30_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op30_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #30" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 240 }; - const size_t b_shape[] = { 1, 1, 1, 240 }; - status = xnn_reshape_multiply_nd_f32( - op31, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #31" << std::endl; - return ExecutionPlan(); - } - - size_t op32_workspace_size = 0; - size_t op32_workspace_alignment = 0; - status = 
xnn_reshape_convolution2d_nhwc_f32( - op32, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op32_workspace_size, &op32_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op32_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #32" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 40 }; - const size_t b_shape[] = { 1, 14, 14, 40 }; - status = xnn_reshape_add_nd_f32( - op33, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #33" << std::endl; - return ExecutionPlan(); - } - - size_t op34_workspace_size = 0; - size_t op34_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op34, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op34_workspace_size, &op34_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op34_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op35, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #35" << std::endl; - return ExecutionPlan(); - } - - size_t op36_workspace_size = 0; - size_t op36_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op36, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op36_workspace_size, &op36_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = 
std::max(max_workspace_size, op36_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op37, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #37" << std::endl; - return ExecutionPlan(); - } - - size_t op38_workspace_size = 0; - size_t op38_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op38, - /*batch_size=*/1, 196 /* width */, - 240 /* channels */, 240 /* input stride */, 240 /* output stride */, - &op38_workspace_size, &op38_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op38_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #38" << std::endl; - return ExecutionPlan(); - } - - size_t op39_workspace_size = 0; - size_t op39_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op39, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op39_workspace_size, &op39_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op39_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #39" << std::endl; - return ExecutionPlan(); - } - - size_t op40_workspace_size = 0; - size_t op40_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op40, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op40_workspace_size, &op40_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op40_workspace_size); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #40" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 240 }; - const size_t b_shape[] = { 1, 1, 1, 240 }; - status = xnn_reshape_multiply_nd_f32( - op41, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #41" << std::endl; - return ExecutionPlan(); - } - - size_t op42_workspace_size = 0; - size_t op42_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op42, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op42_workspace_size, &op42_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op42_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #42" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 40 }; - const size_t b_shape[] = { 1, 14, 14, 40 }; - status = xnn_reshape_add_nd_f32( - op43, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #43" << std::endl; - return ExecutionPlan(); - } - - size_t op44_workspace_size = 0; - size_t op44_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op44, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op44_workspace_size, &op44_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op44_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op45, - /*batch_size=*/196, - 120 /* channels */, - 120 /* input stride */, - 120 /* 
output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #45" << std::endl; - return ExecutionPlan(); - } - - size_t op46_workspace_size = 0; - size_t op46_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op46, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op46_workspace_size, &op46_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op46_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op47, - /*batch_size=*/196, - 120 /* channels */, - 120 /* input stride */, - 120 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #47" << std::endl; - return ExecutionPlan(); - } - - size_t op48_workspace_size = 0; - size_t op48_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op48, - /*batch_size=*/1, 196 /* width */, - 120 /* channels */, 120 /* input stride */, 120 /* output stride */, - &op48_workspace_size, &op48_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op48_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #48" << std::endl; - return ExecutionPlan(); - } - - size_t op49_workspace_size = 0; - size_t op49_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op49, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op49_workspace_size, &op49_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op49_workspace_size); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #49" << std::endl; - return ExecutionPlan(); - } - - size_t op50_workspace_size = 0; - size_t op50_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op50, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op50_workspace_size, &op50_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op50_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #50" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 120 }; - const size_t b_shape[] = { 1, 1, 1, 120 }; - status = xnn_reshape_multiply_nd_f32( - op51, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #51" << std::endl; - return ExecutionPlan(); - } - - size_t op52_workspace_size = 0; - size_t op52_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op52, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op52_workspace_size, &op52_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op52_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #52" << std::endl; - return ExecutionPlan(); - } - - size_t op53_workspace_size = 0; - size_t op53_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op53, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op53_workspace_size, &op53_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op53_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed 
to reshape operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op54, - /*batch_size=*/196, - 144 /* channels */, - 144 /* input stride */, - 144 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #54" << std::endl; - return ExecutionPlan(); - } - - size_t op55_workspace_size = 0; - size_t op55_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op55, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op55_workspace_size, &op55_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op55_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op56, - /*batch_size=*/196, - 144 /* channels */, - 144 /* input stride */, - 144 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #56" << std::endl; - return ExecutionPlan(); - } - - size_t op57_workspace_size = 0; - size_t op57_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op57, - /*batch_size=*/1, 196 /* width */, - 144 /* channels */, 144 /* input stride */, 144 /* output stride */, - &op57_workspace_size, &op57_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op57_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #57" << std::endl; - return ExecutionPlan(); - } - - size_t op58_workspace_size = 0; - size_t op58_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op58, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op58_workspace_size, &op58_workspace_alignment, 
- /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op58_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #58" << std::endl; - return ExecutionPlan(); - } - - size_t op59_workspace_size = 0; - size_t op59_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op59, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op59_workspace_size, &op59_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op59_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #59" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 144 }; - const size_t b_shape[] = { 1, 1, 1, 144 }; - status = xnn_reshape_multiply_nd_f32( - op60, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #60" << std::endl; - return ExecutionPlan(); - } - - size_t op61_workspace_size = 0; - size_t op61_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op61, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op61_workspace_size, &op61_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op61_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #61" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 48 }; - const size_t b_shape[] = { 1, 14, 14, 48 }; - status = xnn_reshape_add_nd_f32( - op62, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation 
#62" << std::endl; - return ExecutionPlan(); - } - - size_t op63_workspace_size = 0; - size_t op63_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op63, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op63_workspace_size, &op63_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op63_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #63" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op64, - /*batch_size=*/196, - 288 /* channels */, - 288 /* input stride */, - 288 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #64" << std::endl; - return ExecutionPlan(); - } - - size_t op65_workspace_size = 0; - size_t op65_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op65, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op65_workspace_size, &op65_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op65_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #65" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op66, - /*batch_size=*/49, - 288 /* channels */, - 288 /* input stride */, - 288 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #66" << std::endl; - return ExecutionPlan(); - } - - size_t op67_workspace_size = 0; - size_t op67_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op67, - /*batch_size=*/1, 49 /* width */, - 288 /* channels */, 288 /* input stride */, 288 /* output stride */, - 
&op67_workspace_size, &op67_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op67_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #67" << std::endl; - return ExecutionPlan(); - } - - size_t op68_workspace_size = 0; - size_t op68_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op68, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op68_workspace_size, &op68_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op68_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #68" << std::endl; - return ExecutionPlan(); - } - - size_t op69_workspace_size = 0; - size_t op69_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op69, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op69_workspace_size, &op69_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op69_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #69" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 288 }; - const size_t b_shape[] = { 1, 1, 1, 288 }; - status = xnn_reshape_multiply_nd_f32( - op70, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #70" << std::endl; - return ExecutionPlan(); - } - - size_t op71_workspace_size = 0; - size_t op71_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op71, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op71_workspace_size, &op71_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - 
/*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op71_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #71" << std::endl; - return ExecutionPlan(); - } - - size_t op72_workspace_size = 0; - size_t op72_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op72, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op72_workspace_size, &op72_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op72_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #72" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op73, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #73" << std::endl; - return ExecutionPlan(); - } - - size_t op74_workspace_size = 0; - size_t op74_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op74, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op74_workspace_size, &op74_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op74_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #74" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op75, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #75" << std::endl; - return ExecutionPlan(); - } - - size_t op76_workspace_size = 0; - size_t op76_workspace_alignment = 0; 
- status = xnn_reshape_global_average_pooling_nwc_f32( - op76, - /*batch_size=*/1, 49 /* width */, - 576 /* channels */, 576 /* input stride */, 576 /* output stride */, - &op76_workspace_size, &op76_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op76_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #76" << std::endl; - return ExecutionPlan(); - } - - size_t op77_workspace_size = 0; - size_t op77_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op77, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op77_workspace_size, &op77_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op77_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #77" << std::endl; - return ExecutionPlan(); - } - - size_t op78_workspace_size = 0; - size_t op78_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op78, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op78_workspace_size, &op78_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op78_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #78" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 576 }; - const size_t b_shape[] = { 1, 1, 1, 576 }; - status = xnn_reshape_multiply_nd_f32( - op79, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #79" << std::endl; - return ExecutionPlan(); - } - - size_t op80_workspace_size = 0; - size_t op80_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op80, - 
/*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op80_workspace_size, &op80_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op80_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #80" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 96 }; - const size_t b_shape[] = { 1, 7, 7, 96 }; - status = xnn_reshape_add_nd_f32( - op81, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #81" << std::endl; - return ExecutionPlan(); - } - - size_t op82_workspace_size = 0; - size_t op82_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op82, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op82_workspace_size, &op82_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op82_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #82" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op83, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #83" << std::endl; - return ExecutionPlan(); - } - - size_t op84_workspace_size = 0; - size_t op84_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op84, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op84_workspace_size, &op84_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op84_workspace_size); - if (status 
!= xnn_status_success) { - std::cerr << "failed to reshape operation #84" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op85, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #85" << std::endl; - return ExecutionPlan(); - } - - size_t op86_workspace_size = 0; - size_t op86_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op86, - /*batch_size=*/1, 49 /* width */, - 576 /* channels */, 576 /* input stride */, 576 /* output stride */, - &op86_workspace_size, &op86_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op86_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #86" << std::endl; - return ExecutionPlan(); - } - - size_t op87_workspace_size = 0; - size_t op87_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op87, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op87_workspace_size, &op87_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op87_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #87" << std::endl; - return ExecutionPlan(); - } - - size_t op88_workspace_size = 0; - size_t op88_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op88, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op88_workspace_size, &op88_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op88_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #88" << 
std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 576 }; - const size_t b_shape[] = { 1, 1, 1, 576 }; - status = xnn_reshape_multiply_nd_f32( - op89, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #89" << std::endl; - return ExecutionPlan(); - } - - size_t op90_workspace_size = 0; - size_t op90_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op90, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op90_workspace_size, &op90_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op90_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #90" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 96 }; - const size_t b_shape[] = { 1, 7, 7, 96 }; - status = xnn_reshape_add_nd_f32( - op91, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #91" << std::endl; - return ExecutionPlan(); - } - - size_t op92_workspace_size = 0; - size_t op92_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op92, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op92_workspace_size, &op92_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op92_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #92" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op93, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - 
std::cerr << "failed to reshape operation #93" << std::endl; - return ExecutionPlan(); - } - - size_t op94_workspace_size = 0; - size_t op94_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op94, - /*batch_size=*/1, 49 /* width */, - 576 /* channels */, 576 /* input stride */, 576 /* output stride */, - &op94_workspace_size, &op94_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op94_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #94" << std::endl; - return ExecutionPlan(); - } - - size_t op95_workspace_size = 0; - size_t op95_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op95, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op95_workspace_size, &op95_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op95_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #95" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op96, - /*batch_size=*/1, - 1024 /* channels */, - 1024 /* input stride */, - 1024 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #96" << std::endl; - return ExecutionPlan(); - } - - size_t op97_workspace_size = 0; - size_t op97_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op97, - /*batch_size=*/1, 1 /* width */, - 1024 /* channels */, 1024 /* input stride */, 1024 /* output stride */, - &op97_workspace_size, &op97_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op97_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #97" << std::endl; - return ExecutionPlan(); 
- } - - size_t op98_workspace_size = 0; - size_t op98_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op98, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op98_workspace_size, &op98_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op98_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #98" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x32( - op99, - /*batch_size=*/1001, - 1 /* channels */, - 1 /* input stride */, - 1 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #99" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_softmax_nc_f32( - op100, - /*channels=*/1001, - /*input_stride=*/1001, - /*output_stride=*/1001, - /*batch_size=*/1, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #100" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nhwc_f32( - op0, - workspace.data(), /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op1, - /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op2, - workspace.data(), /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op3, - workspace.data(), - 
/*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op4, - workspace.data(), /*input=*/v4.data(), /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op5, - workspace.data(), /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op6, - v3.data() /* a */, v6.data() /* b */, /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op7, - workspace.data(), /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op8, - workspace.data(), /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op9, - workspace.data(), /*input=*/v9.data(), /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op10, - workspace.data(), /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op11, - workspace.data(), /*input=*/v11.data(), 
/*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op12, - workspace.data(), /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op13, - workspace.data(), /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op14, - v14.data() /* a */, v11.data() /* b */, /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op15, - workspace.data(), /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op16, - /*input=*/v16.data(), /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op17, - workspace.data(), /*input=*/v17.data(), /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op18, - /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op19, - workspace.data(), - /*input=*/v19.data(), /*output=*/v20.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op20, - workspace.data(), /*input=*/v20.data(), /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op21, - workspace.data(), /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op22, - v19.data() /* a */, v22.data() /* b */, /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op23, - workspace.data(), /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op24, - workspace.data(), /*input=*/v24.data(), /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op25, - /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op26, - workspace.data(), /*input=*/v26.data(), /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op27, - /*input=*/v27.data(), /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup 
operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op28, - workspace.data(), - /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op29, - workspace.data(), /*input=*/v29.data(), /*output=*/v30.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op30, - workspace.data(), /*input=*/v30.data(), /*output=*/v31.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #30" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op31, - v28.data() /* a */, v31.data() /* b */, /*output=*/v32.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op32, - workspace.data(), /*input=*/v32.data(), /*output=*/v33.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #32" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op33, - v33.data() /* a */, v24.data() /* b */, /*output=*/v34.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #33" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op34, - workspace.data(), /*input=*/v34.data(), /*output=*/v35.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op35, - /*input=*/v35.data(), /*output=*/v36.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #35" << std::endl; - 
return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op36, - workspace.data(), /*input=*/v36.data(), /*output=*/v37.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op37, - /*input=*/v37.data(), /*output=*/v38.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op38, - workspace.data(), - /*input=*/v38.data(), /*output=*/v39.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op39, - workspace.data(), /*input=*/v39.data(), /*output=*/v40.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op40, - workspace.data(), /*input=*/v40.data(), /*output=*/v41.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op41, - v38.data() /* a */, v41.data() /* b */, /*output=*/v42.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #41" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op42, - workspace.data(), /*input=*/v42.data(), /*output=*/v43.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op43, - v43.data() /* a */, v34.data() /* b */, /*output=*/v44.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #43" << std::endl; - return ExecutionPlan(); - } - - 
status = xnn_setup_convolution2d_nhwc_f32( - op44, - workspace.data(), /*input=*/v44.data(), /*output=*/v45.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op45, - /*input=*/v45.data(), /*output=*/v46.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #45" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op46, - workspace.data(), /*input=*/v46.data(), /*output=*/v47.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op47, - /*input=*/v47.data(), /*output=*/v48.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op48, - workspace.data(), - /*input=*/v48.data(), /*output=*/v49.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op49, - workspace.data(), /*input=*/v49.data(), /*output=*/v50.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #49" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op50, - workspace.data(), /*input=*/v50.data(), /*output=*/v51.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #50" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op51, - v48.data() /* a */, v51.data() /* b */, /*output=*/v52.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - 
op52, - workspace.data(), /*input=*/v52.data(), /*output=*/v53.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op53, - workspace.data(), /*input=*/v53.data(), /*output=*/v54.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op54, - /*input=*/v54.data(), /*output=*/v55.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op55, - workspace.data(), /*input=*/v55.data(), /*output=*/v56.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op56, - /*input=*/v56.data(), /*output=*/v57.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #56" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op57, - workspace.data(), - /*input=*/v57.data(), /*output=*/v58.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op58, - workspace.data(), /*input=*/v58.data(), /*output=*/v59.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op59, - workspace.data(), /*input=*/v59.data(), /*output=*/v60.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #59" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op60, - v57.data() /* a */, v60.data() /* b 
*/, /*output=*/v61.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op61, - workspace.data(), /*input=*/v61.data(), /*output=*/v62.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #61" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op62, - v62.data() /* a */, v53.data() /* b */, /*output=*/v63.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #62" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op63, - workspace.data(), /*input=*/v63.data(), /*output=*/v64.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #63" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op64, - /*input=*/v64.data(), /*output=*/v65.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #64" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op65, - workspace.data(), /*input=*/v65.data(), /*output=*/v66.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #65" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op66, - /*input=*/v66.data(), /*output=*/v67.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #66" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op67, - workspace.data(), - /*input=*/v67.data(), /*output=*/v68.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #67" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op68, - workspace.data(), /*input=*/v68.data(), /*output=*/v69.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #68" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op69, - workspace.data(), /*input=*/v69.data(), /*output=*/v70.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #69" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op70, - v67.data() /* a */, v70.data() /* b */, /*output=*/v71.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #70" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op71, - workspace.data(), /*input=*/v71.data(), /*output=*/v72.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #71" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op72, - workspace.data(), /*input=*/v72.data(), /*output=*/v73.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #72" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op73, - /*input=*/v73.data(), /*output=*/v74.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #73" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op74, - workspace.data(), /*input=*/v74.data(), /*output=*/v75.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #74" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op75, - /*input=*/v75.data(), /*output=*/v76.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #75" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op76, - workspace.data(), - /*input=*/v76.data(), /*output=*/v77.data()); - if (status != xnn_status_success) { - std::cerr << "failed 
to setup operation #76" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op77, - workspace.data(), /*input=*/v77.data(), /*output=*/v78.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #77" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op78, - workspace.data(), /*input=*/v78.data(), /*output=*/v79.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #78" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op79, - v76.data() /* a */, v79.data() /* b */, /*output=*/v80.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #79" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op80, - workspace.data(), /*input=*/v80.data(), /*output=*/v81.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #80" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op81, - v81.data() /* a */, v72.data() /* b */, /*output=*/v82.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #81" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op82, - workspace.data(), /*input=*/v82.data(), /*output=*/v83.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #82" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op83, - /*input=*/v83.data(), /*output=*/v84.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #83" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op84, - workspace.data(), /*input=*/v84.data(), /*output=*/v85.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #84" << std::endl; - 
return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op85, - /*input=*/v85.data(), /*output=*/v86.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #85" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op86, - workspace.data(), - /*input=*/v86.data(), /*output=*/v87.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #86" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op87, - workspace.data(), /*input=*/v87.data(), /*output=*/v88.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #87" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op88, - workspace.data(), /*input=*/v88.data(), /*output=*/v89.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #88" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op89, - v86.data() /* a */, v89.data() /* b */, /*output=*/v90.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #89" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op90, - workspace.data(), /*input=*/v90.data(), /*output=*/v91.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #90" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op91, - v91.data() /* a */, v82.data() /* b */, /*output=*/v92.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #91" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op92, - workspace.data(), /*input=*/v92.data(), /*output=*/v93.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #92" << std::endl; - return ExecutionPlan(); - } - - 
status = xnn_setup_hardswish_nc_f32( - op93, - /*input=*/v93.data(), /*output=*/v94.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #93" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op94, - workspace.data(), - /*input=*/v94.data(), /*output=*/v95.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #94" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op95, - workspace.data(), /*input=*/v95.data(), /*output=*/v96.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #95" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op96, - /*input=*/v96.data(), /*output=*/v97.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #96" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op97, - workspace.data(), - /*input=*/v97.data(), /*output=*/v98.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #97" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op98, - workspace.data(), /*input=*/v98.data(), /*output=*/v99.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #98" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x32( - op99, - /*input=*/v99.data(), /*output=*/v100.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #99" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_softmax_nc_f32( - op100, - /*input=*/v100.data(), /*output=*/v101.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #100" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") - XNN_PRAGMA_CLANG("clang 
diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -} // namespace models diff --git a/models/fp32-sparse-mobilenet-v1.cc b/models/fp32-sparse-mobilenet-v1.cc deleted file mode 100644 index 4b08cab1e23..00000000000 --- a/models/fp32-sparse-mobilenet-v1.cc +++ /dev/null @@ -1,1448 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/common.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan FP32SparseMobileNetV1(float sparsity, pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static std::array w30; - 
alignas(16) static std::array w31; - alignas(16) static std::array w32; - alignas(16) static std::array w33; - alignas(16) static std::array w34; - alignas(16) static std::array w35; - alignas(16) static std::array w36; - alignas(16) static std::array w37; - alignas(16) static std::array w38; - alignas(16) static std::array w39; - alignas(16) static std::array w40; - alignas(16) static std::array w41; - alignas(16) static std::array w42; - alignas(16) static std::array w43; - alignas(16) static std::array w44; - alignas(16) static std::array w45; - alignas(16) static std::array w46; - alignas(16) static std::array w47; - alignas(16) static std::array w48; - alignas(16) static std::array w49; - alignas(16) static std::array w50; - alignas(16) static std::array w51; - alignas(16) static std::array w52; - alignas(16) static std::array w53; - alignas(16) static std::array w54; - alignas(16) static std::array w55; - alignas(16) static std::array w56; - alignas(16) static std::array w57; - alignas(16) static std::array w58; - alignas(16) static std::array w59; - alignas(16) static std::array w60; - alignas(16) static std::array w61; - alignas(16) static std::array w62; - alignas(16) static std::array w63; - alignas(16) static std::array w64; - alignas(16) static std::array w65; - alignas(16) static std::array w66; - alignas(16) static std::array w67; - alignas(16) static std::array w68; - alignas(16) static std::array w69; - alignas(16) static std::array w70; - alignas(16) static std::array w71; - alignas(16) static std::array w72; - alignas(16) static std::array w73; - alignas(16) static std::array w74; - alignas(16) static std::array w75; - alignas(16) static std::array w76; - alignas(16) static std::array w77; - alignas(16) static std::array w78; - alignas(16) static std::array w79; - alignas(16) static std::array w80; - alignas(16) static std::array w81; - alignas(16) static std::array w82; - alignas(16) static std::array w83; - alignas(16) static std::array w84; - 
alignas(16) static std::array w85; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); - std::generate(v0.begin(), v0.end(), std::ref(f32rng)); - std::generate(v1.begin(), v1.end(), std::ref(f32rng)); - std::generate(v2.begin(), v2.end(), std::ref(f32rng)); - std::generate(v3.begin(), v3.end(), std::ref(f32rng)); - std::generate(v4.begin(), v4.end(), std::ref(f32rng)); - std::generate(v5.begin(), v5.end(), std::ref(f32rng)); - std::generate(v6.begin(), v6.end(), std::ref(f32rng)); - std::generate(v7.begin(), v7.end(), std::ref(f32rng)); - std::generate(v8.begin(), v8.end(), std::ref(f32rng)); - std::generate(v9.begin(), v9.end(), std::ref(f32rng)); - std::generate(v10.begin(), v10.end(), std::ref(f32rng)); - std::generate(v11.begin(), v11.end(), std::ref(f32rng)); - std::generate(v12.begin(), v12.end(), std::ref(f32rng)); - std::generate(v13.begin(), v13.end(), std::ref(f32rng)); - std::generate(v14.begin(), v14.end(), std::ref(f32rng)); - std::generate(v15.begin(), v15.end(), std::ref(f32rng)); - std::generate(v16.begin(), v16.end(), std::ref(f32rng)); - std::generate(v17.begin(), v17.end(), std::ref(f32rng)); - std::generate(v18.begin(), v18.end(), std::ref(f32rng)); - std::generate(v19.begin(), v19.end(), std::ref(f32rng)); - std::generate(v20.begin(), v20.end(), std::ref(f32rng)); - std::generate(v21.begin(), v21.end(), std::ref(f32rng)); - std::generate(v22.begin(), v22.end(), std::ref(f32rng)); - std::generate(v23.begin(), v23.end(), std::ref(f32rng)); - std::generate(v24.begin(), v24.end(), std::ref(f32rng)); - std::generate(v25.begin(), v25.end(), std::ref(f32rng)); - std::generate(v26.begin(), v26.end(), std::ref(f32rng)); - std::generate(v27.begin(), v27.end(), std::ref(f32rng)); - std::generate(v28.begin(), v28.end(), std::ref(f32rng)); - std::generate(v29.begin(), v29.end(), std::ref(f32rng)); - std::generate(w30.begin(), w30.end(), 
std::ref(f32rng)); - std::generate(w31.begin(), w31.end(), std::ref(f32rng)); - std::generate(w32.begin(), w32.end(), std::ref(f32rng)); - std::generate(w33.begin(), w33.end(), std::ref(f32rng)); - std::fill(w34.begin(), w34.end(), 0.0f); - std::generate(w34.begin(), w34.end() - size_t(sparsity * w34.size()), std::ref(f32rng)); - std::shuffle(w34.begin(), w34.end(), rng); - std::generate(w35.begin(), w35.end(), std::ref(f32rng)); - std::generate(w36.begin(), w36.end(), std::ref(f32rng)); - std::generate(w37.begin(), w37.end(), std::ref(f32rng)); - std::fill(w38.begin(), w38.end(), 0.0f); - std::generate(w38.begin(), w38.end() - size_t(sparsity * w38.size()), std::ref(f32rng)); - std::shuffle(w38.begin(), w38.end(), rng); - std::generate(w39.begin(), w39.end(), std::ref(f32rng)); - std::generate(w40.begin(), w40.end(), std::ref(f32rng)); - std::generate(w41.begin(), w41.end(), std::ref(f32rng)); - std::fill(w42.begin(), w42.end(), 0.0f); - std::generate(w42.begin(), w42.end() - size_t(sparsity * w42.size()), std::ref(f32rng)); - std::shuffle(w42.begin(), w42.end(), rng); - std::generate(w43.begin(), w43.end(), std::ref(f32rng)); - std::generate(w44.begin(), w44.end(), std::ref(f32rng)); - std::generate(w45.begin(), w45.end(), std::ref(f32rng)); - std::fill(w46.begin(), w46.end(), 0.0f); - std::generate(w46.begin(), w46.end() - size_t(sparsity * w46.size()), std::ref(f32rng)); - std::shuffle(w46.begin(), w46.end(), rng); - std::generate(w47.begin(), w47.end(), std::ref(f32rng)); - std::generate(w48.begin(), w48.end(), std::ref(f32rng)); - std::generate(w49.begin(), w49.end(), std::ref(f32rng)); - std::fill(w50.begin(), w50.end(), 0.0f); - std::generate(w50.begin(), w50.end() - size_t(sparsity * w50.size()), std::ref(f32rng)); - std::shuffle(w50.begin(), w50.end(), rng); - std::generate(w51.begin(), w51.end(), std::ref(f32rng)); - std::generate(w52.begin(), w52.end(), std::ref(f32rng)); - std::generate(w53.begin(), w53.end(), std::ref(f32rng)); - 
std::fill(w54.begin(), w54.end(), 0.0f); - std::generate(w54.begin(), w54.end() - size_t(sparsity * w54.size()), std::ref(f32rng)); - std::shuffle(w54.begin(), w54.end(), rng); - std::generate(w55.begin(), w55.end(), std::ref(f32rng)); - std::generate(w56.begin(), w56.end(), std::ref(f32rng)); - std::generate(w57.begin(), w57.end(), std::ref(f32rng)); - std::fill(w58.begin(), w58.end(), 0.0f); - std::generate(w58.begin(), w58.end() - size_t(sparsity * w58.size()), std::ref(f32rng)); - std::shuffle(w58.begin(), w58.end(), rng); - std::generate(w59.begin(), w59.end(), std::ref(f32rng)); - std::generate(w60.begin(), w60.end(), std::ref(f32rng)); - std::generate(w61.begin(), w61.end(), std::ref(f32rng)); - std::fill(w62.begin(), w62.end(), 0.0f); - std::generate(w62.begin(), w62.end() - size_t(sparsity * w62.size()), std::ref(f32rng)); - std::shuffle(w62.begin(), w62.end(), rng); - std::generate(w63.begin(), w63.end(), std::ref(f32rng)); - std::generate(w64.begin(), w64.end(), std::ref(f32rng)); - std::generate(w65.begin(), w65.end(), std::ref(f32rng)); - std::fill(w66.begin(), w66.end(), 0.0f); - std::generate(w66.begin(), w66.end() - size_t(sparsity * w66.size()), std::ref(f32rng)); - std::shuffle(w66.begin(), w66.end(), rng); - std::generate(w67.begin(), w67.end(), std::ref(f32rng)); - std::generate(w68.begin(), w68.end(), std::ref(f32rng)); - std::generate(w69.begin(), w69.end(), std::ref(f32rng)); - std::fill(w70.begin(), w70.end(), 0.0f); - std::generate(w70.begin(), w70.end() - size_t(sparsity * w70.size()), std::ref(f32rng)); - std::shuffle(w70.begin(), w70.end(), rng); - std::generate(w71.begin(), w71.end(), std::ref(f32rng)); - std::generate(w72.begin(), w72.end(), std::ref(f32rng)); - std::generate(w73.begin(), w73.end(), std::ref(f32rng)); - std::fill(w74.begin(), w74.end(), 0.0f); - std::generate(w74.begin(), w74.end() - size_t(sparsity * w74.size()), std::ref(f32rng)); - std::shuffle(w74.begin(), w74.end(), rng); - std::generate(w75.begin(), w75.end(), 
std::ref(f32rng)); - std::generate(w76.begin(), w76.end(), std::ref(f32rng)); - std::generate(w77.begin(), w77.end(), std::ref(f32rng)); - std::fill(w78.begin(), w78.end(), 0.0f); - std::generate(w78.begin(), w78.end() - size_t(sparsity * w78.size()), std::ref(f32rng)); - std::shuffle(w78.begin(), w78.end(), rng); - std::generate(w79.begin(), w79.end(), std::ref(f32rng)); - std::generate(w80.begin(), w80.end(), std::ref(f32rng)); - std::generate(w81.begin(), w81.end(), std::ref(f32rng)); - std::fill(w82.begin(), w82.end(), 0.0f); - std::generate(w82.begin(), w82.end() - size_t(sparsity * w82.size()), std::ref(f32rng)); - std::shuffle(w82.begin(), w82.end(), rng); - std::generate(w83.begin(), w83.end(), std::ref(f32rng)); - std::generate(w84.begin(), w84.end(), std::ref(f32rng)); - std::generate(w85.begin(), w85.end(), std::ref(f32rng)); - - Operators operators; - xnn_status status; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 3 /* input channels per group */, - 32 /* output_channels_per_group */, - 3 /* input pixel stride */, - 32 /* output pixel stride */, - w30.data(), w31.data(), - 0.0f /* output min */, 6.0f /* output max */, - XNN_FLAG_INPUT_NHWC /* flags */, - nullptr, - nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling 
width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 32 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 32 /* input pixel stride */, - 32 /* output pixel stride */, - w32.data(), w33.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 64 /* output_channels_per_group */, - 32 /* input pixel stride */, - 64 /* output pixel stride */, - w34.data(), w35.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 64 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 64 /* input pixel stride */, - 64 /* output pixel stride */, - w36.data(), w37.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" 
<< std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 128 /* output_channels_per_group */, - 64 /* input pixel stride */, - 128 /* output pixel stride */, - w38.data(), w39.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 128 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 128 /* input pixel stride */, - 128 /* output pixel stride */, - w40.data(), w41.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* 
dilation_width */, - 1 /* groups */, - 128 /* input channels per group */, - 128 /* output_channels_per_group */, - 128 /* input pixel stride */, - 128 /* output pixel stride */, - w42.data(), w43.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 128 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 128 /* input pixel stride */, - 128 /* output pixel stride */, - w44.data(), w45.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 128 /* input channels per group */, - 256 /* output_channels_per_group */, - 128 /* input pixel stride */, - 256 /* output pixel stride */, - w46.data(), w47.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return 
ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 256 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 256 /* input pixel stride */, - 256 /* output pixel stride */, - w48.data(), w49.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 256 /* input channels per group */, - 256 /* output_channels_per_group */, - 256 /* input pixel stride */, - 256 /* output pixel stride */, - w50.data(), w51.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 256 /* 
groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 256 /* input pixel stride */, - 256 /* output pixel stride */, - w52.data(), w53.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 256 /* input channels per group */, - 512 /* output_channels_per_group */, - 256 /* input pixel stride */, - 512 /* output pixel stride */, - w54.data(), w55.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w56.data(), w57.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - 
operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w58.data(), w59.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w60.data(), w61.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* 
input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w62.data(), w63.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w64.data(), w65.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w66.data(), w67.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - 
operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w68.data(), w69.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w70.data(), w71.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* 
input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w72.data(), w73.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w74.data(), w75.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - w76.data(), w77.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - 
operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 1024 /* output_channels_per_group */, - 512 /* input pixel stride */, - 1024 /* output pixel stride */, - w78.data(), w79.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1024 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 1024 /* input pixel stride */, - 1024 /* output pixel stride */, - w80.data(), w81.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 
1024 /* input channels per group */, - 1024 /* output_channels_per_group */, - 1024 /* input pixel stride */, - 1024 /* output pixel stride */, - w82.data(), w83.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_global_average_pooling_ncw_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 1024 /* input channels per group */, - 1001 /* output_channels_per_group */, - 1024 /* input pixel stride */, - 1001 /* output pixel stride */, - w84.data(), w85.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - status = xnn_reshape_convolution2d_nchw_f32( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return 
ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op1, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op3, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op4, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op5, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op6, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed 
to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op8, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op9, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op10, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op11, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op12, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op13, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op14, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op15, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op16, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op17, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op18, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, 
/*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op19, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op20, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op21, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op22, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op23, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op24, - /*batch_size=*/1, /*input_height=*/7, 
/*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op25, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op26, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op27, - /*batch_size=*/1, 49 /* width */, - 1024 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - size_t op28_workspace_size = 0; - size_t op28_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op28, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op28_workspace_size, &op28_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op28_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nchw_f32( - op0, - /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return 
ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op1, - /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op2, - /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op3, - /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op4, - /*input=*/v4.data(), /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op5, - /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op6, - /*input=*/v6.data(), /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op7, - /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op8, - /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op9, - /*input=*/v9.data(), /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << 
"failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op10, - /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op11, - /*input=*/v11.data(), /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op12, - /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op13, - /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op14, - /*input=*/v14.data(), /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op15, - /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op16, - /*input=*/v16.data(), /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op17, - /*input=*/v17.data(), /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op18, - 
/*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op19, - /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op20, - /*input=*/v20.data(), /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op21, - /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op22, - /*input=*/v22.data(), /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op23, - /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op24, - /*input=*/v24.data(), /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op25, - /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op26, - /*input=*/v26.data(), /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << 
std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f32( - op27, - /*input=*/v27.data(), /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op28, - workspace.data(), - /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -} // namespace models diff --git a/models/fp32-sparse-mobilenet-v2.cc b/models/fp32-sparse-mobilenet-v2.cc deleted file mode 100644 index a9539709d44..00000000000 --- a/models/fp32-sparse-mobilenet-v2.cc +++ /dev/null @@ -1,3031 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan FP32SparseMobileNetV2(float sparsity, pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static std::array v30; - alignas(16) static std::array v31; - alignas(16) static std::array v32; - alignas(16) static std::array v33; - alignas(16) static std::array v34; - alignas(16) static std::array v35; - alignas(16) static std::array v36; - alignas(16) static std::array v37; - alignas(16) static std::array v38; - alignas(16) static std::array v39; - alignas(16) static std::array v40; - alignas(16) static std::array v41; - alignas(16) static std::array v42; - alignas(16) static std::array v43; - alignas(16) static std::array v44; - alignas(16) static std::array v45; - alignas(16) static 
std::array v46; - alignas(16) static std::array v47; - alignas(16) static std::array v48; - alignas(16) static std::array v49; - alignas(16) static std::array v50; - alignas(16) static std::array v51; - alignas(16) static std::array v52; - alignas(16) static std::array v53; - alignas(16) static std::array v54; - alignas(16) static std::array v55; - alignas(16) static std::array v56; - alignas(16) static std::array v57; - alignas(16) static std::array v58; - alignas(16) static std::array v59; - alignas(16) static std::array v60; - alignas(16) static std::array v61; - alignas(16) static std::array v62; - alignas(16) static std::array v63; - alignas(16) static std::array v64; - alignas(16) static std::array w65; - alignas(16) static std::array w66; - alignas(16) static std::array w67; - alignas(16) static std::array w68; - alignas(16) static std::array w69; - alignas(16) static std::array w70; - alignas(16) static std::array w71; - alignas(16) static std::array w72; - alignas(16) static std::array w73; - alignas(16) static std::array w74; - alignas(16) static std::array w75; - alignas(16) static std::array w76; - alignas(16) static std::array w77; - alignas(16) static std::array w78; - alignas(16) static std::array w79; - alignas(16) static std::array w80; - alignas(16) static std::array w81; - alignas(16) static std::array w82; - alignas(16) static std::array w83; - alignas(16) static std::array w84; - alignas(16) static std::array w85; - alignas(16) static std::array w86; - alignas(16) static std::array w87; - alignas(16) static std::array w88; - alignas(16) static std::array w89; - alignas(16) static std::array w90; - alignas(16) static std::array w91; - alignas(16) static std::array w92; - alignas(16) static std::array w93; - alignas(16) static std::array w94; - alignas(16) static std::array w95; - alignas(16) static std::array w96; - alignas(16) static std::array w97; - alignas(16) static std::array w98; - alignas(16) static std::array w99; - alignas(16) static 
std::array w100; - alignas(16) static std::array w101; - alignas(16) static std::array w102; - alignas(16) static std::array w103; - alignas(16) static std::array w104; - alignas(16) static std::array w105; - alignas(16) static std::array w106; - alignas(16) static std::array w107; - alignas(16) static std::array w108; - alignas(16) static std::array w109; - alignas(16) static std::array w110; - alignas(16) static std::array w111; - alignas(16) static std::array w112; - alignas(16) static std::array w113; - alignas(16) static std::array w114; - alignas(16) static std::array w115; - alignas(16) static std::array w116; - alignas(16) static std::array w117; - alignas(16) static std::array w118; - alignas(16) static std::array w119; - alignas(16) static std::array w120; - alignas(16) static std::array w121; - alignas(16) static std::array w122; - alignas(16) static std::array w123; - alignas(16) static std::array w124; - alignas(16) static std::array w125; - alignas(16) static std::array w126; - alignas(16) static std::array w127; - alignas(16) static std::array w128; - alignas(16) static std::array w129; - alignas(16) static std::array w130; - alignas(16) static std::array w131; - alignas(16) static std::array w132; - alignas(16) static std::array w133; - alignas(16) static std::array w134; - alignas(16) static std::array w135; - alignas(16) static std::array w136; - alignas(16) static std::array w137; - alignas(16) static std::array w138; - alignas(16) static std::array w139; - alignas(16) static std::array w140; - alignas(16) static std::array w141; - alignas(16) static std::array w142; - alignas(16) static std::array w143; - alignas(16) static std::array w144; - alignas(16) static std::array w145; - alignas(16) static std::array w146; - alignas(16) static std::array w147; - alignas(16) static std::array w148; - alignas(16) static std::array w149; - alignas(16) static std::array w150; - alignas(16) static std::array w151; - alignas(16) static std::array w152; - 
alignas(16) static std::array w153; - alignas(16) static std::array w154; - alignas(16) static std::array w155; - alignas(16) static std::array w156; - alignas(16) static std::array w157; - alignas(16) static std::array w158; - alignas(16) static std::array w159; - alignas(16) static std::array w160; - alignas(16) static std::array w161; - alignas(16) static std::array w162; - alignas(16) static std::array w163; - alignas(16) static std::array w164; - alignas(16) static std::array w165; - alignas(16) static std::array w166; - alignas(16) static std::array w167; - alignas(16) static std::array w168; - alignas(16) static std::array w169; - alignas(16) static std::array w170; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); - std::generate(v0.begin(), v0.end(), std::ref(f32rng)); - std::generate(v1.begin(), v1.end(), std::ref(f32rng)); - std::generate(v2.begin(), v2.end(), std::ref(f32rng)); - std::generate(v3.begin(), v3.end(), std::ref(f32rng)); - std::generate(v4.begin(), v4.end(), std::ref(f32rng)); - std::generate(v5.begin(), v5.end(), std::ref(f32rng)); - std::generate(v6.begin(), v6.end(), std::ref(f32rng)); - std::generate(v7.begin(), v7.end(), std::ref(f32rng)); - std::generate(v8.begin(), v8.end(), std::ref(f32rng)); - std::generate(v9.begin(), v9.end(), std::ref(f32rng)); - std::generate(v10.begin(), v10.end(), std::ref(f32rng)); - std::generate(v11.begin(), v11.end(), std::ref(f32rng)); - std::generate(v12.begin(), v12.end(), std::ref(f32rng)); - std::generate(v13.begin(), v13.end(), std::ref(f32rng)); - std::generate(v14.begin(), v14.end(), std::ref(f32rng)); - std::generate(v15.begin(), v15.end(), std::ref(f32rng)); - std::generate(v16.begin(), v16.end(), std::ref(f32rng)); - std::generate(v17.begin(), v17.end(), std::ref(f32rng)); - std::generate(v18.begin(), v18.end(), std::ref(f32rng)); - std::generate(v19.begin(), v19.end(), 
std::ref(f32rng)); - std::generate(v20.begin(), v20.end(), std::ref(f32rng)); - std::generate(v21.begin(), v21.end(), std::ref(f32rng)); - std::generate(v22.begin(), v22.end(), std::ref(f32rng)); - std::generate(v23.begin(), v23.end(), std::ref(f32rng)); - std::generate(v24.begin(), v24.end(), std::ref(f32rng)); - std::generate(v25.begin(), v25.end(), std::ref(f32rng)); - std::generate(v26.begin(), v26.end(), std::ref(f32rng)); - std::generate(v27.begin(), v27.end(), std::ref(f32rng)); - std::generate(v28.begin(), v28.end(), std::ref(f32rng)); - std::generate(v29.begin(), v29.end(), std::ref(f32rng)); - std::generate(v30.begin(), v30.end(), std::ref(f32rng)); - std::generate(v31.begin(), v31.end(), std::ref(f32rng)); - std::generate(v32.begin(), v32.end(), std::ref(f32rng)); - std::generate(v33.begin(), v33.end(), std::ref(f32rng)); - std::generate(v34.begin(), v34.end(), std::ref(f32rng)); - std::generate(v35.begin(), v35.end(), std::ref(f32rng)); - std::generate(v36.begin(), v36.end(), std::ref(f32rng)); - std::generate(v37.begin(), v37.end(), std::ref(f32rng)); - std::generate(v38.begin(), v38.end(), std::ref(f32rng)); - std::generate(v39.begin(), v39.end(), std::ref(f32rng)); - std::generate(v40.begin(), v40.end(), std::ref(f32rng)); - std::generate(v41.begin(), v41.end(), std::ref(f32rng)); - std::generate(v42.begin(), v42.end(), std::ref(f32rng)); - std::generate(v43.begin(), v43.end(), std::ref(f32rng)); - std::generate(v44.begin(), v44.end(), std::ref(f32rng)); - std::generate(v45.begin(), v45.end(), std::ref(f32rng)); - std::generate(v46.begin(), v46.end(), std::ref(f32rng)); - std::generate(v47.begin(), v47.end(), std::ref(f32rng)); - std::generate(v48.begin(), v48.end(), std::ref(f32rng)); - std::generate(v49.begin(), v49.end(), std::ref(f32rng)); - std::generate(v50.begin(), v50.end(), std::ref(f32rng)); - std::generate(v51.begin(), v51.end(), std::ref(f32rng)); - std::generate(v52.begin(), v52.end(), std::ref(f32rng)); - std::generate(v53.begin(), 
v53.end(), std::ref(f32rng)); - std::generate(v54.begin(), v54.end(), std::ref(f32rng)); - std::generate(v55.begin(), v55.end(), std::ref(f32rng)); - std::generate(v56.begin(), v56.end(), std::ref(f32rng)); - std::generate(v57.begin(), v57.end(), std::ref(f32rng)); - std::generate(v58.begin(), v58.end(), std::ref(f32rng)); - std::generate(v59.begin(), v59.end(), std::ref(f32rng)); - std::generate(v60.begin(), v60.end(), std::ref(f32rng)); - std::generate(v61.begin(), v61.end(), std::ref(f32rng)); - std::generate(v62.begin(), v62.end(), std::ref(f32rng)); - std::generate(v63.begin(), v63.end(), std::ref(f32rng)); - std::generate(v64.begin(), v64.end(), std::ref(f32rng)); - std::generate(w65.begin(), w65.end(), std::ref(f32rng)); - std::generate(w66.begin(), w66.end(), std::ref(f32rng)); - std::generate(w67.begin(), w67.end(), std::ref(f32rng)); - std::generate(w68.begin(), w68.end(), std::ref(f32rng)); - std::fill(w69.begin(), w69.end(), 0.0f); - std::generate(w69.begin(), w69.end() - size_t(sparsity * w69.size()), std::ref(f32rng)); - std::shuffle(w69.begin(), w69.end(), rng); - std::generate(w70.begin(), w70.end(), std::ref(f32rng)); - std::fill(w71.begin(), w71.end(), 0.0f); - std::generate(w71.begin(), w71.end() - size_t(sparsity * w71.size()), std::ref(f32rng)); - std::shuffle(w71.begin(), w71.end(), rng); - std::generate(w72.begin(), w72.end(), std::ref(f32rng)); - std::generate(w73.begin(), w73.end(), std::ref(f32rng)); - std::generate(w74.begin(), w74.end(), std::ref(f32rng)); - std::fill(w75.begin(), w75.end(), 0.0f); - std::generate(w75.begin(), w75.end() - size_t(sparsity * w75.size()), std::ref(f32rng)); - std::shuffle(w75.begin(), w75.end(), rng); - std::generate(w76.begin(), w76.end(), std::ref(f32rng)); - std::fill(w77.begin(), w77.end(), 0.0f); - std::generate(w77.begin(), w77.end() - size_t(sparsity * w77.size()), std::ref(f32rng)); - std::shuffle(w77.begin(), w77.end(), rng); - std::generate(w78.begin(), w78.end(), std::ref(f32rng)); - 
std::generate(w79.begin(), w79.end(), std::ref(f32rng)); - std::generate(w80.begin(), w80.end(), std::ref(f32rng)); - std::fill(w81.begin(), w81.end(), 0.0f); - std::generate(w81.begin(), w81.end() - size_t(sparsity * w81.size()), std::ref(f32rng)); - std::shuffle(w81.begin(), w81.end(), rng); - std::generate(w82.begin(), w82.end(), std::ref(f32rng)); - std::fill(w83.begin(), w83.end(), 0.0f); - std::generate(w83.begin(), w83.end() - size_t(sparsity * w83.size()), std::ref(f32rng)); - std::shuffle(w83.begin(), w83.end(), rng); - std::generate(w84.begin(), w84.end(), std::ref(f32rng)); - std::generate(w85.begin(), w85.end(), std::ref(f32rng)); - std::generate(w86.begin(), w86.end(), std::ref(f32rng)); - std::fill(w87.begin(), w87.end(), 0.0f); - std::generate(w87.begin(), w87.end() - size_t(sparsity * w87.size()), std::ref(f32rng)); - std::shuffle(w87.begin(), w87.end(), rng); - std::generate(w88.begin(), w88.end(), std::ref(f32rng)); - std::fill(w89.begin(), w89.end(), 0.0f); - std::generate(w89.begin(), w89.end() - size_t(sparsity * w89.size()), std::ref(f32rng)); - std::shuffle(w89.begin(), w89.end(), rng); - std::generate(w90.begin(), w90.end(), std::ref(f32rng)); - std::generate(w91.begin(), w91.end(), std::ref(f32rng)); - std::generate(w92.begin(), w92.end(), std::ref(f32rng)); - std::fill(w93.begin(), w93.end(), 0.0f); - std::generate(w93.begin(), w93.end() - size_t(sparsity * w93.size()), std::ref(f32rng)); - std::shuffle(w93.begin(), w93.end(), rng); - std::generate(w94.begin(), w94.end(), std::ref(f32rng)); - std::fill(w95.begin(), w95.end(), 0.0f); - std::generate(w95.begin(), w95.end() - size_t(sparsity * w95.size()), std::ref(f32rng)); - std::shuffle(w95.begin(), w95.end(), rng); - std::generate(w96.begin(), w96.end(), std::ref(f32rng)); - std::generate(w97.begin(), w97.end(), std::ref(f32rng)); - std::generate(w98.begin(), w98.end(), std::ref(f32rng)); - std::fill(w99.begin(), w99.end(), 0.0f); - std::generate(w99.begin(), w99.end() - size_t(sparsity * 
w99.size()), std::ref(f32rng)); - std::shuffle(w99.begin(), w99.end(), rng); - std::generate(w100.begin(), w100.end(), std::ref(f32rng)); - std::fill(w101.begin(), w101.end(), 0.0f); - std::generate(w101.begin(), w101.end() - size_t(sparsity * w101.size()), std::ref(f32rng)); - std::shuffle(w101.begin(), w101.end(), rng); - std::generate(w102.begin(), w102.end(), std::ref(f32rng)); - std::generate(w103.begin(), w103.end(), std::ref(f32rng)); - std::generate(w104.begin(), w104.end(), std::ref(f32rng)); - std::fill(w105.begin(), w105.end(), 0.0f); - std::generate(w105.begin(), w105.end() - size_t(sparsity * w105.size()), std::ref(f32rng)); - std::shuffle(w105.begin(), w105.end(), rng); - std::generate(w106.begin(), w106.end(), std::ref(f32rng)); - std::fill(w107.begin(), w107.end(), 0.0f); - std::generate(w107.begin(), w107.end() - size_t(sparsity * w107.size()), std::ref(f32rng)); - std::shuffle(w107.begin(), w107.end(), rng); - std::generate(w108.begin(), w108.end(), std::ref(f32rng)); - std::generate(w109.begin(), w109.end(), std::ref(f32rng)); - std::generate(w110.begin(), w110.end(), std::ref(f32rng)); - std::fill(w111.begin(), w111.end(), 0.0f); - std::generate(w111.begin(), w111.end() - size_t(sparsity * w111.size()), std::ref(f32rng)); - std::shuffle(w111.begin(), w111.end(), rng); - std::generate(w112.begin(), w112.end(), std::ref(f32rng)); - std::fill(w113.begin(), w113.end(), 0.0f); - std::generate(w113.begin(), w113.end() - size_t(sparsity * w113.size()), std::ref(f32rng)); - std::shuffle(w113.begin(), w113.end(), rng); - std::generate(w114.begin(), w114.end(), std::ref(f32rng)); - std::generate(w115.begin(), w115.end(), std::ref(f32rng)); - std::generate(w116.begin(), w116.end(), std::ref(f32rng)); - std::fill(w117.begin(), w117.end(), 0.0f); - std::generate(w117.begin(), w117.end() - size_t(sparsity * w117.size()), std::ref(f32rng)); - std::shuffle(w117.begin(), w117.end(), rng); - std::generate(w118.begin(), w118.end(), std::ref(f32rng)); - 
std::fill(w119.begin(), w119.end(), 0.0f); - std::generate(w119.begin(), w119.end() - size_t(sparsity * w119.size()), std::ref(f32rng)); - std::shuffle(w119.begin(), w119.end(), rng); - std::generate(w120.begin(), w120.end(), std::ref(f32rng)); - std::generate(w121.begin(), w121.end(), std::ref(f32rng)); - std::generate(w122.begin(), w122.end(), std::ref(f32rng)); - std::fill(w123.begin(), w123.end(), 0.0f); - std::generate(w123.begin(), w123.end() - size_t(sparsity * w123.size()), std::ref(f32rng)); - std::shuffle(w123.begin(), w123.end(), rng); - std::generate(w124.begin(), w124.end(), std::ref(f32rng)); - std::fill(w125.begin(), w125.end(), 0.0f); - std::generate(w125.begin(), w125.end() - size_t(sparsity * w125.size()), std::ref(f32rng)); - std::shuffle(w125.begin(), w125.end(), rng); - std::generate(w126.begin(), w126.end(), std::ref(f32rng)); - std::generate(w127.begin(), w127.end(), std::ref(f32rng)); - std::generate(w128.begin(), w128.end(), std::ref(f32rng)); - std::fill(w129.begin(), w129.end(), 0.0f); - std::generate(w129.begin(), w129.end() - size_t(sparsity * w129.size()), std::ref(f32rng)); - std::shuffle(w129.begin(), w129.end(), rng); - std::generate(w130.begin(), w130.end(), std::ref(f32rng)); - std::fill(w131.begin(), w131.end(), 0.0f); - std::generate(w131.begin(), w131.end() - size_t(sparsity * w131.size()), std::ref(f32rng)); - std::shuffle(w131.begin(), w131.end(), rng); - std::generate(w132.begin(), w132.end(), std::ref(f32rng)); - std::generate(w133.begin(), w133.end(), std::ref(f32rng)); - std::generate(w134.begin(), w134.end(), std::ref(f32rng)); - std::fill(w135.begin(), w135.end(), 0.0f); - std::generate(w135.begin(), w135.end() - size_t(sparsity * w135.size()), std::ref(f32rng)); - std::shuffle(w135.begin(), w135.end(), rng); - std::generate(w136.begin(), w136.end(), std::ref(f32rng)); - std::fill(w137.begin(), w137.end(), 0.0f); - std::generate(w137.begin(), w137.end() - size_t(sparsity * w137.size()), std::ref(f32rng)); - 
std::shuffle(w137.begin(), w137.end(), rng); - std::generate(w138.begin(), w138.end(), std::ref(f32rng)); - std::generate(w139.begin(), w139.end(), std::ref(f32rng)); - std::generate(w140.begin(), w140.end(), std::ref(f32rng)); - std::fill(w141.begin(), w141.end(), 0.0f); - std::generate(w141.begin(), w141.end() - size_t(sparsity * w141.size()), std::ref(f32rng)); - std::shuffle(w141.begin(), w141.end(), rng); - std::generate(w142.begin(), w142.end(), std::ref(f32rng)); - std::fill(w143.begin(), w143.end(), 0.0f); - std::generate(w143.begin(), w143.end() - size_t(sparsity * w143.size()), std::ref(f32rng)); - std::shuffle(w143.begin(), w143.end(), rng); - std::generate(w144.begin(), w144.end(), std::ref(f32rng)); - std::generate(w145.begin(), w145.end(), std::ref(f32rng)); - std::generate(w146.begin(), w146.end(), std::ref(f32rng)); - std::fill(w147.begin(), w147.end(), 0.0f); - std::generate(w147.begin(), w147.end() - size_t(sparsity * w147.size()), std::ref(f32rng)); - std::shuffle(w147.begin(), w147.end(), rng); - std::generate(w148.begin(), w148.end(), std::ref(f32rng)); - std::fill(w149.begin(), w149.end(), 0.0f); - std::generate(w149.begin(), w149.end() - size_t(sparsity * w149.size()), std::ref(f32rng)); - std::shuffle(w149.begin(), w149.end(), rng); - std::generate(w150.begin(), w150.end(), std::ref(f32rng)); - std::generate(w151.begin(), w151.end(), std::ref(f32rng)); - std::generate(w152.begin(), w152.end(), std::ref(f32rng)); - std::fill(w153.begin(), w153.end(), 0.0f); - std::generate(w153.begin(), w153.end() - size_t(sparsity * w153.size()), std::ref(f32rng)); - std::shuffle(w153.begin(), w153.end(), rng); - std::generate(w154.begin(), w154.end(), std::ref(f32rng)); - std::fill(w155.begin(), w155.end(), 0.0f); - std::generate(w155.begin(), w155.end() - size_t(sparsity * w155.size()), std::ref(f32rng)); - std::shuffle(w155.begin(), w155.end(), rng); - std::generate(w156.begin(), w156.end(), std::ref(f32rng)); - std::generate(w157.begin(), w157.end(), 
std::ref(f32rng)); - std::generate(w158.begin(), w158.end(), std::ref(f32rng)); - std::fill(w159.begin(), w159.end(), 0.0f); - std::generate(w159.begin(), w159.end() - size_t(sparsity * w159.size()), std::ref(f32rng)); - std::shuffle(w159.begin(), w159.end(), rng); - std::generate(w160.begin(), w160.end(), std::ref(f32rng)); - std::fill(w161.begin(), w161.end(), 0.0f); - std::generate(w161.begin(), w161.end() - size_t(sparsity * w161.size()), std::ref(f32rng)); - std::shuffle(w161.begin(), w161.end(), rng); - std::generate(w162.begin(), w162.end(), std::ref(f32rng)); - std::generate(w163.begin(), w163.end(), std::ref(f32rng)); - std::generate(w164.begin(), w164.end(), std::ref(f32rng)); - std::fill(w165.begin(), w165.end(), 0.0f); - std::generate(w165.begin(), w165.end() - size_t(sparsity * w165.size()), std::ref(f32rng)); - std::shuffle(w165.begin(), w165.end(), rng); - std::generate(w166.begin(), w166.end(), std::ref(f32rng)); - std::fill(w167.begin(), w167.end(), 0.0f); - std::generate(w167.begin(), w167.end() - size_t(sparsity * w167.size()), std::ref(f32rng)); - std::shuffle(w167.begin(), w167.end(), rng); - std::generate(w168.begin(), w168.end(), std::ref(f32rng)); - std::fill(w169.begin(), w169.end(), 0.0f); - std::generate(w169.begin(), w169.end() - size_t(sparsity * w169.size()), std::ref(f32rng)); - std::shuffle(w169.begin(), w169.end(), rng); - std::generate(w170.begin(), w170.end(), std::ref(f32rng)); - - size_t max_workspace_size = 0; - Operators operators; - xnn_status status; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 3 /* input channels per group */, - 32 /* output_channels_per_group */, - 3 /* input pixel stride */, - 32 /* output pixel stride */, 
- w65.data(), w66.data(), - 0.0f /* output min */, 6.0f /* output max */, - XNN_FLAG_INPUT_NHWC /* flags */, - nullptr, - nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 32 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 32 /* input pixel stride */, - 32 /* output pixel stride */, - w67.data(), w68.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 16 /* output_channels_per_group */, - 32 /* input pixel stride */, - 16 /* output pixel stride */, - w69.data(), w70.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = 
xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 16 /* input channels per group */, - 96 /* output_channels_per_group */, - 16 /* input pixel stride */, - 96 /* output pixel stride */, - w71.data(), w72.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 96 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 96 /* input pixel stride */, - 96 /* output pixel stride */, - w73.data(), w74.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 24 /* output_channels_per_group */, - 96 /* input pixel stride */, - 24 /* output 
pixel stride */, - w75.data(), w76.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 144 /* output_channels_per_group */, - 24 /* input pixel stride */, - 144 /* output pixel stride */, - w77.data(), w78.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 144 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 144 /* input pixel stride */, - 144 /* output pixel stride */, - w79.data(), w80.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = 
xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 24 /* output_channels_per_group */, - 144 /* input pixel stride */, - 24 /* output pixel stride */, - w81.data(), w82.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 144 /* output_channels_per_group */, - 24 /* input pixel stride */, - 144 /* output pixel stride */, - w83.data(), w84.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = 
nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 144 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 144 /* input pixel stride */, - 144 /* output pixel stride */, - w85.data(), w86.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 32 /* output_channels_per_group */, - 144 /* input pixel stride */, - 32 /* output pixel stride */, - w87.data(), w88.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group 
*/, - 192 /* output_channels_per_group */, - 32 /* input pixel stride */, - 192 /* output pixel stride */, - w89.data(), w90.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 192 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 192 /* input pixel stride */, - 192 /* output pixel stride */, - w91.data(), w92.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 192 /* input channels per group */, - 32 /* output_channels_per_group */, - 192 /* input pixel stride */, - 32 /* output pixel stride */, - w93.data(), w94.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); 
- } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 192 /* output_channels_per_group */, - 32 /* input pixel stride */, - 192 /* output pixel stride */, - w95.data(), w96.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 192 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 192 /* input pixel stride */, - 192 /* output pixel stride */, - w97.data(), w98.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - 
operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 192 /* input channels per group */, - 32 /* output_channels_per_group */, - 192 /* input pixel stride */, - 32 /* output pixel stride */, - w99.data(), w100.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 192 /* output_channels_per_group */, - 32 /* input pixel stride */, - 192 /* output pixel stride */, - w101.data(), w102.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - 
return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 192 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 192 /* input pixel stride */, - 192 /* output pixel stride */, - w103.data(), w104.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 192 /* input channels per group */, - 64 /* output_channels_per_group */, - 192 /* input pixel stride */, - 64 /* output pixel stride */, - w105.data(), w106.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling 
width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 384 /* output_channels_per_group */, - 64 /* input pixel stride */, - 384 /* output pixel stride */, - w107.data(), w108.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 384 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 384 /* input pixel stride */, - 384 /* output pixel stride */, - w109.data(), w110.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 384 /* input channels per group */, - 64 /* output_channels_per_group */, - 384 /* input pixel stride */, - 64 /* output pixel stride */, - w111.data(), w112.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op26); - if 
(status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 384 /* output_channels_per_group */, - 64 /* input pixel stride */, - 384 /* output pixel stride */, - w113.data(), w114.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - xnn_operator_t op29 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 384 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 384 /* input pixel stride */, - 384 /* output pixel stride */, - w115.data(), w116.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op29); - if (status != 
xnn_status_success) { - std::cerr << "failed to create operation #29" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op29, xnn_delete_operator); - - xnn_operator_t op30 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 384 /* input channels per group */, - 64 /* output_channels_per_group */, - 384 /* input pixel stride */, - 64 /* output pixel stride */, - w117.data(), w118.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op30); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #30" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op30, xnn_delete_operator); - - xnn_operator_t op31 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op31); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #31" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op31, xnn_delete_operator); - - xnn_operator_t op32 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 384 /* output_channels_per_group */, - 64 /* input pixel stride */, - 384 /* output pixel stride */, - w119.data(), w120.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - 
nullptr, - &op32); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #32" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op32, xnn_delete_operator); - - xnn_operator_t op33 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 384 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 384 /* input pixel stride */, - 384 /* output pixel stride */, - w121.data(), w122.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op33); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #33" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op33, xnn_delete_operator); - - xnn_operator_t op34 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 384 /* input channels per group */, - 64 /* output_channels_per_group */, - 384 /* input pixel stride */, - 64 /* output pixel stride */, - w123.data(), w124.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op34); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #34" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op34, xnn_delete_operator); - - xnn_operator_t op35 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, 
std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op35); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #35" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op35, xnn_delete_operator); - - xnn_operator_t op36 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 384 /* output_channels_per_group */, - 64 /* input pixel stride */, - 384 /* output pixel stride */, - w125.data(), w126.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op36); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #36" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op36, xnn_delete_operator); - - xnn_operator_t op37 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 384 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 384 /* input pixel stride */, - 384 /* output pixel stride */, - w127.data(), w128.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op37); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #37" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op37, xnn_delete_operator); - - xnn_operator_t op38 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom 
padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 384 /* input channels per group */, - 96 /* output_channels_per_group */, - 384 /* input pixel stride */, - 96 /* output pixel stride */, - w129.data(), w130.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op38); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #38" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op38, xnn_delete_operator); - - xnn_operator_t op39 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - w131.data(), w132.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op39); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #39" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op39, xnn_delete_operator); - - xnn_operator_t op40 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - 
w133.data(), w134.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op40); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #40" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op40, xnn_delete_operator); - - xnn_operator_t op41 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 96 /* output_channels_per_group */, - 576 /* input pixel stride */, - 96 /* output pixel stride */, - w135.data(), w136.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op41); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #41" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op41, xnn_delete_operator); - - xnn_operator_t op42 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op42); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #42" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op42, xnn_delete_operator); - - xnn_operator_t op43 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 
/* output pixel stride */, - w137.data(), w138.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op43); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #43" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op43, xnn_delete_operator); - - xnn_operator_t op44 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - w139.data(), w140.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op44); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #44" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op44, xnn_delete_operator); - - xnn_operator_t op45 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 96 /* output_channels_per_group */, - 576 /* input pixel stride */, - 96 /* output pixel stride */, - w141.data(), w142.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op45); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #45" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op45, xnn_delete_operator); - - 
xnn_operator_t op46 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op46); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #46" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op46, xnn_delete_operator); - - xnn_operator_t op47 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - w143.data(), w144.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op47); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #47" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op47, xnn_delete_operator); - - xnn_operator_t op48 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - w145.data(), w146.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op48); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #48" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op48, xnn_delete_operator); - - xnn_operator_t op49 = 
nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 160 /* output_channels_per_group */, - 576 /* input pixel stride */, - 160 /* output pixel stride */, - w147.data(), w148.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op49); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #49" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op49, xnn_delete_operator); - - xnn_operator_t op50 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - w149.data(), w150.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op50); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #50" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op50, xnn_delete_operator); - - xnn_operator_t op51 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels 
per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, - 960 /* output pixel stride */, - w151.data(), w152.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op51); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #51" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op51, xnn_delete_operator); - - xnn_operator_t op52 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 160 /* output_channels_per_group */, - 960 /* input pixel stride */, - 160 /* output pixel stride */, - w153.data(), w154.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op52); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #52" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op52, xnn_delete_operator); - - xnn_operator_t op53 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op53); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #53" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op53, xnn_delete_operator); - - xnn_operator_t op54 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* 
groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - w155.data(), w156.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op54); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #54" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op54, xnn_delete_operator); - - xnn_operator_t op55 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, - 960 /* output pixel stride */, - w157.data(), w158.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op55); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #55" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op55, xnn_delete_operator); - - xnn_operator_t op56 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 160 /* output_channels_per_group */, - 960 /* input pixel stride */, - 160 /* output pixel stride */, - w159.data(), w160.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op56); - if (status != xnn_status_success) { - std::cerr << "failed to create 
operation #56" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op56, xnn_delete_operator); - - xnn_operator_t op57 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op57); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #57" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op57, xnn_delete_operator); - - xnn_operator_t op58 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - w161.data(), w162.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op58); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #58" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op58, xnn_delete_operator); - - xnn_operator_t op59 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, - 960 /* output pixel stride */, - w163.data(), w164.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op59); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #59" << 
std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op59, xnn_delete_operator); - - xnn_operator_t op60 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 320 /* output_channels_per_group */, - 960 /* input pixel stride */, - 320 /* output pixel stride */, - w165.data(), w166.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op60); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #60" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op60, xnn_delete_operator); - - xnn_operator_t op61 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 320 /* input channels per group */, - 1280 /* output_channels_per_group */, - 320 /* input pixel stride */, - 1280 /* output pixel stride */, - w167.data(), w168.data(), - 0.0f /* output min */, 6.0f /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op61); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #61" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op61, xnn_delete_operator); - - xnn_operator_t op62 = nullptr; - status = xnn_create_global_average_pooling_ncw_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op62); - if (status != xnn_status_success) { - std::cerr << "failed to 
create operation #62" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op62, xnn_delete_operator); - - xnn_operator_t op63 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 1280 /* input channels per group */, - 1001 /* output_channels_per_group */, - 1280 /* input pixel stride */, - 1001 /* output pixel stride */, - w169.data(), w170.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op63); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #63" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op63, xnn_delete_operator); - - status = xnn_reshape_convolution2d_nchw_f32( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op1, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #2" << std::endl; - return ExecutionPlan(); - 
} - - status = xnn_reshape_convolution2d_nchw_f32( - op3, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op4, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op5, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op6, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op8, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation 
#8" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 24, 56, 56 }; - const size_t b_shape[] = { 1, 24, 56, 56 }; - status = xnn_reshape_add_nd_f32( - op9, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op10, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op11, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op12, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op13, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op14, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - 
std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op15, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 32, 28, 28 }; - const size_t b_shape[] = { 1, 32, 28, 28 }; - status = xnn_reshape_add_nd_f32( - op16, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op17, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op18, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op19, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 32, 28, 28 }; - const size_t b_shape[] = { 1, 32, 28, 28 }; - status = xnn_reshape_add_nd_f32( - op20, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if 
(status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op21, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op22, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op23, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op24, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op25, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op26, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, 
/*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 64, 14, 14 }; - const size_t b_shape[] = { 1, 64, 14, 14 }; - status = xnn_reshape_add_nd_f32( - op27, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op28, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op29, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op30, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #30" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 64, 14, 14 }; - const size_t b_shape[] = { 1, 64, 14, 14 }; - status = xnn_reshape_add_nd_f32( - op31, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op32, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, 
- /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #32" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op33, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #33" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op34, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #34" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 64, 14, 14 }; - const size_t b_shape[] = { 1, 64, 14, 14 }; - status = xnn_reshape_add_nd_f32( - op35, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op36, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op37, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op38, - 
/*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op39, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op40, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op41, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #41" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 96, 14, 14 }; - const size_t b_shape[] = { 1, 96, 14, 14 }; - status = xnn_reshape_add_nd_f32( - op42, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op43, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #43" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_reshape_convolution2d_nchw_f32( - op44, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op45, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #45" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 96, 14, 14 }; - const size_t b_shape[] = { 1, 96, 14, 14 }; - status = xnn_reshape_add_nd_f32( - op46, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op47, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op48, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op49, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #49" << std::endl; - 
return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op50, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #50" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op51, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op52, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #52" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 160, 7, 7 }; - const size_t b_shape[] = { 1, 160, 7, 7 }; - status = xnn_reshape_add_nd_f32( - op53, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op54, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op55, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape 
operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op56, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #56" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 160, 7, 7 }; - const size_t b_shape[] = { 1, 160, 7, 7 }; - status = xnn_reshape_add_nd_f32( - op57, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op58, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op59, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #59" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op60, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op61, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - 
std::cerr << "failed to reshape operation #61" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op62, - /*batch_size=*/1, 49 /* width */, - 1280 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #62" << std::endl; - return ExecutionPlan(); - } - - size_t op63_workspace_size = 0; - size_t op63_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op63, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op63_workspace_size, &op63_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op63_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #63" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nchw_f32( - op0, - /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op1, - /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op2, - /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op3, - /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op4, - /*input=*/v4.data(), /*output=*/v5.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op5, - /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op6, - /*input=*/v6.data(), /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op7, - /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op8, - /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op9, - v9.data() /* a */, v6.data() /* b */, /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op10, - /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op11, - /*input=*/v11.data(), /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op12, - /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nchw_f32( - op13, - /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op14, - /*input=*/v14.data(), /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op15, - /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op16, - v16.data() /* a */, v13.data() /* b */, /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op17, - /*input=*/v17.data(), /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op18, - /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op19, - /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op20, - v20.data() /* a */, v17.data() /* b */, /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op21, - /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - 
std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op22, - /*input=*/v22.data(), /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op23, - /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op24, - /*input=*/v24.data(), /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op25, - /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op26, - /*input=*/v26.data(), /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op27, - v27.data() /* a */, v24.data() /* b */, /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op28, - /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op29, - /*input=*/v29.data(), /*output=*/v30.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - 
op30, - /*input=*/v30.data(), /*output=*/v31.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #30" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op31, - v31.data() /* a */, v28.data() /* b */, /*output=*/v32.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op32, - /*input=*/v32.data(), /*output=*/v33.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #32" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op33, - /*input=*/v33.data(), /*output=*/v34.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #33" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op34, - /*input=*/v34.data(), /*output=*/v35.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op35, - v35.data() /* a */, v32.data() /* b */, /*output=*/v36.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op36, - /*input=*/v36.data(), /*output=*/v37.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op37, - /*input=*/v37.data(), /*output=*/v38.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op38, - /*input=*/v38.data(), /*output=*/v39.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup 
operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op39, - /*input=*/v39.data(), /*output=*/v40.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op40, - /*input=*/v40.data(), /*output=*/v41.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op41, - /*input=*/v41.data(), /*output=*/v42.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #41" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op42, - v42.data() /* a */, v39.data() /* b */, /*output=*/v43.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op43, - /*input=*/v43.data(), /*output=*/v44.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op44, - /*input=*/v44.data(), /*output=*/v45.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op45, - /*input=*/v45.data(), /*output=*/v46.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #45" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op46, - v46.data() /* a */, v43.data() /* b */, /*output=*/v47.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op47, - 
/*input=*/v47.data(), /*output=*/v48.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op48, - /*input=*/v48.data(), /*output=*/v49.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op49, - /*input=*/v49.data(), /*output=*/v50.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #49" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op50, - /*input=*/v50.data(), /*output=*/v51.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #50" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op51, - /*input=*/v51.data(), /*output=*/v52.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op52, - /*input=*/v52.data(), /*output=*/v53.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op53, - v53.data() /* a */, v50.data() /* b */, /*output=*/v54.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op54, - /*input=*/v54.data(), /*output=*/v55.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op55, - /*input=*/v55.data(), /*output=*/v56.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #55" << 
std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op56, - /*input=*/v56.data(), /*output=*/v57.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #56" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op57, - v57.data() /* a */, v54.data() /* b */, /*output=*/v58.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op58, - /*input=*/v58.data(), /*output=*/v59.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op59, - /*input=*/v59.data(), /*output=*/v60.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #59" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op60, - /*input=*/v60.data(), /*output=*/v61.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op61, - /*input=*/v61.data(), /*output=*/v62.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #61" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f32( - op62, - /*input=*/v62.data(), /*output=*/v63.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #62" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op63, - workspace.data(), - /*input=*/v63.data(), /*output=*/v64.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #63" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") - 
XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -} // namespace models diff --git a/models/fp32-sparse-mobilenet-v3-large.cc b/models/fp32-sparse-mobilenet-v3-large.cc deleted file mode 100644 index 5bfc6fa9214..00000000000 --- a/models/fp32-sparse-mobilenet-v3-large.cc +++ /dev/null @@ -1,4814 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan FP32SparseMobileNetV3Large(float sparsity, pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static 
std::array v30; - alignas(16) static std::array v31; - alignas(16) static std::array v32; - alignas(16) static std::array v33; - alignas(16) static std::array v34; - alignas(16) static std::array v35; - alignas(16) static std::array v36; - alignas(16) static std::array v37; - alignas(16) static std::array v38; - alignas(16) static std::array v39; - alignas(16) static std::array v40; - alignas(16) static std::array v41; - alignas(16) static std::array v42; - alignas(16) static std::array v43; - alignas(16) static std::array v44; - alignas(16) static std::array v45; - alignas(16) static std::array v46; - alignas(16) static std::array v47; - alignas(16) static std::array v48; - alignas(16) static std::array v49; - alignas(16) static std::array v50; - alignas(16) static std::array v51; - alignas(16) static std::array v52; - alignas(16) static std::array v53; - alignas(16) static std::array v54; - alignas(16) static std::array v55; - alignas(16) static std::array v56; - alignas(16) static std::array v57; - alignas(16) static std::array v58; - alignas(16) static std::array v59; - alignas(16) static std::array v60; - alignas(16) static std::array v61; - alignas(16) static std::array v62; - alignas(16) static std::array v63; - alignas(16) static std::array v64; - alignas(16) static std::array v65; - alignas(16) static std::array v66; - alignas(16) static std::array v67; - alignas(16) static std::array v68; - alignas(16) static std::array v69; - alignas(16) static std::array v70; - alignas(16) static std::array v71; - alignas(16) static std::array v72; - alignas(16) static std::array v73; - alignas(16) static std::array v74; - alignas(16) static std::array v75; - alignas(16) static std::array v76; - alignas(16) static std::array v77; - alignas(16) static std::array v78; - alignas(16) static std::array v79; - alignas(16) static std::array v80; - alignas(16) static std::array v81; - alignas(16) static std::array v82; - alignas(16) static std::array v83; - alignas(16) static 
std::array v84; - alignas(16) static std::array v85; - alignas(16) static std::array v86; - alignas(16) static std::array v87; - alignas(16) static std::array v88; - alignas(16) static std::array v89; - alignas(16) static std::array v90; - alignas(16) static std::array v91; - alignas(16) static std::array v92; - alignas(16) static std::array v93; - alignas(16) static std::array v94; - alignas(16) static std::array v95; - alignas(16) static std::array v96; - alignas(16) static std::array v97; - alignas(16) static std::array v98; - alignas(16) static std::array v99; - alignas(16) static std::array v100; - alignas(16) static std::array v101; - alignas(16) static std::array v102; - alignas(16) static std::array v103; - alignas(16) static std::array v104; - alignas(16) static std::array v105; - alignas(16) static std::array v106; - alignas(16) static std::array v107; - alignas(16) static std::array v108; - alignas(16) static std::array v109; - alignas(16) static std::array v110; - alignas(16) static std::array v111; - alignas(16) static std::array v112; - alignas(16) static std::array v113; - alignas(16) static std::array w114; - alignas(16) static std::array w115; - alignas(16) static std::array w116; - alignas(16) static std::array w117; - alignas(16) static std::array w118; - alignas(16) static std::array w119; - alignas(16) static std::array w120; - alignas(16) static std::array w121; - alignas(16) static std::array w122; - alignas(16) static std::array w123; - alignas(16) static std::array w124; - alignas(16) static std::array w125; - alignas(16) static std::array w126; - alignas(16) static std::array w127; - alignas(16) static std::array w128; - alignas(16) static std::array w129; - alignas(16) static std::array w130; - alignas(16) static std::array w131; - alignas(16) static std::array w132; - alignas(16) static std::array w133; - alignas(16) static std::array w134; - alignas(16) static std::array w135; - alignas(16) static std::array w136; - alignas(16) static 
std::array w137; - alignas(16) static std::array w138; - alignas(16) static std::array w139; - alignas(16) static std::array w140; - alignas(16) static std::array w141; - alignas(16) static std::array w142; - alignas(16) static std::array w143; - alignas(16) static std::array w144; - alignas(16) static std::array w145; - alignas(16) static std::array w146; - alignas(16) static std::array w147; - alignas(16) static std::array w148; - alignas(16) static std::array w149; - alignas(16) static std::array w150; - alignas(16) static std::array w151; - alignas(16) static std::array w152; - alignas(16) static std::array w153; - alignas(16) static std::array w154; - alignas(16) static std::array w155; - alignas(16) static std::array w156; - alignas(16) static std::array w157; - alignas(16) static std::array w158; - alignas(16) static std::array w159; - alignas(16) static std::array w160; - alignas(16) static std::array w161; - alignas(16) static std::array w162; - alignas(16) static std::array w163; - alignas(16) static std::array w164; - alignas(16) static std::array w165; - alignas(16) static std::array w166; - alignas(16) static std::array w167; - alignas(16) static std::array w168; - alignas(16) static std::array w169; - alignas(16) static std::array w170; - alignas(16) static std::array w171; - alignas(16) static std::array w172; - alignas(16) static std::array w173; - alignas(16) static std::array w174; - alignas(16) static std::array w175; - alignas(16) static std::array w176; - alignas(16) static std::array w177; - alignas(16) static std::array w178; - alignas(16) static std::array w179; - alignas(16) static std::array w180; - alignas(16) static std::array w181; - alignas(16) static std::array w182; - alignas(16) static std::array w183; - alignas(16) static std::array w184; - alignas(16) static std::array w185; - alignas(16) static std::array w186; - alignas(16) static std::array w187; - alignas(16) static std::array w188; - alignas(16) static std::array w189; - 
alignas(16) static std::array w190; - alignas(16) static std::array w191; - alignas(16) static std::array w192; - alignas(16) static std::array w193; - alignas(16) static std::array w194; - alignas(16) static std::array w195; - alignas(16) static std::array w196; - alignas(16) static std::array w197; - alignas(16) static std::array w198; - alignas(16) static std::array w199; - alignas(16) static std::array w200; - alignas(16) static std::array w201; - alignas(16) static std::array w202; - alignas(16) static std::array w203; - alignas(16) static std::array w204; - alignas(16) static std::array w205; - alignas(16) static std::array w206; - alignas(16) static std::array w207; - alignas(16) static std::array w208; - alignas(16) static std::array w209; - alignas(16) static std::array w210; - alignas(16) static std::array w211; - alignas(16) static std::array w212; - alignas(16) static std::array w213; - alignas(16) static std::array w214; - alignas(16) static std::array w215; - alignas(16) static std::array w216; - alignas(16) static std::array w217; - alignas(16) static std::array w218; - alignas(16) static std::array w219; - alignas(16) static std::array w220; - alignas(16) static std::array w221; - alignas(16) static std::array w222; - alignas(16) static std::array w223; - alignas(16) static std::array w224; - alignas(16) static std::array w225; - alignas(16) static std::array w226; - alignas(16) static std::array w227; - alignas(16) static std::array w228; - alignas(16) static std::array w229; - alignas(16) static std::array w230; - alignas(16) static std::array w231; - alignas(16) static std::array w232; - alignas(16) static std::array w233; - alignas(16) static std::array w234; - alignas(16) static std::array w235; - alignas(16) static std::array w236; - alignas(16) static std::array w237; - alignas(16) static std::array w238; - alignas(16) static std::array w239; - alignas(16) static std::array w240; - alignas(16) static std::array w241; - - std::random_device 
random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); - std::generate(v0.begin(), v0.end(), std::ref(f32rng)); - std::generate(v1.begin(), v1.end(), std::ref(f32rng)); - std::generate(v2.begin(), v2.end(), std::ref(f32rng)); - std::generate(v3.begin(), v3.end(), std::ref(f32rng)); - std::generate(v4.begin(), v4.end(), std::ref(f32rng)); - std::generate(v5.begin(), v5.end(), std::ref(f32rng)); - std::generate(v6.begin(), v6.end(), std::ref(f32rng)); - std::generate(v7.begin(), v7.end(), std::ref(f32rng)); - std::generate(v8.begin(), v8.end(), std::ref(f32rng)); - std::generate(v9.begin(), v9.end(), std::ref(f32rng)); - std::generate(v10.begin(), v10.end(), std::ref(f32rng)); - std::generate(v11.begin(), v11.end(), std::ref(f32rng)); - std::generate(v12.begin(), v12.end(), std::ref(f32rng)); - std::generate(v13.begin(), v13.end(), std::ref(f32rng)); - std::generate(v14.begin(), v14.end(), std::ref(f32rng)); - std::generate(v15.begin(), v15.end(), std::ref(f32rng)); - std::generate(v16.begin(), v16.end(), std::ref(f32rng)); - std::generate(v17.begin(), v17.end(), std::ref(f32rng)); - std::generate(v18.begin(), v18.end(), std::ref(f32rng)); - std::generate(v19.begin(), v19.end(), std::ref(f32rng)); - std::generate(v20.begin(), v20.end(), std::ref(f32rng)); - std::generate(v21.begin(), v21.end(), std::ref(f32rng)); - std::generate(v22.begin(), v22.end(), std::ref(f32rng)); - std::generate(v23.begin(), v23.end(), std::ref(f32rng)); - std::generate(v24.begin(), v24.end(), std::ref(f32rng)); - std::generate(v25.begin(), v25.end(), std::ref(f32rng)); - std::generate(v26.begin(), v26.end(), std::ref(f32rng)); - std::generate(v27.begin(), v27.end(), std::ref(f32rng)); - std::generate(v28.begin(), v28.end(), std::ref(f32rng)); - std::generate(v29.begin(), v29.end(), std::ref(f32rng)); - std::generate(v30.begin(), v30.end(), std::ref(f32rng)); - std::generate(v31.begin(), v31.end(), 
std::ref(f32rng)); - std::generate(v32.begin(), v32.end(), std::ref(f32rng)); - std::generate(v33.begin(), v33.end(), std::ref(f32rng)); - std::generate(v34.begin(), v34.end(), std::ref(f32rng)); - std::generate(v35.begin(), v35.end(), std::ref(f32rng)); - std::generate(v36.begin(), v36.end(), std::ref(f32rng)); - std::generate(v37.begin(), v37.end(), std::ref(f32rng)); - std::generate(v38.begin(), v38.end(), std::ref(f32rng)); - std::generate(v39.begin(), v39.end(), std::ref(f32rng)); - std::generate(v40.begin(), v40.end(), std::ref(f32rng)); - std::generate(v41.begin(), v41.end(), std::ref(f32rng)); - std::generate(v42.begin(), v42.end(), std::ref(f32rng)); - std::generate(v43.begin(), v43.end(), std::ref(f32rng)); - std::generate(v44.begin(), v44.end(), std::ref(f32rng)); - std::generate(v45.begin(), v45.end(), std::ref(f32rng)); - std::generate(v46.begin(), v46.end(), std::ref(f32rng)); - std::generate(v47.begin(), v47.end(), std::ref(f32rng)); - std::generate(v48.begin(), v48.end(), std::ref(f32rng)); - std::generate(v49.begin(), v49.end(), std::ref(f32rng)); - std::generate(v50.begin(), v50.end(), std::ref(f32rng)); - std::generate(v51.begin(), v51.end(), std::ref(f32rng)); - std::generate(v52.begin(), v52.end(), std::ref(f32rng)); - std::generate(v53.begin(), v53.end(), std::ref(f32rng)); - std::generate(v54.begin(), v54.end(), std::ref(f32rng)); - std::generate(v55.begin(), v55.end(), std::ref(f32rng)); - std::generate(v56.begin(), v56.end(), std::ref(f32rng)); - std::generate(v57.begin(), v57.end(), std::ref(f32rng)); - std::generate(v58.begin(), v58.end(), std::ref(f32rng)); - std::generate(v59.begin(), v59.end(), std::ref(f32rng)); - std::generate(v60.begin(), v60.end(), std::ref(f32rng)); - std::generate(v61.begin(), v61.end(), std::ref(f32rng)); - std::generate(v62.begin(), v62.end(), std::ref(f32rng)); - std::generate(v63.begin(), v63.end(), std::ref(f32rng)); - std::generate(v64.begin(), v64.end(), std::ref(f32rng)); - std::generate(v65.begin(), 
v65.end(), std::ref(f32rng)); - std::generate(v66.begin(), v66.end(), std::ref(f32rng)); - std::generate(v67.begin(), v67.end(), std::ref(f32rng)); - std::generate(v68.begin(), v68.end(), std::ref(f32rng)); - std::generate(v69.begin(), v69.end(), std::ref(f32rng)); - std::generate(v70.begin(), v70.end(), std::ref(f32rng)); - std::generate(v71.begin(), v71.end(), std::ref(f32rng)); - std::generate(v72.begin(), v72.end(), std::ref(f32rng)); - std::generate(v73.begin(), v73.end(), std::ref(f32rng)); - std::generate(v74.begin(), v74.end(), std::ref(f32rng)); - std::generate(v75.begin(), v75.end(), std::ref(f32rng)); - std::generate(v76.begin(), v76.end(), std::ref(f32rng)); - std::generate(v77.begin(), v77.end(), std::ref(f32rng)); - std::generate(v78.begin(), v78.end(), std::ref(f32rng)); - std::generate(v79.begin(), v79.end(), std::ref(f32rng)); - std::generate(v80.begin(), v80.end(), std::ref(f32rng)); - std::generate(v81.begin(), v81.end(), std::ref(f32rng)); - std::generate(v82.begin(), v82.end(), std::ref(f32rng)); - std::generate(v83.begin(), v83.end(), std::ref(f32rng)); - std::generate(v84.begin(), v84.end(), std::ref(f32rng)); - std::generate(v85.begin(), v85.end(), std::ref(f32rng)); - std::generate(v86.begin(), v86.end(), std::ref(f32rng)); - std::generate(v87.begin(), v87.end(), std::ref(f32rng)); - std::generate(v88.begin(), v88.end(), std::ref(f32rng)); - std::generate(v89.begin(), v89.end(), std::ref(f32rng)); - std::generate(v90.begin(), v90.end(), std::ref(f32rng)); - std::generate(v91.begin(), v91.end(), std::ref(f32rng)); - std::generate(v92.begin(), v92.end(), std::ref(f32rng)); - std::generate(v93.begin(), v93.end(), std::ref(f32rng)); - std::generate(v94.begin(), v94.end(), std::ref(f32rng)); - std::generate(v95.begin(), v95.end(), std::ref(f32rng)); - std::generate(v96.begin(), v96.end(), std::ref(f32rng)); - std::generate(v97.begin(), v97.end(), std::ref(f32rng)); - std::generate(v98.begin(), v98.end(), std::ref(f32rng)); - 
std::generate(v99.begin(), v99.end(), std::ref(f32rng)); - std::generate(v100.begin(), v100.end(), std::ref(f32rng)); - std::generate(v101.begin(), v101.end(), std::ref(f32rng)); - std::generate(v102.begin(), v102.end(), std::ref(f32rng)); - std::generate(v103.begin(), v103.end(), std::ref(f32rng)); - std::generate(v104.begin(), v104.end(), std::ref(f32rng)); - std::generate(v105.begin(), v105.end(), std::ref(f32rng)); - std::generate(v106.begin(), v106.end(), std::ref(f32rng)); - std::generate(v107.begin(), v107.end(), std::ref(f32rng)); - std::generate(v108.begin(), v108.end(), std::ref(f32rng)); - std::generate(v109.begin(), v109.end(), std::ref(f32rng)); - std::generate(v110.begin(), v110.end(), std::ref(f32rng)); - std::generate(v111.begin(), v111.end(), std::ref(f32rng)); - std::generate(v112.begin(), v112.end(), std::ref(f32rng)); - std::generate(v113.begin(), v113.end(), std::ref(f32rng)); - std::generate(w114.begin(), w114.end(), std::ref(f32rng)); - std::generate(w115.begin(), w115.end(), std::ref(f32rng)); - std::generate(w116.begin(), w116.end(), std::ref(f32rng)); - std::generate(w117.begin(), w117.end(), std::ref(f32rng)); - std::fill(w118.begin(), w118.end(), 0.0f); - std::generate(w118.begin(), w118.end() - size_t(sparsity * w118.size()), std::ref(f32rng)); - std::shuffle(w118.begin(), w118.end(), rng); - std::generate(w119.begin(), w119.end(), std::ref(f32rng)); - std::fill(w120.begin(), w120.end(), 0.0f); - std::generate(w120.begin(), w120.end() - size_t(sparsity * w120.size()), std::ref(f32rng)); - std::shuffle(w120.begin(), w120.end(), rng); - std::generate(w121.begin(), w121.end(), std::ref(f32rng)); - std::generate(w122.begin(), w122.end(), std::ref(f32rng)); - std::generate(w123.begin(), w123.end(), std::ref(f32rng)); - std::fill(w124.begin(), w124.end(), 0.0f); - std::generate(w124.begin(), w124.end() - size_t(sparsity * w124.size()), std::ref(f32rng)); - std::shuffle(w124.begin(), w124.end(), rng); - std::generate(w125.begin(), w125.end(), 
std::ref(f32rng)); - std::fill(w126.begin(), w126.end(), 0.0f); - std::generate(w126.begin(), w126.end() - size_t(sparsity * w126.size()), std::ref(f32rng)); - std::shuffle(w126.begin(), w126.end(), rng); - std::generate(w127.begin(), w127.end(), std::ref(f32rng)); - std::generate(w128.begin(), w128.end(), std::ref(f32rng)); - std::generate(w129.begin(), w129.end(), std::ref(f32rng)); - std::fill(w130.begin(), w130.end(), 0.0f); - std::generate(w130.begin(), w130.end() - size_t(sparsity * w130.size()), std::ref(f32rng)); - std::shuffle(w130.begin(), w130.end(), rng); - std::generate(w131.begin(), w131.end(), std::ref(f32rng)); - std::fill(w132.begin(), w132.end(), 0.0f); - std::generate(w132.begin(), w132.end() - size_t(sparsity * w132.size()), std::ref(f32rng)); - std::shuffle(w132.begin(), w132.end(), rng); - std::generate(w133.begin(), w133.end(), std::ref(f32rng)); - std::generate(w134.begin(), w134.end(), std::ref(f32rng)); - std::generate(w135.begin(), w135.end(), std::ref(f32rng)); - std::fill(w136.begin(), w136.end(), 0.0f); - std::generate(w136.begin(), w136.end() - size_t(sparsity * w136.size()), std::ref(f32rng)); - std::shuffle(w136.begin(), w136.end(), rng); - std::generate(w137.begin(), w137.end(), std::ref(f32rng)); - std::fill(w138.begin(), w138.end(), 0.0f); - std::generate(w138.begin(), w138.end() - size_t(sparsity * w138.size()), std::ref(f32rng)); - std::shuffle(w138.begin(), w138.end(), rng); - std::generate(w139.begin(), w139.end(), std::ref(f32rng)); - std::fill(w140.begin(), w140.end(), 0.0f); - std::generate(w140.begin(), w140.end() - size_t(sparsity * w140.size()), std::ref(f32rng)); - std::shuffle(w140.begin(), w140.end(), rng); - std::generate(w141.begin(), w141.end(), std::ref(f32rng)); - std::fill(w142.begin(), w142.end(), 0.0f); - std::generate(w142.begin(), w142.end() - size_t(sparsity * w142.size()), std::ref(f32rng)); - std::shuffle(w142.begin(), w142.end(), rng); - std::generate(w143.begin(), w143.end(), std::ref(f32rng)); - 
std::generate(w144.begin(), w144.end(), std::ref(f32rng)); - std::generate(w145.begin(), w145.end(), std::ref(f32rng)); - std::fill(w146.begin(), w146.end(), 0.0f); - std::generate(w146.begin(), w146.end() - size_t(sparsity * w146.size()), std::ref(f32rng)); - std::shuffle(w146.begin(), w146.end(), rng); - std::generate(w147.begin(), w147.end(), std::ref(f32rng)); - std::fill(w148.begin(), w148.end(), 0.0f); - std::generate(w148.begin(), w148.end() - size_t(sparsity * w148.size()), std::ref(f32rng)); - std::shuffle(w148.begin(), w148.end(), rng); - std::generate(w149.begin(), w149.end(), std::ref(f32rng)); - std::fill(w150.begin(), w150.end(), 0.0f); - std::generate(w150.begin(), w150.end() - size_t(sparsity * w150.size()), std::ref(f32rng)); - std::shuffle(w150.begin(), w150.end(), rng); - std::generate(w151.begin(), w151.end(), std::ref(f32rng)); - std::fill(w152.begin(), w152.end(), 0.0f); - std::generate(w152.begin(), w152.end() - size_t(sparsity * w152.size()), std::ref(f32rng)); - std::shuffle(w152.begin(), w152.end(), rng); - std::generate(w153.begin(), w153.end(), std::ref(f32rng)); - std::generate(w154.begin(), w154.end(), std::ref(f32rng)); - std::generate(w155.begin(), w155.end(), std::ref(f32rng)); - std::fill(w156.begin(), w156.end(), 0.0f); - std::generate(w156.begin(), w156.end() - size_t(sparsity * w156.size()), std::ref(f32rng)); - std::shuffle(w156.begin(), w156.end(), rng); - std::generate(w157.begin(), w157.end(), std::ref(f32rng)); - std::fill(w158.begin(), w158.end(), 0.0f); - std::generate(w158.begin(), w158.end() - size_t(sparsity * w158.size()), std::ref(f32rng)); - std::shuffle(w158.begin(), w158.end(), rng); - std::generate(w159.begin(), w159.end(), std::ref(f32rng)); - std::fill(w160.begin(), w160.end(), 0.0f); - std::generate(w160.begin(), w160.end() - size_t(sparsity * w160.size()), std::ref(f32rng)); - std::shuffle(w160.begin(), w160.end(), rng); - std::generate(w161.begin(), w161.end(), std::ref(f32rng)); - std::fill(w162.begin(), 
w162.end(), 0.0f); - std::generate(w162.begin(), w162.end() - size_t(sparsity * w162.size()), std::ref(f32rng)); - std::shuffle(w162.begin(), w162.end(), rng); - std::generate(w163.begin(), w163.end(), std::ref(f32rng)); - std::generate(w164.begin(), w164.end(), std::ref(f32rng)); - std::generate(w165.begin(), w165.end(), std::ref(f32rng)); - std::fill(w166.begin(), w166.end(), 0.0f); - std::generate(w166.begin(), w166.end() - size_t(sparsity * w166.size()), std::ref(f32rng)); - std::shuffle(w166.begin(), w166.end(), rng); - std::generate(w167.begin(), w167.end(), std::ref(f32rng)); - std::fill(w168.begin(), w168.end(), 0.0f); - std::generate(w168.begin(), w168.end() - size_t(sparsity * w168.size()), std::ref(f32rng)); - std::shuffle(w168.begin(), w168.end(), rng); - std::generate(w169.begin(), w169.end(), std::ref(f32rng)); - std::generate(w170.begin(), w170.end(), std::ref(f32rng)); - std::generate(w171.begin(), w171.end(), std::ref(f32rng)); - std::fill(w172.begin(), w172.end(), 0.0f); - std::generate(w172.begin(), w172.end() - size_t(sparsity * w172.size()), std::ref(f32rng)); - std::shuffle(w172.begin(), w172.end(), rng); - std::generate(w173.begin(), w173.end(), std::ref(f32rng)); - std::fill(w174.begin(), w174.end(), 0.0f); - std::generate(w174.begin(), w174.end() - size_t(sparsity * w174.size()), std::ref(f32rng)); - std::shuffle(w174.begin(), w174.end(), rng); - std::generate(w175.begin(), w175.end(), std::ref(f32rng)); - std::generate(w176.begin(), w176.end(), std::ref(f32rng)); - std::generate(w177.begin(), w177.end(), std::ref(f32rng)); - std::fill(w178.begin(), w178.end(), 0.0f); - std::generate(w178.begin(), w178.end() - size_t(sparsity * w178.size()), std::ref(f32rng)); - std::shuffle(w178.begin(), w178.end(), rng); - std::generate(w179.begin(), w179.end(), std::ref(f32rng)); - std::fill(w180.begin(), w180.end(), 0.0f); - std::generate(w180.begin(), w180.end() - size_t(sparsity * w180.size()), std::ref(f32rng)); - std::shuffle(w180.begin(), 
w180.end(), rng); - std::generate(w181.begin(), w181.end(), std::ref(f32rng)); - std::generate(w182.begin(), w182.end(), std::ref(f32rng)); - std::generate(w183.begin(), w183.end(), std::ref(f32rng)); - std::fill(w184.begin(), w184.end(), 0.0f); - std::generate(w184.begin(), w184.end() - size_t(sparsity * w184.size()), std::ref(f32rng)); - std::shuffle(w184.begin(), w184.end(), rng); - std::generate(w185.begin(), w185.end(), std::ref(f32rng)); - std::fill(w186.begin(), w186.end(), 0.0f); - std::generate(w186.begin(), w186.end() - size_t(sparsity * w186.size()), std::ref(f32rng)); - std::shuffle(w186.begin(), w186.end(), rng); - std::generate(w187.begin(), w187.end(), std::ref(f32rng)); - std::generate(w188.begin(), w188.end(), std::ref(f32rng)); - std::generate(w189.begin(), w189.end(), std::ref(f32rng)); - std::fill(w190.begin(), w190.end(), 0.0f); - std::generate(w190.begin(), w190.end() - size_t(sparsity * w190.size()), std::ref(f32rng)); - std::shuffle(w190.begin(), w190.end(), rng); - std::generate(w191.begin(), w191.end(), std::ref(f32rng)); - std::fill(w192.begin(), w192.end(), 0.0f); - std::generate(w192.begin(), w192.end() - size_t(sparsity * w192.size()), std::ref(f32rng)); - std::shuffle(w192.begin(), w192.end(), rng); - std::generate(w193.begin(), w193.end(), std::ref(f32rng)); - std::fill(w194.begin(), w194.end(), 0.0f); - std::generate(w194.begin(), w194.end() - size_t(sparsity * w194.size()), std::ref(f32rng)); - std::shuffle(w194.begin(), w194.end(), rng); - std::generate(w195.begin(), w195.end(), std::ref(f32rng)); - std::fill(w196.begin(), w196.end(), 0.0f); - std::generate(w196.begin(), w196.end() - size_t(sparsity * w196.size()), std::ref(f32rng)); - std::shuffle(w196.begin(), w196.end(), rng); - std::generate(w197.begin(), w197.end(), std::ref(f32rng)); - std::generate(w198.begin(), w198.end(), std::ref(f32rng)); - std::generate(w199.begin(), w199.end(), std::ref(f32rng)); - std::fill(w200.begin(), w200.end(), 0.0f); - 
std::generate(w200.begin(), w200.end() - size_t(sparsity * w200.size()), std::ref(f32rng)); - std::shuffle(w200.begin(), w200.end(), rng); - std::generate(w201.begin(), w201.end(), std::ref(f32rng)); - std::fill(w202.begin(), w202.end(), 0.0f); - std::generate(w202.begin(), w202.end() - size_t(sparsity * w202.size()), std::ref(f32rng)); - std::shuffle(w202.begin(), w202.end(), rng); - std::generate(w203.begin(), w203.end(), std::ref(f32rng)); - std::fill(w204.begin(), w204.end(), 0.0f); - std::generate(w204.begin(), w204.end() - size_t(sparsity * w204.size()), std::ref(f32rng)); - std::shuffle(w204.begin(), w204.end(), rng); - std::generate(w205.begin(), w205.end(), std::ref(f32rng)); - std::fill(w206.begin(), w206.end(), 0.0f); - std::generate(w206.begin(), w206.end() - size_t(sparsity * w206.size()), std::ref(f32rng)); - std::shuffle(w206.begin(), w206.end(), rng); - std::generate(w207.begin(), w207.end(), std::ref(f32rng)); - std::generate(w208.begin(), w208.end(), std::ref(f32rng)); - std::generate(w209.begin(), w209.end(), std::ref(f32rng)); - std::fill(w210.begin(), w210.end(), 0.0f); - std::generate(w210.begin(), w210.end() - size_t(sparsity * w210.size()), std::ref(f32rng)); - std::shuffle(w210.begin(), w210.end(), rng); - std::generate(w211.begin(), w211.end(), std::ref(f32rng)); - std::fill(w212.begin(), w212.end(), 0.0f); - std::generate(w212.begin(), w212.end() - size_t(sparsity * w212.size()), std::ref(f32rng)); - std::shuffle(w212.begin(), w212.end(), rng); - std::generate(w213.begin(), w213.end(), std::ref(f32rng)); - std::fill(w214.begin(), w214.end(), 0.0f); - std::generate(w214.begin(), w214.end() - size_t(sparsity * w214.size()), std::ref(f32rng)); - std::shuffle(w214.begin(), w214.end(), rng); - std::generate(w215.begin(), w215.end(), std::ref(f32rng)); - std::fill(w216.begin(), w216.end(), 0.0f); - std::generate(w216.begin(), w216.end() - size_t(sparsity * w216.size()), std::ref(f32rng)); - std::shuffle(w216.begin(), w216.end(), rng); - 
std::generate(w217.begin(), w217.end(), std::ref(f32rng)); - std::generate(w218.begin(), w218.end(), std::ref(f32rng)); - std::generate(w219.begin(), w219.end(), std::ref(f32rng)); - std::fill(w220.begin(), w220.end(), 0.0f); - std::generate(w220.begin(), w220.end() - size_t(sparsity * w220.size()), std::ref(f32rng)); - std::shuffle(w220.begin(), w220.end(), rng); - std::generate(w221.begin(), w221.end(), std::ref(f32rng)); - std::fill(w222.begin(), w222.end(), 0.0f); - std::generate(w222.begin(), w222.end() - size_t(sparsity * w222.size()), std::ref(f32rng)); - std::shuffle(w222.begin(), w222.end(), rng); - std::generate(w223.begin(), w223.end(), std::ref(f32rng)); - std::fill(w224.begin(), w224.end(), 0.0f); - std::generate(w224.begin(), w224.end() - size_t(sparsity * w224.size()), std::ref(f32rng)); - std::shuffle(w224.begin(), w224.end(), rng); - std::generate(w225.begin(), w225.end(), std::ref(f32rng)); - std::fill(w226.begin(), w226.end(), 0.0f); - std::generate(w226.begin(), w226.end() - size_t(sparsity * w226.size()), std::ref(f32rng)); - std::shuffle(w226.begin(), w226.end(), rng); - std::generate(w227.begin(), w227.end(), std::ref(f32rng)); - std::generate(w228.begin(), w228.end(), std::ref(f32rng)); - std::generate(w229.begin(), w229.end(), std::ref(f32rng)); - std::fill(w230.begin(), w230.end(), 0.0f); - std::generate(w230.begin(), w230.end() - size_t(sparsity * w230.size()), std::ref(f32rng)); - std::shuffle(w230.begin(), w230.end(), rng); - std::generate(w231.begin(), w231.end(), std::ref(f32rng)); - std::fill(w232.begin(), w232.end(), 0.0f); - std::generate(w232.begin(), w232.end() - size_t(sparsity * w232.size()), std::ref(f32rng)); - std::shuffle(w232.begin(), w232.end(), rng); - std::generate(w233.begin(), w233.end(), std::ref(f32rng)); - std::fill(w234.begin(), w234.end(), 0.0f); - std::generate(w234.begin(), w234.end() - size_t(sparsity * w234.size()), std::ref(f32rng)); - std::shuffle(w234.begin(), w234.end(), rng); - 
std::generate(w235.begin(), w235.end(), std::ref(f32rng)); - std::fill(w236.begin(), w236.end(), 0.0f); - std::generate(w236.begin(), w236.end() - size_t(sparsity * w236.size()), std::ref(f32rng)); - std::shuffle(w236.begin(), w236.end(), rng); - std::generate(w237.begin(), w237.end(), std::ref(f32rng)); - std::generate(w238.begin(), w238.end(), std::ref(f32rng)); - std::generate(w239.begin(), w239.end(), std::ref(f32rng)); - std::generate(w240.begin(), w240.end(), std::ref(f32rng)); - std::generate(w241.begin(), w241.end(), std::ref(f32rng)); - - Operators operators; - xnn_status status; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 3 /* input channels per group */, - 16 /* output_channels_per_group */, - 3 /* input pixel stride */, - 16 /* output pixel stride */, - w114.data(), w115.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - XNN_FLAG_INPUT_NHWC /* flags */, - nullptr, - nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* 
subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 16 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 16 /* input pixel stride */, - 16 /* output pixel stride */, - w116.data(), w117.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 16 /* input channels per group */, - 16 /* output_channels_per_group */, - 16 /* input pixel stride */, - 16 /* output pixel stride */, - w118.data(), w119.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel 
height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 16 /* input channels per group */, - 64 /* output_channels_per_group */, - 16 /* input pixel stride */, - 64 /* output pixel stride */, - w120.data(), w121.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 64 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 64 /* input pixel stride */, - 64 /* output pixel stride */, - w122.data(), w123.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 24 /* output_channels_per_group */, - 64 /* input pixel stride */, - 24 /* output pixel stride */, - w124.data(), w125.data(), - -std::numeric_limits::infinity() /* 
output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 72 /* output_channels_per_group */, - 24 /* input pixel stride */, - 72 /* output pixel stride */, - w126.data(), w127.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 72 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 72 /* input pixel stride */, - 72 /* output pixel stride */, - w128.data(), w129.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = 
xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 72 /* input channels per group */, - 24 /* output_channels_per_group */, - 72 /* input pixel stride */, - 24 /* output pixel stride */, - w130.data(), w131.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 72 /* output_channels_per_group */, - 24 /* input pixel stride */, - 72 /* output pixel stride */, - w132.data(), w133.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, 
xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 72 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 72 /* input pixel stride */, - 72 /* output pixel stride */, - w134.data(), w135.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_global_average_pooling_ncw_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 72 /* input channels per group */, - 24 /* output_channels_per_group */, - 72 /* input pixel stride */, - 24 /* output pixel stride */, - w136.data(), w137.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - 
operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 72 /* output_channels_per_group */, - 24 /* input pixel stride */, - 72 /* output pixel stride */, - w138.data(), w139.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 72 /* input channels per group */, - 40 /* output_channels_per_group */, - 72 /* input pixel stride */, - 40 /* output pixel stride */, - w140.data(), w141.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << 
std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 120 /* output_channels_per_group */, - 40 /* input pixel stride */, - 120 /* output pixel stride */, - w142.data(), w143.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 120 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 120 /* input pixel stride */, - 120 /* output pixel stride */, - w144.data(), w145.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_global_average_pooling_ncw_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create 
operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 32 /* output_channels_per_group */, - 120 /* input pixel stride */, - 32 /* output pixel stride */, - w146.data(), w147.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 120 /* output_channels_per_group */, - 32 /* input pixel stride */, - 120 /* output pixel stride */, - w148.data(), w149.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op24); - if (status != xnn_status_success) { - std::cerr << 
"failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 40 /* output_channels_per_group */, - 120 /* input pixel stride */, - 40 /* output pixel stride */, - w150.data(), w151.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 120 /* output_channels_per_group */, - 40 /* input pixel stride */, - 120 /* output pixel stride */, - w152.data(), w153.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - 
&op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 120 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 120 /* input pixel stride */, - 120 /* output pixel stride */, - w154.data(), w155.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - xnn_operator_t op29 = nullptr; - status = xnn_create_global_average_pooling_ncw_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op29); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #29" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op29, xnn_delete_operator); - - xnn_operator_t op30 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 32 /* output_channels_per_group */, - 120 /* input pixel stride */, - 32 /* output pixel stride */, - w156.data(), w157.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - 
nullptr, - nullptr, - &op30); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #30" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op30, xnn_delete_operator); - - xnn_operator_t op31 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 120 /* output_channels_per_group */, - 32 /* input pixel stride */, - 120 /* output pixel stride */, - w158.data(), w159.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op31); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #31" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op31, xnn_delete_operator); - - xnn_operator_t op32 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op32); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #32" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op32, xnn_delete_operator); - - xnn_operator_t op33 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 40 /* output_channels_per_group */, - 120 /* input pixel stride */, - 40 /* output pixel stride */, - w160.data(), w161.data(), - -std::numeric_limits::infinity() /* output min */, 
std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op33); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #33" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op33, xnn_delete_operator); - - xnn_operator_t op34 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op34); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #34" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op34, xnn_delete_operator); - - xnn_operator_t op35 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 240 /* output_channels_per_group */, - 40 /* input pixel stride */, - 240 /* output pixel stride */, - w162.data(), w163.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op35); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #35" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op35, xnn_delete_operator); - - xnn_operator_t op36 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op36); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #36" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op36, xnn_delete_operator); - - xnn_operator_t op37 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 
3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 240 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 240 /* input pixel stride */, - 240 /* output pixel stride */, - w164.data(), w165.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op37); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #37" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op37, xnn_delete_operator); - - xnn_operator_t op38 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op38); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #38" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op38, xnn_delete_operator); - - xnn_operator_t op39 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 240 /* input channels per group */, - 80 /* output_channels_per_group */, - 240 /* input pixel stride */, - 80 /* output pixel stride */, - w166.data(), w167.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op39); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #39" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op39, xnn_delete_operator); - - xnn_operator_t op40 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel 
width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 80 /* input channels per group */, - 200 /* output_channels_per_group */, - 80 /* input pixel stride */, - 200 /* output pixel stride */, - w168.data(), w169.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op40); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #40" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op40, xnn_delete_operator); - - xnn_operator_t op41 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op41); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #41" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op41, xnn_delete_operator); - - xnn_operator_t op42 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 200 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 200 /* input pixel stride */, - 200 /* output pixel stride */, - w170.data(), w171.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op42); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #42" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op42, xnn_delete_operator); - - xnn_operator_t op43 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op43); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #43" << std::endl; - return 
ExecutionPlan(); - } - operators.emplace_back(op43, xnn_delete_operator); - - xnn_operator_t op44 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 200 /* input channels per group */, - 80 /* output_channels_per_group */, - 200 /* input pixel stride */, - 80 /* output pixel stride */, - w172.data(), w173.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op44); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #44" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op44, xnn_delete_operator); - - xnn_operator_t op45 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op45); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #45" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op45, xnn_delete_operator); - - xnn_operator_t op46 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 80 /* input channels per group */, - 184 /* output_channels_per_group */, - 80 /* input pixel stride */, - 184 /* output pixel stride */, - w174.data(), w175.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op46); - if (status != 
xnn_status_success) { - std::cerr << "failed to create operation #46" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op46, xnn_delete_operator); - - xnn_operator_t op47 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op47); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #47" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op47, xnn_delete_operator); - - xnn_operator_t op48 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 184 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 184 /* input pixel stride */, - 184 /* output pixel stride */, - w176.data(), w177.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op48); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #48" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op48, xnn_delete_operator); - - xnn_operator_t op49 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op49); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #49" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op49, xnn_delete_operator); - - xnn_operator_t op50 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 184 /* input channels per group */, - 80 /* 
output_channels_per_group */, - 184 /* input pixel stride */, - 80 /* output pixel stride */, - w178.data(), w179.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op50); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #50" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op50, xnn_delete_operator); - - xnn_operator_t op51 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op51); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #51" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op51, xnn_delete_operator); - - xnn_operator_t op52 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 80 /* input channels per group */, - 184 /* output_channels_per_group */, - 80 /* input pixel stride */, - 184 /* output pixel stride */, - w180.data(), w181.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op52); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #52" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op52, xnn_delete_operator); - - xnn_operator_t op53 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op53); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #53" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op53, xnn_delete_operator); - - xnn_operator_t op54 
= nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 184 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 184 /* input pixel stride */, - 184 /* output pixel stride */, - w182.data(), w183.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op54); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #54" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op54, xnn_delete_operator); - - xnn_operator_t op55 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op55); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #55" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op55, xnn_delete_operator); - - xnn_operator_t op56 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 184 /* input channels per group */, - 80 /* output_channels_per_group */, - 184 /* input pixel stride */, - 80 /* output pixel stride */, - w184.data(), w185.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op56); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #56" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op56, xnn_delete_operator); - - xnn_operator_t op57 = nullptr; - 
status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op57); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #57" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op57, xnn_delete_operator); - - xnn_operator_t op58 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 80 /* input channels per group */, - 480 /* output_channels_per_group */, - 80 /* input pixel stride */, - 480 /* output pixel stride */, - w186.data(), w187.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op58); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #58" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op58, xnn_delete_operator); - - xnn_operator_t op59 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op59); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #59" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op59, xnn_delete_operator); - - xnn_operator_t op60 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 480 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 480 /* input pixel stride */, - 480 /* output pixel stride */, - w188.data(), w189.data(), - 
-std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op60); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #60" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op60, xnn_delete_operator); - - xnn_operator_t op61 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op61); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #61" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op61, xnn_delete_operator); - - xnn_operator_t op62 = nullptr; - status = xnn_create_global_average_pooling_ncw_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op62); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #62" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op62, xnn_delete_operator); - - xnn_operator_t op63 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 480 /* input channels per group */, - 120 /* output_channels_per_group */, - 480 /* input pixel stride */, - 120 /* output pixel stride */, - w190.data(), w191.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op63); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #63" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op63, xnn_delete_operator); - - xnn_operator_t op64 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel 
height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 480 /* output_channels_per_group */, - 120 /* input pixel stride */, - 480 /* output pixel stride */, - w192.data(), w193.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op64); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #64" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op64, xnn_delete_operator); - - xnn_operator_t op65 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op65); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #65" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op65, xnn_delete_operator); - - xnn_operator_t op66 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 480 /* input channels per group */, - 112 /* output_channels_per_group */, - 480 /* input pixel stride */, - 112 /* output pixel stride */, - w194.data(), w195.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op66); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #66" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op66, xnn_delete_operator); - - xnn_operator_t op67 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom 
padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 112 /* input channels per group */, - 672 /* output_channels_per_group */, - 112 /* input pixel stride */, - 672 /* output pixel stride */, - w196.data(), w197.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op67); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #67" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op67, xnn_delete_operator); - - xnn_operator_t op68 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op68); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #68" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op68, xnn_delete_operator); - - xnn_operator_t op69 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 672 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 672 /* input pixel stride */, - 672 /* output pixel stride */, - w198.data(), w199.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op69); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #69" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op69, xnn_delete_operator); - - xnn_operator_t op70 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op70); - if (status != xnn_status_success) { - std::cerr 
<< "failed to create operation #70" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op70, xnn_delete_operator); - - xnn_operator_t op71 = nullptr; - status = xnn_create_global_average_pooling_ncw_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op71); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #71" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op71, xnn_delete_operator); - - xnn_operator_t op72 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 672 /* input channels per group */, - 168 /* output_channels_per_group */, - 672 /* input pixel stride */, - 168 /* output pixel stride */, - w200.data(), w201.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op72); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #72" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op72, xnn_delete_operator); - - xnn_operator_t op73 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 168 /* input channels per group */, - 672 /* output_channels_per_group */, - 168 /* input pixel stride */, - 672 /* output pixel stride */, - w202.data(), w203.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op73); - if (status != xnn_status_success) { - 
std::cerr << "failed to create operation #73" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op73, xnn_delete_operator); - - xnn_operator_t op74 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op74); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #74" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op74, xnn_delete_operator); - - xnn_operator_t op75 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 672 /* input channels per group */, - 112 /* output_channels_per_group */, - 672 /* input pixel stride */, - 112 /* output pixel stride */, - w204.data(), w205.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op75); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #75" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op75, xnn_delete_operator); - - xnn_operator_t op76 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op76); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #76" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op76, xnn_delete_operator); - - xnn_operator_t op77 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* 
subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 112 /* input channels per group */, - 672 /* output_channels_per_group */, - 112 /* input pixel stride */, - 672 /* output pixel stride */, - w206.data(), w207.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op77); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #77" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op77, xnn_delete_operator); - - xnn_operator_t op78 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op78); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #78" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op78, xnn_delete_operator); - - xnn_operator_t op79 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 672 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 672 /* input pixel stride */, - 672 /* output pixel stride */, - w208.data(), w209.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op79); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #79" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op79, xnn_delete_operator); - - xnn_operator_t op80 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op80); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #80" << std::endl; - return ExecutionPlan(); - } - 
operators.emplace_back(op80, xnn_delete_operator); - - xnn_operator_t op81 = nullptr; - status = xnn_create_global_average_pooling_ncw_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op81); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #81" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op81, xnn_delete_operator); - - xnn_operator_t op82 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 672 /* input channels per group */, - 168 /* output_channels_per_group */, - 672 /* input pixel stride */, - 168 /* output pixel stride */, - w210.data(), w211.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op82); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #82" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op82, xnn_delete_operator); - - xnn_operator_t op83 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 168 /* input channels per group */, - 672 /* output_channels_per_group */, - 168 /* input pixel stride */, - 672 /* output pixel stride */, - w212.data(), w213.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op83); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #83" << std::endl; - return ExecutionPlan(); - } 
- operators.emplace_back(op83, xnn_delete_operator); - - xnn_operator_t op84 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op84); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #84" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op84, xnn_delete_operator); - - xnn_operator_t op85 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 672 /* input channels per group */, - 160 /* output_channels_per_group */, - 672 /* input pixel stride */, - 160 /* output pixel stride */, - w214.data(), w215.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op85); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #85" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op85, xnn_delete_operator); - - xnn_operator_t op86 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - w216.data(), w217.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op86); - if (status != xnn_status_success) 
{ - std::cerr << "failed to create operation #86" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op86, xnn_delete_operator); - - xnn_operator_t op87 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op87); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #87" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op87, xnn_delete_operator); - - xnn_operator_t op88 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, - 960 /* output pixel stride */, - w218.data(), w219.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op88); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #88" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op88, xnn_delete_operator); - - xnn_operator_t op89 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op89); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #89" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op89, xnn_delete_operator); - - xnn_operator_t op90 = nullptr; - status = xnn_create_global_average_pooling_ncw_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op90); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #90" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op90, xnn_delete_operator); - - xnn_operator_t op91 = nullptr; - status = 
xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 240 /* output_channels_per_group */, - 960 /* input pixel stride */, - 240 /* output pixel stride */, - w220.data(), w221.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op91); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #91" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op91, xnn_delete_operator); - - xnn_operator_t op92 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 240 /* input channels per group */, - 960 /* output_channels_per_group */, - 240 /* input pixel stride */, - 960 /* output pixel stride */, - w222.data(), w223.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op92); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #92" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op92, xnn_delete_operator); - - xnn_operator_t op93 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op93); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #93" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op93, xnn_delete_operator); - - xnn_operator_t op94 = 
nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 160 /* output_channels_per_group */, - 960 /* input pixel stride */, - 160 /* output pixel stride */, - w224.data(), w225.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op94); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #94" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op94, xnn_delete_operator); - - xnn_operator_t op95 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op95); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #95" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op95, xnn_delete_operator); - - xnn_operator_t op96 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - w226.data(), w227.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op96); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #96" << std::endl; - return 
ExecutionPlan(); - } - operators.emplace_back(op96, xnn_delete_operator); - - xnn_operator_t op97 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op97); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #97" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op97, xnn_delete_operator); - - xnn_operator_t op98 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, - 960 /* output pixel stride */, - w228.data(), w229.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op98); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #98" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op98, xnn_delete_operator); - - xnn_operator_t op99 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op99); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #99" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op99, xnn_delete_operator); - - xnn_operator_t op100 = nullptr; - status = xnn_create_global_average_pooling_ncw_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op100); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #100" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op100, xnn_delete_operator); - - xnn_operator_t op101 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 
/* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 240 /* output_channels_per_group */, - 960 /* input pixel stride */, - 240 /* output pixel stride */, - w230.data(), w231.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op101); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #101" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op101, xnn_delete_operator); - - xnn_operator_t op102 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 240 /* input channels per group */, - 960 /* output_channels_per_group */, - 240 /* input pixel stride */, - 960 /* output pixel stride */, - w232.data(), w233.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op102); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #102" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op102, xnn_delete_operator); - - xnn_operator_t op103 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op103); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #103" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op103, xnn_delete_operator); - - xnn_operator_t op104 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top 
padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 160 /* output_channels_per_group */, - 960 /* input pixel stride */, - 160 /* output pixel stride */, - w234.data(), w235.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op104); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #104" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op104, xnn_delete_operator); - - xnn_operator_t op105 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op105); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #105" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op105, xnn_delete_operator); - - xnn_operator_t op106 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - w236.data(), w237.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op106); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #106" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op106, 
xnn_delete_operator); - - xnn_operator_t op107 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op107); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #107" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op107, xnn_delete_operator); - - xnn_operator_t op108 = nullptr; - status = xnn_create_global_average_pooling_ncw_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op108); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #108" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op108, xnn_delete_operator); - - xnn_operator_t op109 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 1280 /* output_channels_per_group */, - 960 /* input pixel stride */, - 1280 /* output pixel stride */, - w238.data(), w239.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op109); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #109" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op109, xnn_delete_operator); - - xnn_operator_t op110 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op110); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #110" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op110, xnn_delete_operator); - - xnn_operator_t op111 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* 
flags */, - &op111); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #111" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op111, xnn_delete_operator); - - xnn_operator_t op112 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 1280 /* input channels per group */, - 1001 /* output_channels_per_group */, - 1280 /* input pixel stride */, - 1001 /* output pixel stride */, - w240.data(), w241.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op112); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #112" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op112, xnn_delete_operator); - - status = xnn_reshape_convolution2d_nchw_f32( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op1, - /*batch_size=*/12544, - 16 /* channels */, - 16 /* input stride */, - 16 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape 
operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op3, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 16, 112, 112 }; - const size_t b_shape[] = { 1, 16, 112, 112 }; - status = xnn_reshape_add_nd_f32( - op4, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op5, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op6, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op8, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op9, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op10, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 24, 56, 56 }; - const size_t b_shape[] = { 1, 24, 56, 56 }; - status = xnn_reshape_add_nd_f32( - op11, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op12, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op13, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op14, - /*batch_size=*/1, 784 /* width */, - 72 /* channels */, - /*threadpool=*/threadpool); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op15, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op16, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 72, 28, 28 }; - const size_t b_shape[] = { 1, 72, 1, 1 }; - status = xnn_reshape_multiply_nd_f32( - op17, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op18, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op19, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op20, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - 
/*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op21, - /*batch_size=*/1, 784 /* width */, - 120 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op22, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op23, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 120, 28, 28 }; - const size_t b_shape[] = { 1, 120, 1, 1 }; - status = xnn_reshape_multiply_nd_f32( - op24, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op25, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 40, 28, 28 }; - const size_t b_shape[] = { 1, 40, 28, 28 }; - status = xnn_reshape_add_nd_f32( - op26, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); 
- } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op27, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op28, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op29, - /*batch_size=*/1, 784 /* width */, - 120 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op30, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #30" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op31, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #31" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 120, 28, 28 }; - const size_t b_shape[] = { 1, 120, 1, 1 }; - status = xnn_reshape_multiply_nd_f32( - op32, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if 
(status != xnn_status_success) { - std::cerr << "failed to reshape operation #32" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op33, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #33" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 40, 28, 28 }; - const size_t b_shape[] = { 1, 40, 28, 28 }; - status = xnn_reshape_add_nd_f32( - op34, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op35, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op36, - /*batch_size=*/784, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op37, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op38, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op39, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op40, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op41, - 200 /* channels */, - 200 /* input stride */, - 200 /* output stride */, - /*batch_size=*/196, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #41" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op42, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op43, - /*batch_size=*/196, - 200 /* channels */, - 200 /* input stride */, - 200 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op44, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #44" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 80, 14, 14 }; - const size_t b_shape[] = { 1, 80, 14, 14 }; - status = xnn_reshape_add_nd_f32( - op45, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #45" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op46, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op47, - /*batch_size=*/196, - 184 /* channels */, - 184 /* input stride */, - 184 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op48, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op49, - /*batch_size=*/196, - 184 /* channels */, - 184 /* input stride */, - 184 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #49" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op50, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - 
std::cerr << "failed to reshape operation #50" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 80, 14, 14 }; - const size_t b_shape[] = { 1, 80, 14, 14 }; - status = xnn_reshape_add_nd_f32( - op51, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op52, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op53, - /*batch_size=*/196, - 184 /* channels */, - 184 /* input stride */, - 184 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op54, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op55, - /*batch_size=*/196, - 184 /* channels */, - 184 /* input stride */, - 184 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op56, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to 
reshape operation #56" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 80, 14, 14 }; - const size_t b_shape[] = { 1, 80, 14, 14 }; - status = xnn_reshape_add_nd_f32( - op57, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op58, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op59, - /*batch_size=*/196, - 480 /* channels */, - 480 /* input stride */, - 480 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #59" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op60, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op61, - /*batch_size=*/196, - 480 /* channels */, - 480 /* input stride */, - 480 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #61" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op62, - /*batch_size=*/1, 196 /* width */, - 480 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #62" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_reshape_convolution2d_nchw_f32( - op63, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #63" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op64, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #64" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 480, 14, 14 }; - const size_t b_shape[] = { 1, 480, 1, 1 }; - status = xnn_reshape_multiply_nd_f32( - op65, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #65" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op66, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #66" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op67, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #67" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op68, - /*batch_size=*/196, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #68" << std::endl; - return ExecutionPlan(); - } - 
- status = xnn_reshape_convolution2d_nchw_f32( - op69, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #69" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op70, - /*batch_size=*/196, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #70" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op71, - /*batch_size=*/1, 196 /* width */, - 672 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #71" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op72, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #72" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op73, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #73" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 672, 14, 14 }; - const size_t b_shape[] = { 1, 672, 1, 1 }; - status = xnn_reshape_multiply_nd_f32( - op74, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #74" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( 
- op75, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #75" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 112, 14, 14 }; - const size_t b_shape[] = { 1, 112, 14, 14 }; - status = xnn_reshape_add_nd_f32( - op76, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #76" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op77, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #77" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op78, - /*batch_size=*/196, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #78" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op79, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #79" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op80, - /*batch_size=*/49, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #80" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op81, - 
/*batch_size=*/1, 49 /* width */, - 672 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #81" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op82, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #82" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op83, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #83" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 672, 7, 7 }; - const size_t b_shape[] = { 1, 672, 1, 1 }; - status = xnn_reshape_multiply_nd_f32( - op84, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #84" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op85, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #85" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op86, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #86" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op87, - /*batch_size=*/49, - 960 /* 
channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #87" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op88, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #88" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op89, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #89" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op90, - /*batch_size=*/1, 49 /* width */, - 960 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #90" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op91, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #91" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op92, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #92" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 960, 7, 7 }; - const size_t b_shape[] = { 1, 960, 1, 1 }; - status = xnn_reshape_multiply_nd_f32( - op93, - 4, a_shape, 4, 
b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #93" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op94, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #94" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 160, 7, 7 }; - const size_t b_shape[] = { 1, 160, 7, 7 }; - status = xnn_reshape_add_nd_f32( - op95, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #95" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op96, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #96" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op97, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #97" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op98, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #98" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op99, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - 
/*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #99" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op100, - /*batch_size=*/1, 49 /* width */, - 960 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #100" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op101, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #101" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op102, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #102" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 960, 7, 7 }; - const size_t b_shape[] = { 1, 960, 1, 1 }; - status = xnn_reshape_multiply_nd_f32( - op103, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #103" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op104, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #104" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 160, 7, 7 }; - const size_t b_shape[] = { 1, 160, 7, 7 }; - status = xnn_reshape_add_nd_f32( - op105, - 4, a_shape, 4, b_shape, - 
/*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #105" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op106, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #106" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op107, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #107" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op108, - /*batch_size=*/1, 49 /* width */, - 960 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #108" << std::endl; - return ExecutionPlan(); - } - - size_t op109_workspace_size = 0; - size_t op109_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op109, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op109_workspace_size, &op109_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op109_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #109" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op110, - /*batch_size=*/1, - 1280 /* channels */, - 1280 /* input stride */, - 1280 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #110" << std::endl; - return ExecutionPlan(); - } - - size_t op111_workspace_size = 0; - 
size_t op111_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op111, - /*batch_size=*/1, 1 /* width */, - 1280 /* channels */, 1280 /* input stride */, 1280 /* output stride */, - &op111_workspace_size, &op111_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op111_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #111" << std::endl; - return ExecutionPlan(); - } - - size_t op112_workspace_size = 0; - size_t op112_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op112, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op112_workspace_size, &op112_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op112_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #112" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nchw_f32( - op0, - /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op1, - /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op2, - /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op3, - /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - 
- status = xnn_setup_add_nd_f32( - op4, - v4.data() /* a */, v2.data() /* b */, /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op5, - /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op6, - /*input=*/v6.data(), /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op7, - /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op8, - /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op9, - /*input=*/v9.data(), /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op10, - /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op11, - v11.data() /* a */, v8.data() /* b */, /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op12, - /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << 
"failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op13, - /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f32( - op14, - /*input=*/v14.data(), /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op15, - /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op16, - /*input=*/v16.data(), /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op17, - v14.data() /* a */, v17.data() /* b */, /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op18, - /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op19, - /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op20, - /*input=*/v20.data(), /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_global_average_pooling_ncw_f32( - op21, - /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op22, - /*input=*/v22.data(), /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op23, - /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op24, - v21.data() /* a */, v24.data() /* b */, /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op25, - /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op26, - v26.data() /* a */, v19.data() /* b */, /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op27, - /*input=*/v27.data(), /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op28, - /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f32( - op29, - /*input=*/v29.data(), /*output=*/v30.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op30, - /*input=*/v30.data(), /*output=*/v31.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #30" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op31, - /*input=*/v31.data(), /*output=*/v32.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op32, - v29.data() /* a */, v32.data() /* b */, /*output=*/v33.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #32" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op33, - /*input=*/v33.data(), /*output=*/v34.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #33" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op34, - v34.data() /* a */, v27.data() /* b */, /*output=*/v35.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op35, - /*input=*/v35.data(), /*output=*/v36.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op36, - /*input=*/v36.data(), /*output=*/v37.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op37, - /*input=*/v37.data(), /*output=*/v38.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #37" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_hardswish_nc_f32( - op38, - /*input=*/v38.data(), /*output=*/v39.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op39, - /*input=*/v39.data(), /*output=*/v40.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op40, - /*input=*/v40.data(), /*output=*/v41.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op41, - /*input=*/v41.data(), /*output=*/v42.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #41" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op42, - /*input=*/v42.data(), /*output=*/v43.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op43, - /*input=*/v43.data(), /*output=*/v44.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op44, - /*input=*/v44.data(), /*output=*/v45.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op45, - v45.data() /* a */, v40.data() /* b */, /*output=*/v46.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #45" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op46, - /*input=*/v46.data(), /*output=*/v47.data()); - if (status != xnn_status_success) { - std::cerr << "failed to 
setup operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op47, - /*input=*/v47.data(), /*output=*/v48.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op48, - /*input=*/v48.data(), /*output=*/v49.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op49, - /*input=*/v49.data(), /*output=*/v50.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #49" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op50, - /*input=*/v50.data(), /*output=*/v51.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #50" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op51, - v51.data() /* a */, v46.data() /* b */, /*output=*/v52.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op52, - /*input=*/v52.data(), /*output=*/v53.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op53, - /*input=*/v53.data(), /*output=*/v54.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op54, - /*input=*/v54.data(), /*output=*/v55.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op55, - /*input=*/v55.data(), 
/*output=*/v56.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op56, - /*input=*/v56.data(), /*output=*/v57.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #56" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op57, - v57.data() /* a */, v52.data() /* b */, /*output=*/v58.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op58, - /*input=*/v58.data(), /*output=*/v59.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op59, - /*input=*/v59.data(), /*output=*/v60.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #59" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op60, - /*input=*/v60.data(), /*output=*/v61.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op61, - /*input=*/v61.data(), /*output=*/v62.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #61" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f32( - op62, - /*input=*/v62.data(), /*output=*/v63.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #62" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op63, - /*input=*/v63.data(), /*output=*/v64.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #63" << std::endl; - return 
ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op64, - /*input=*/v64.data(), /*output=*/v65.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #64" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op65, - v62.data() /* a */, v65.data() /* b */, /*output=*/v66.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #65" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op66, - /*input=*/v66.data(), /*output=*/v67.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #66" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op67, - /*input=*/v67.data(), /*output=*/v68.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #67" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op68, - /*input=*/v68.data(), /*output=*/v69.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #68" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op69, - /*input=*/v69.data(), /*output=*/v70.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #69" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op70, - /*input=*/v70.data(), /*output=*/v71.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #70" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f32( - op71, - /*input=*/v71.data(), /*output=*/v72.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #71" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op72, - /*input=*/v72.data(), /*output=*/v73.data()); - if (status 
!= xnn_status_success) { - std::cerr << "failed to setup operation #72" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op73, - /*input=*/v73.data(), /*output=*/v74.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #73" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op74, - v71.data() /* a */, v74.data() /* b */, /*output=*/v75.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #74" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op75, - /*input=*/v75.data(), /*output=*/v76.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #75" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op76, - v76.data() /* a */, v67.data() /* b */, /*output=*/v77.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #76" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op77, - /*input=*/v77.data(), /*output=*/v78.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #77" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op78, - /*input=*/v78.data(), /*output=*/v79.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #78" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op79, - /*input=*/v79.data(), /*output=*/v80.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #79" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op80, - /*input=*/v80.data(), /*output=*/v81.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #80" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_global_average_pooling_ncw_f32( - op81, - /*input=*/v81.data(), /*output=*/v82.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #81" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op82, - /*input=*/v82.data(), /*output=*/v83.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #82" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op83, - /*input=*/v83.data(), /*output=*/v84.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #83" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op84, - v81.data() /* a */, v84.data() /* b */, /*output=*/v85.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #84" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op85, - /*input=*/v85.data(), /*output=*/v86.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #85" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op86, - /*input=*/v86.data(), /*output=*/v87.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #86" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op87, - /*input=*/v87.data(), /*output=*/v88.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #87" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op88, - /*input=*/v88.data(), /*output=*/v89.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #88" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op89, - /*input=*/v89.data(), /*output=*/v90.data()); - if (status != xnn_status_success) { - 
std::cerr << "failed to setup operation #89" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f32( - op90, - /*input=*/v90.data(), /*output=*/v91.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #90" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op91, - /*input=*/v91.data(), /*output=*/v92.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #91" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op92, - /*input=*/v92.data(), /*output=*/v93.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #92" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op93, - v90.data() /* a */, v93.data() /* b */, /*output=*/v94.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #93" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op94, - /*input=*/v94.data(), /*output=*/v95.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #94" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op95, - v95.data() /* a */, v86.data() /* b */, /*output=*/v96.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #95" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op96, - /*input=*/v96.data(), /*output=*/v97.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #96" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op97, - /*input=*/v97.data(), /*output=*/v98.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #97" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nchw_f32( - op98, - /*input=*/v98.data(), /*output=*/v99.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #98" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op99, - /*input=*/v99.data(), /*output=*/v100.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #99" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f32( - op100, - /*input=*/v100.data(), /*output=*/v101.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #100" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op101, - /*input=*/v101.data(), /*output=*/v102.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #101" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op102, - /*input=*/v102.data(), /*output=*/v103.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #102" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op103, - v100.data() /* a */, v103.data() /* b */, /*output=*/v104.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #103" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op104, - /*input=*/v104.data(), /*output=*/v105.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #104" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op105, - v105.data() /* a */, v96.data() /* b */, /*output=*/v106.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #105" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op106, - /*input=*/v106.data(), /*output=*/v107.data()); - if 
(status != xnn_status_success) { - std::cerr << "failed to setup operation #106" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op107, - /*input=*/v107.data(), /*output=*/v108.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #107" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f32( - op108, - /*input=*/v108.data(), /*output=*/v109.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #108" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op109, - workspace.data(), - /*input=*/v109.data(), /*output=*/v110.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #109" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op110, - /*input=*/v110.data(), /*output=*/v111.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #110" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op111, - workspace.data(), - /*input=*/v111.data(), /*output=*/v112.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #111" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op112, - workspace.data(), - /*input=*/v112.data(), /*output=*/v113.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #112" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -} // namespace models diff --git a/models/fp32-sparse-mobilenet-v3-small.cc b/models/fp32-sparse-mobilenet-v3-small.cc deleted file mode 100644 index cdb84b9ace2..00000000000 --- 
a/models/fp32-sparse-mobilenet-v3-small.cc +++ /dev/null @@ -1,4189 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan FP32SparseMobileNetV3Small(float sparsity, pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static std::array v30; - alignas(16) static std::array v31; - alignas(16) static std::array v32; - alignas(16) static std::array v33; - alignas(16) static std::array v34; - alignas(16) static std::array v35; - alignas(16) static std::array v36; - alignas(16) static std::array v37; - alignas(16) static std::array v38; - alignas(16) static std::array v39; - 
alignas(16) static std::array v40; - alignas(16) static std::array v41; - alignas(16) static std::array v42; - alignas(16) static std::array v43; - alignas(16) static std::array v44; - alignas(16) static std::array v45; - alignas(16) static std::array v46; - alignas(16) static std::array v47; - alignas(16) static std::array v48; - alignas(16) static std::array v49; - alignas(16) static std::array v50; - alignas(16) static std::array v51; - alignas(16) static std::array v52; - alignas(16) static std::array v53; - alignas(16) static std::array v54; - alignas(16) static std::array v55; - alignas(16) static std::array v56; - alignas(16) static std::array v57; - alignas(16) static std::array v58; - alignas(16) static std::array v59; - alignas(16) static std::array v60; - alignas(16) static std::array v61; - alignas(16) static std::array v62; - alignas(16) static std::array v63; - alignas(16) static std::array v64; - alignas(16) static std::array v65; - alignas(16) static std::array v66; - alignas(16) static std::array v67; - alignas(16) static std::array v68; - alignas(16) static std::array v69; - alignas(16) static std::array v70; - alignas(16) static std::array v71; - alignas(16) static std::array v72; - alignas(16) static std::array v73; - alignas(16) static std::array v74; - alignas(16) static std::array v75; - alignas(16) static std::array v76; - alignas(16) static std::array v77; - alignas(16) static std::array v78; - alignas(16) static std::array v79; - alignas(16) static std::array v80; - alignas(16) static std::array v81; - alignas(16) static std::array v82; - alignas(16) static std::array v83; - alignas(16) static std::array v84; - alignas(16) static std::array v85; - alignas(16) static std::array v86; - alignas(16) static std::array v87; - alignas(16) static std::array v88; - alignas(16) static std::array v89; - alignas(16) static std::array v90; - alignas(16) static std::array v91; - alignas(16) static std::array v92; - alignas(16) static std::array v93; - 
alignas(16) static std::array v94; - alignas(16) static std::array v95; - alignas(16) static std::array v96; - alignas(16) static std::array v97; - alignas(16) static std::array v98; - alignas(16) static std::array v99; - alignas(16) static std::array w100; - alignas(16) static std::array w101; - alignas(16) static std::array w102; - alignas(16) static std::array w103; - alignas(16) static std::array w104; - alignas(16) static std::array w105; - alignas(16) static std::array w106; - alignas(16) static std::array w107; - alignas(16) static std::array w108; - alignas(16) static std::array w109; - alignas(16) static std::array w110; - alignas(16) static std::array w111; - alignas(16) static std::array w112; - alignas(16) static std::array w113; - alignas(16) static std::array w114; - alignas(16) static std::array w115; - alignas(16) static std::array w116; - alignas(16) static std::array w117; - alignas(16) static std::array w118; - alignas(16) static std::array w119; - alignas(16) static std::array w120; - alignas(16) static std::array w121; - alignas(16) static std::array w122; - alignas(16) static std::array w123; - alignas(16) static std::array w124; - alignas(16) static std::array w125; - alignas(16) static std::array w126; - alignas(16) static std::array w127; - alignas(16) static std::array w128; - alignas(16) static std::array w129; - alignas(16) static std::array w130; - alignas(16) static std::array w131; - alignas(16) static std::array w132; - alignas(16) static std::array w133; - alignas(16) static std::array w134; - alignas(16) static std::array w135; - alignas(16) static std::array w136; - alignas(16) static std::array w137; - alignas(16) static std::array w138; - alignas(16) static std::array w139; - alignas(16) static std::array w140; - alignas(16) static std::array w141; - alignas(16) static std::array w142; - alignas(16) static std::array w143; - alignas(16) static std::array w144; - alignas(16) static std::array w145; - alignas(16) static std::array 
w146; - alignas(16) static std::array w147; - alignas(16) static std::array w148; - alignas(16) static std::array w149; - alignas(16) static std::array w150; - alignas(16) static std::array w151; - alignas(16) static std::array w152; - alignas(16) static std::array w153; - alignas(16) static std::array w154; - alignas(16) static std::array w155; - alignas(16) static std::array w156; - alignas(16) static std::array w157; - alignas(16) static std::array w158; - alignas(16) static std::array w159; - alignas(16) static std::array w160; - alignas(16) static std::array w161; - alignas(16) static std::array w162; - alignas(16) static std::array w163; - alignas(16) static std::array w164; - alignas(16) static std::array w165; - alignas(16) static std::array w166; - alignas(16) static std::array w167; - alignas(16) static std::array w168; - alignas(16) static std::array w169; - alignas(16) static std::array w170; - alignas(16) static std::array w171; - alignas(16) static std::array w172; - alignas(16) static std::array w173; - alignas(16) static std::array w174; - alignas(16) static std::array w175; - alignas(16) static std::array w176; - alignas(16) static std::array w177; - alignas(16) static std::array w178; - alignas(16) static std::array w179; - alignas(16) static std::array w180; - alignas(16) static std::array w181; - alignas(16) static std::array w182; - alignas(16) static std::array w183; - alignas(16) static std::array w184; - alignas(16) static std::array w185; - alignas(16) static std::array w186; - alignas(16) static std::array w187; - alignas(16) static std::array w188; - alignas(16) static std::array w189; - alignas(16) static std::array w190; - alignas(16) static std::array w191; - alignas(16) static std::array w192; - alignas(16) static std::array w193; - alignas(16) static std::array w194; - alignas(16) static std::array w195; - alignas(16) static std::array w196; - alignas(16) static std::array w197; - alignas(16) static std::array w198; - alignas(16) 
static std::array w199; - alignas(16) static std::array w200; - alignas(16) static std::array w201; - alignas(16) static std::array w202; - alignas(16) static std::array w203; - alignas(16) static std::array w204; - alignas(16) static std::array w205; - alignas(16) static std::array w206; - alignas(16) static std::array w207; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); - std::generate(v0.begin(), v0.end(), std::ref(f32rng)); - std::generate(v1.begin(), v1.end(), std::ref(f32rng)); - std::generate(v2.begin(), v2.end(), std::ref(f32rng)); - std::generate(v3.begin(), v3.end(), std::ref(f32rng)); - std::generate(v4.begin(), v4.end(), std::ref(f32rng)); - std::generate(v5.begin(), v5.end(), std::ref(f32rng)); - std::generate(v6.begin(), v6.end(), std::ref(f32rng)); - std::generate(v7.begin(), v7.end(), std::ref(f32rng)); - std::generate(v8.begin(), v8.end(), std::ref(f32rng)); - std::generate(v9.begin(), v9.end(), std::ref(f32rng)); - std::generate(v10.begin(), v10.end(), std::ref(f32rng)); - std::generate(v11.begin(), v11.end(), std::ref(f32rng)); - std::generate(v12.begin(), v12.end(), std::ref(f32rng)); - std::generate(v13.begin(), v13.end(), std::ref(f32rng)); - std::generate(v14.begin(), v14.end(), std::ref(f32rng)); - std::generate(v15.begin(), v15.end(), std::ref(f32rng)); - std::generate(v16.begin(), v16.end(), std::ref(f32rng)); - std::generate(v17.begin(), v17.end(), std::ref(f32rng)); - std::generate(v18.begin(), v18.end(), std::ref(f32rng)); - std::generate(v19.begin(), v19.end(), std::ref(f32rng)); - std::generate(v20.begin(), v20.end(), std::ref(f32rng)); - std::generate(v21.begin(), v21.end(), std::ref(f32rng)); - std::generate(v22.begin(), v22.end(), std::ref(f32rng)); - std::generate(v23.begin(), v23.end(), std::ref(f32rng)); - std::generate(v24.begin(), v24.end(), std::ref(f32rng)); - std::generate(v25.begin(), v25.end(), 
std::ref(f32rng)); - std::generate(v26.begin(), v26.end(), std::ref(f32rng)); - std::generate(v27.begin(), v27.end(), std::ref(f32rng)); - std::generate(v28.begin(), v28.end(), std::ref(f32rng)); - std::generate(v29.begin(), v29.end(), std::ref(f32rng)); - std::generate(v30.begin(), v30.end(), std::ref(f32rng)); - std::generate(v31.begin(), v31.end(), std::ref(f32rng)); - std::generate(v32.begin(), v32.end(), std::ref(f32rng)); - std::generate(v33.begin(), v33.end(), std::ref(f32rng)); - std::generate(v34.begin(), v34.end(), std::ref(f32rng)); - std::generate(v35.begin(), v35.end(), std::ref(f32rng)); - std::generate(v36.begin(), v36.end(), std::ref(f32rng)); - std::generate(v37.begin(), v37.end(), std::ref(f32rng)); - std::generate(v38.begin(), v38.end(), std::ref(f32rng)); - std::generate(v39.begin(), v39.end(), std::ref(f32rng)); - std::generate(v40.begin(), v40.end(), std::ref(f32rng)); - std::generate(v41.begin(), v41.end(), std::ref(f32rng)); - std::generate(v42.begin(), v42.end(), std::ref(f32rng)); - std::generate(v43.begin(), v43.end(), std::ref(f32rng)); - std::generate(v44.begin(), v44.end(), std::ref(f32rng)); - std::generate(v45.begin(), v45.end(), std::ref(f32rng)); - std::generate(v46.begin(), v46.end(), std::ref(f32rng)); - std::generate(v47.begin(), v47.end(), std::ref(f32rng)); - std::generate(v48.begin(), v48.end(), std::ref(f32rng)); - std::generate(v49.begin(), v49.end(), std::ref(f32rng)); - std::generate(v50.begin(), v50.end(), std::ref(f32rng)); - std::generate(v51.begin(), v51.end(), std::ref(f32rng)); - std::generate(v52.begin(), v52.end(), std::ref(f32rng)); - std::generate(v53.begin(), v53.end(), std::ref(f32rng)); - std::generate(v54.begin(), v54.end(), std::ref(f32rng)); - std::generate(v55.begin(), v55.end(), std::ref(f32rng)); - std::generate(v56.begin(), v56.end(), std::ref(f32rng)); - std::generate(v57.begin(), v57.end(), std::ref(f32rng)); - std::generate(v58.begin(), v58.end(), std::ref(f32rng)); - std::generate(v59.begin(), 
v59.end(), std::ref(f32rng)); - std::generate(v60.begin(), v60.end(), std::ref(f32rng)); - std::generate(v61.begin(), v61.end(), std::ref(f32rng)); - std::generate(v62.begin(), v62.end(), std::ref(f32rng)); - std::generate(v63.begin(), v63.end(), std::ref(f32rng)); - std::generate(v64.begin(), v64.end(), std::ref(f32rng)); - std::generate(v65.begin(), v65.end(), std::ref(f32rng)); - std::generate(v66.begin(), v66.end(), std::ref(f32rng)); - std::generate(v67.begin(), v67.end(), std::ref(f32rng)); - std::generate(v68.begin(), v68.end(), std::ref(f32rng)); - std::generate(v69.begin(), v69.end(), std::ref(f32rng)); - std::generate(v70.begin(), v70.end(), std::ref(f32rng)); - std::generate(v71.begin(), v71.end(), std::ref(f32rng)); - std::generate(v72.begin(), v72.end(), std::ref(f32rng)); - std::generate(v73.begin(), v73.end(), std::ref(f32rng)); - std::generate(v74.begin(), v74.end(), std::ref(f32rng)); - std::generate(v75.begin(), v75.end(), std::ref(f32rng)); - std::generate(v76.begin(), v76.end(), std::ref(f32rng)); - std::generate(v77.begin(), v77.end(), std::ref(f32rng)); - std::generate(v78.begin(), v78.end(), std::ref(f32rng)); - std::generate(v79.begin(), v79.end(), std::ref(f32rng)); - std::generate(v80.begin(), v80.end(), std::ref(f32rng)); - std::generate(v81.begin(), v81.end(), std::ref(f32rng)); - std::generate(v82.begin(), v82.end(), std::ref(f32rng)); - std::generate(v83.begin(), v83.end(), std::ref(f32rng)); - std::generate(v84.begin(), v84.end(), std::ref(f32rng)); - std::generate(v85.begin(), v85.end(), std::ref(f32rng)); - std::generate(v86.begin(), v86.end(), std::ref(f32rng)); - std::generate(v87.begin(), v87.end(), std::ref(f32rng)); - std::generate(v88.begin(), v88.end(), std::ref(f32rng)); - std::generate(v89.begin(), v89.end(), std::ref(f32rng)); - std::generate(v90.begin(), v90.end(), std::ref(f32rng)); - std::generate(v91.begin(), v91.end(), std::ref(f32rng)); - std::generate(v92.begin(), v92.end(), std::ref(f32rng)); - 
std::generate(v93.begin(), v93.end(), std::ref(f32rng)); - std::generate(v94.begin(), v94.end(), std::ref(f32rng)); - std::generate(v95.begin(), v95.end(), std::ref(f32rng)); - std::generate(v96.begin(), v96.end(), std::ref(f32rng)); - std::generate(v97.begin(), v97.end(), std::ref(f32rng)); - std::generate(v98.begin(), v98.end(), std::ref(f32rng)); - std::generate(v99.begin(), v99.end(), std::ref(f32rng)); - std::generate(w100.begin(), w100.end(), std::ref(f32rng)); - std::generate(w101.begin(), w101.end(), std::ref(f32rng)); - std::generate(w102.begin(), w102.end(), std::ref(f32rng)); - std::generate(w103.begin(), w103.end(), std::ref(f32rng)); - std::fill(w104.begin(), w104.end(), 0.0f); - std::generate(w104.begin(), w104.end() - size_t(sparsity * w104.size()), std::ref(f32rng)); - std::shuffle(w104.begin(), w104.end(), rng); - std::generate(w105.begin(), w105.end(), std::ref(f32rng)); - std::fill(w106.begin(), w106.end(), 0.0f); - std::generate(w106.begin(), w106.end() - size_t(sparsity * w106.size()), std::ref(f32rng)); - std::shuffle(w106.begin(), w106.end(), rng); - std::generate(w107.begin(), w107.end(), std::ref(f32rng)); - std::fill(w108.begin(), w108.end(), 0.0f); - std::generate(w108.begin(), w108.end() - size_t(sparsity * w108.size()), std::ref(f32rng)); - std::shuffle(w108.begin(), w108.end(), rng); - std::generate(w109.begin(), w109.end(), std::ref(f32rng)); - std::fill(w110.begin(), w110.end(), 0.0f); - std::generate(w110.begin(), w110.end() - size_t(sparsity * w110.size()), std::ref(f32rng)); - std::shuffle(w110.begin(), w110.end(), rng); - std::generate(w111.begin(), w111.end(), std::ref(f32rng)); - std::generate(w112.begin(), w112.end(), std::ref(f32rng)); - std::generate(w113.begin(), w113.end(), std::ref(f32rng)); - std::fill(w114.begin(), w114.end(), 0.0f); - std::generate(w114.begin(), w114.end() - size_t(sparsity * w114.size()), std::ref(f32rng)); - std::shuffle(w114.begin(), w114.end(), rng); - std::generate(w115.begin(), w115.end(), 
std::ref(f32rng)); - std::fill(w116.begin(), w116.end(), 0.0f); - std::generate(w116.begin(), w116.end() - size_t(sparsity * w116.size()), std::ref(f32rng)); - std::shuffle(w116.begin(), w116.end(), rng); - std::generate(w117.begin(), w117.end(), std::ref(f32rng)); - std::generate(w118.begin(), w118.end(), std::ref(f32rng)); - std::generate(w119.begin(), w119.end(), std::ref(f32rng)); - std::fill(w120.begin(), w120.end(), 0.0f); - std::generate(w120.begin(), w120.end() - size_t(sparsity * w120.size()), std::ref(f32rng)); - std::shuffle(w120.begin(), w120.end(), rng); - std::generate(w121.begin(), w121.end(), std::ref(f32rng)); - std::fill(w122.begin(), w122.end(), 0.0f); - std::generate(w122.begin(), w122.end() - size_t(sparsity * w122.size()), std::ref(f32rng)); - std::shuffle(w122.begin(), w122.end(), rng); - std::generate(w123.begin(), w123.end(), std::ref(f32rng)); - std::generate(w124.begin(), w124.end(), std::ref(f32rng)); - std::generate(w125.begin(), w125.end(), std::ref(f32rng)); - std::fill(w126.begin(), w126.end(), 0.0f); - std::generate(w126.begin(), w126.end() - size_t(sparsity * w126.size()), std::ref(f32rng)); - std::shuffle(w126.begin(), w126.end(), rng); - std::generate(w127.begin(), w127.end(), std::ref(f32rng)); - std::fill(w128.begin(), w128.end(), 0.0f); - std::generate(w128.begin(), w128.end() - size_t(sparsity * w128.size()), std::ref(f32rng)); - std::shuffle(w128.begin(), w128.end(), rng); - std::generate(w129.begin(), w129.end(), std::ref(f32rng)); - std::fill(w130.begin(), w130.end(), 0.0f); - std::generate(w130.begin(), w130.end() - size_t(sparsity * w130.size()), std::ref(f32rng)); - std::shuffle(w130.begin(), w130.end(), rng); - std::generate(w131.begin(), w131.end(), std::ref(f32rng)); - std::fill(w132.begin(), w132.end(), 0.0f); - std::generate(w132.begin(), w132.end() - size_t(sparsity * w132.size()), std::ref(f32rng)); - std::shuffle(w132.begin(), w132.end(), rng); - std::generate(w133.begin(), w133.end(), std::ref(f32rng)); - 
std::generate(w134.begin(), w134.end(), std::ref(f32rng)); - std::generate(w135.begin(), w135.end(), std::ref(f32rng)); - std::fill(w136.begin(), w136.end(), 0.0f); - std::generate(w136.begin(), w136.end() - size_t(sparsity * w136.size()), std::ref(f32rng)); - std::shuffle(w136.begin(), w136.end(), rng); - std::generate(w137.begin(), w137.end(), std::ref(f32rng)); - std::fill(w138.begin(), w138.end(), 0.0f); - std::generate(w138.begin(), w138.end() - size_t(sparsity * w138.size()), std::ref(f32rng)); - std::shuffle(w138.begin(), w138.end(), rng); - std::generate(w139.begin(), w139.end(), std::ref(f32rng)); - std::fill(w140.begin(), w140.end(), 0.0f); - std::generate(w140.begin(), w140.end() - size_t(sparsity * w140.size()), std::ref(f32rng)); - std::shuffle(w140.begin(), w140.end(), rng); - std::generate(w141.begin(), w141.end(), std::ref(f32rng)); - std::fill(w142.begin(), w142.end(), 0.0f); - std::generate(w142.begin(), w142.end() - size_t(sparsity * w142.size()), std::ref(f32rng)); - std::shuffle(w142.begin(), w142.end(), rng); - std::generate(w143.begin(), w143.end(), std::ref(f32rng)); - std::generate(w144.begin(), w144.end(), std::ref(f32rng)); - std::generate(w145.begin(), w145.end(), std::ref(f32rng)); - std::fill(w146.begin(), w146.end(), 0.0f); - std::generate(w146.begin(), w146.end() - size_t(sparsity * w146.size()), std::ref(f32rng)); - std::shuffle(w146.begin(), w146.end(), rng); - std::generate(w147.begin(), w147.end(), std::ref(f32rng)); - std::fill(w148.begin(), w148.end(), 0.0f); - std::generate(w148.begin(), w148.end() - size_t(sparsity * w148.size()), std::ref(f32rng)); - std::shuffle(w148.begin(), w148.end(), rng); - std::generate(w149.begin(), w149.end(), std::ref(f32rng)); - std::fill(w150.begin(), w150.end(), 0.0f); - std::generate(w150.begin(), w150.end() - size_t(sparsity * w150.size()), std::ref(f32rng)); - std::shuffle(w150.begin(), w150.end(), rng); - std::generate(w151.begin(), w151.end(), std::ref(f32rng)); - std::fill(w152.begin(), 
w152.end(), 0.0f); - std::generate(w152.begin(), w152.end() - size_t(sparsity * w152.size()), std::ref(f32rng)); - std::shuffle(w152.begin(), w152.end(), rng); - std::generate(w153.begin(), w153.end(), std::ref(f32rng)); - std::generate(w154.begin(), w154.end(), std::ref(f32rng)); - std::generate(w155.begin(), w155.end(), std::ref(f32rng)); - std::fill(w156.begin(), w156.end(), 0.0f); - std::generate(w156.begin(), w156.end() - size_t(sparsity * w156.size()), std::ref(f32rng)); - std::shuffle(w156.begin(), w156.end(), rng); - std::generate(w157.begin(), w157.end(), std::ref(f32rng)); - std::fill(w158.begin(), w158.end(), 0.0f); - std::generate(w158.begin(), w158.end() - size_t(sparsity * w158.size()), std::ref(f32rng)); - std::shuffle(w158.begin(), w158.end(), rng); - std::generate(w159.begin(), w159.end(), std::ref(f32rng)); - std::fill(w160.begin(), w160.end(), 0.0f); - std::generate(w160.begin(), w160.end() - size_t(sparsity * w160.size()), std::ref(f32rng)); - std::shuffle(w160.begin(), w160.end(), rng); - std::generate(w161.begin(), w161.end(), std::ref(f32rng)); - std::fill(w162.begin(), w162.end(), 0.0f); - std::generate(w162.begin(), w162.end() - size_t(sparsity * w162.size()), std::ref(f32rng)); - std::shuffle(w162.begin(), w162.end(), rng); - std::generate(w163.begin(), w163.end(), std::ref(f32rng)); - std::generate(w164.begin(), w164.end(), std::ref(f32rng)); - std::generate(w165.begin(), w165.end(), std::ref(f32rng)); - std::fill(w166.begin(), w166.end(), 0.0f); - std::generate(w166.begin(), w166.end() - size_t(sparsity * w166.size()), std::ref(f32rng)); - std::shuffle(w166.begin(), w166.end(), rng); - std::generate(w167.begin(), w167.end(), std::ref(f32rng)); - std::fill(w168.begin(), w168.end(), 0.0f); - std::generate(w168.begin(), w168.end() - size_t(sparsity * w168.size()), std::ref(f32rng)); - std::shuffle(w168.begin(), w168.end(), rng); - std::generate(w169.begin(), w169.end(), std::ref(f32rng)); - std::fill(w170.begin(), w170.end(), 0.0f); - 
std::generate(w170.begin(), w170.end() - size_t(sparsity * w170.size()), std::ref(f32rng)); - std::shuffle(w170.begin(), w170.end(), rng); - std::generate(w171.begin(), w171.end(), std::ref(f32rng)); - std::fill(w172.begin(), w172.end(), 0.0f); - std::generate(w172.begin(), w172.end() - size_t(sparsity * w172.size()), std::ref(f32rng)); - std::shuffle(w172.begin(), w172.end(), rng); - std::generate(w173.begin(), w173.end(), std::ref(f32rng)); - std::generate(w174.begin(), w174.end(), std::ref(f32rng)); - std::generate(w175.begin(), w175.end(), std::ref(f32rng)); - std::fill(w176.begin(), w176.end(), 0.0f); - std::generate(w176.begin(), w176.end() - size_t(sparsity * w176.size()), std::ref(f32rng)); - std::shuffle(w176.begin(), w176.end(), rng); - std::generate(w177.begin(), w177.end(), std::ref(f32rng)); - std::fill(w178.begin(), w178.end(), 0.0f); - std::generate(w178.begin(), w178.end() - size_t(sparsity * w178.size()), std::ref(f32rng)); - std::shuffle(w178.begin(), w178.end(), rng); - std::generate(w179.begin(), w179.end(), std::ref(f32rng)); - std::fill(w180.begin(), w180.end(), 0.0f); - std::generate(w180.begin(), w180.end() - size_t(sparsity * w180.size()), std::ref(f32rng)); - std::shuffle(w180.begin(), w180.end(), rng); - std::generate(w181.begin(), w181.end(), std::ref(f32rng)); - std::fill(w182.begin(), w182.end(), 0.0f); - std::generate(w182.begin(), w182.end() - size_t(sparsity * w182.size()), std::ref(f32rng)); - std::shuffle(w182.begin(), w182.end(), rng); - std::generate(w183.begin(), w183.end(), std::ref(f32rng)); - std::generate(w184.begin(), w184.end(), std::ref(f32rng)); - std::generate(w185.begin(), w185.end(), std::ref(f32rng)); - std::fill(w186.begin(), w186.end(), 0.0f); - std::generate(w186.begin(), w186.end() - size_t(sparsity * w186.size()), std::ref(f32rng)); - std::shuffle(w186.begin(), w186.end(), rng); - std::generate(w187.begin(), w187.end(), std::ref(f32rng)); - std::fill(w188.begin(), w188.end(), 0.0f); - 
std::generate(w188.begin(), w188.end() - size_t(sparsity * w188.size()), std::ref(f32rng)); - std::shuffle(w188.begin(), w188.end(), rng); - std::generate(w189.begin(), w189.end(), std::ref(f32rng)); - std::fill(w190.begin(), w190.end(), 0.0f); - std::generate(w190.begin(), w190.end() - size_t(sparsity * w190.size()), std::ref(f32rng)); - std::shuffle(w190.begin(), w190.end(), rng); - std::generate(w191.begin(), w191.end(), std::ref(f32rng)); - std::fill(w192.begin(), w192.end(), 0.0f); - std::generate(w192.begin(), w192.end() - size_t(sparsity * w192.size()), std::ref(f32rng)); - std::shuffle(w192.begin(), w192.end(), rng); - std::generate(w193.begin(), w193.end(), std::ref(f32rng)); - std::generate(w194.begin(), w194.end(), std::ref(f32rng)); - std::generate(w195.begin(), w195.end(), std::ref(f32rng)); - std::fill(w196.begin(), w196.end(), 0.0f); - std::generate(w196.begin(), w196.end() - size_t(sparsity * w196.size()), std::ref(f32rng)); - std::shuffle(w196.begin(), w196.end(), rng); - std::generate(w197.begin(), w197.end(), std::ref(f32rng)); - std::fill(w198.begin(), w198.end(), 0.0f); - std::generate(w198.begin(), w198.end() - size_t(sparsity * w198.size()), std::ref(f32rng)); - std::shuffle(w198.begin(), w198.end(), rng); - std::generate(w199.begin(), w199.end(), std::ref(f32rng)); - std::fill(w200.begin(), w200.end(), 0.0f); - std::generate(w200.begin(), w200.end() - size_t(sparsity * w200.size()), std::ref(f32rng)); - std::shuffle(w200.begin(), w200.end(), rng); - std::generate(w201.begin(), w201.end(), std::ref(f32rng)); - std::fill(w202.begin(), w202.end(), 0.0f); - std::generate(w202.begin(), w202.end() - size_t(sparsity * w202.size()), std::ref(f32rng)); - std::shuffle(w202.begin(), w202.end(), rng); - std::generate(w203.begin(), w203.end(), std::ref(f32rng)); - std::fill(w204.begin(), w204.end(), 0.0f); - std::generate(w204.begin(), w204.end() - size_t(sparsity * w204.size()), std::ref(f32rng)); - std::shuffle(w204.begin(), w204.end(), rng); - 
std::generate(w205.begin(), w205.end(), std::ref(f32rng)); - std::generate(w206.begin(), w206.end(), std::ref(f32rng)); - std::generate(w207.begin(), w207.end(), std::ref(f32rng)); - - Operators operators; - xnn_status status; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 3 /* input channels per group */, - 16 /* output_channels_per_group */, - 3 /* input pixel stride */, - 16 /* output pixel stride */, - w100.data(), w101.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - XNN_FLAG_INPUT_NHWC /* flags */, - nullptr, - nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 16 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 16 /* input pixel stride */, - 16 /* output pixel stride */, - w102.data(), w103.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* 
flags */, - nullptr, - nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_global_average_pooling_ncw_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 16 /* input channels per group */, - 8 /* output_channels_per_group */, - 16 /* input pixel stride */, - 8 /* output pixel stride */, - w104.data(), w105.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 8 /* input channels per group */, - 16 /* output_channels_per_group */, - 8 /* input pixel stride */, - 16 /* output pixel stride */, - w106.data(), w107.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - 
nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 16 /* input channels per group */, - 16 /* output_channels_per_group */, - 16 /* input pixel stride */, - 16 /* output pixel stride */, - w108.data(), w109.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 16 /* input channels per group */, - 72 /* output_channels_per_group */, - 16 /* input pixel stride */, - 72 /* output pixel stride */, - w110.data(), w111.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* 
output max */, - 0 /* flags */, - nullptr, - nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 72 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 72 /* input pixel stride */, - 72 /* output pixel stride */, - w112.data(), w113.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 72 /* input channels per group */, - 24 /* output_channels_per_group */, - 72 /* input pixel stride */, - 24 /* output pixel stride */, - w114.data(), w115.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* 
top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 88 /* output_channels_per_group */, - 24 /* input pixel stride */, - 88 /* output pixel stride */, - w116.data(), w117.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 88 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 88 /* input pixel stride */, - 88 /* output pixel stride */, - w118.data(), w119.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 88 /* input channels per group */, - 24 /* output_channels_per_group */, - 88 /* input pixel 
stride */, - 24 /* output pixel stride */, - w120.data(), w121.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 96 /* output_channels_per_group */, - 24 /* input pixel stride */, - 96 /* output pixel stride */, - w122.data(), w123.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = 
xnn_create_convolution2d_nchw_f32( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 96 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 96 /* input pixel stride */, - 96 /* output pixel stride */, - w124.data(), w125.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_global_average_pooling_ncw_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 24 /* output_channels_per_group */, - 96 /* input pixel stride */, - 24 /* output pixel stride */, - w126.data(), w127.data(), - 0.0f /* output min */, 
std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 96 /* output_channels_per_group */, - 24 /* input pixel stride */, - 96 /* output pixel stride */, - w128.data(), w129.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 40 /* output_channels_per_group */, - 96 /* input pixel stride */, - 40 /* output pixel stride */, - w130.data(), w131.data(), - 
-std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 240 /* output_channels_per_group */, - 40 /* input pixel stride */, - 240 /* output pixel stride */, - w132.data(), w133.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 240 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 240 /* input pixel stride */, - 240 /* output pixel stride */, - w134.data(), w135.data(), - 
-std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_global_average_pooling_ncw_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - xnn_operator_t op29 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 240 /* input channels per group */, - 64 /* output_channels_per_group */, - 240 /* input pixel stride */, - 64 /* output pixel stride */, - w136.data(), w137.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op29); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #29" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op29, xnn_delete_operator); - - xnn_operator_t op30 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel 
height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 240 /* output_channels_per_group */, - 64 /* input pixel stride */, - 240 /* output pixel stride */, - w138.data(), w139.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op30); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #30" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op30, xnn_delete_operator); - - xnn_operator_t op31 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op31); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #31" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op31, xnn_delete_operator); - - xnn_operator_t op32 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 240 /* input channels per group */, - 40 /* output_channels_per_group */, - 240 /* input pixel stride */, - 40 /* output pixel stride */, - w140.data(), w141.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op32); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #32" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op32, xnn_delete_operator); - - xnn_operator_t op33 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, 
std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op33); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #33" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op33, xnn_delete_operator); - - xnn_operator_t op34 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 240 /* output_channels_per_group */, - 40 /* input pixel stride */, - 240 /* output pixel stride */, - w142.data(), w143.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op34); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #34" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op34, xnn_delete_operator); - - xnn_operator_t op35 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op35); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #35" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op35, xnn_delete_operator); - - xnn_operator_t op36 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 240 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 240 /* input pixel stride */, - 240 /* output pixel stride */, - w144.data(), w145.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* 
output max */, - 0 /* flags */, - nullptr, - nullptr, - &op36); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #36" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op36, xnn_delete_operator); - - xnn_operator_t op37 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op37); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #37" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op37, xnn_delete_operator); - - xnn_operator_t op38 = nullptr; - status = xnn_create_global_average_pooling_ncw_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op38); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #38" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op38, xnn_delete_operator); - - xnn_operator_t op39 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 240 /* input channels per group */, - 64 /* output_channels_per_group */, - 240 /* input pixel stride */, - 64 /* output pixel stride */, - w146.data(), w147.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op39); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #39" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op39, xnn_delete_operator); - - xnn_operator_t op40 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width 
*/, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 240 /* output_channels_per_group */, - 64 /* input pixel stride */, - 240 /* output pixel stride */, - w148.data(), w149.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op40); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #40" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op40, xnn_delete_operator); - - xnn_operator_t op41 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op41); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #41" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op41, xnn_delete_operator); - - xnn_operator_t op42 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 240 /* input channels per group */, - 40 /* output_channels_per_group */, - 240 /* input pixel stride */, - 40 /* output pixel stride */, - w150.data(), w151.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op42); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #42" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op42, xnn_delete_operator); - - xnn_operator_t op43 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op43); - if (status != 
xnn_status_success) { - std::cerr << "failed to create operation #43" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op43, xnn_delete_operator); - - xnn_operator_t op44 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 120 /* output_channels_per_group */, - 40 /* input pixel stride */, - 120 /* output pixel stride */, - w152.data(), w153.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op44); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #44" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op44, xnn_delete_operator); - - xnn_operator_t op45 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op45); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #45" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op45, xnn_delete_operator); - - xnn_operator_t op46 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 120 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 120 /* input pixel stride */, - 120 /* output pixel stride */, - w154.data(), w155.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op46); - if (status != 
xnn_status_success) { - std::cerr << "failed to create operation #46" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op46, xnn_delete_operator); - - xnn_operator_t op47 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op47); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #47" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op47, xnn_delete_operator); - - xnn_operator_t op48 = nullptr; - status = xnn_create_global_average_pooling_ncw_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op48); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #48" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op48, xnn_delete_operator); - - xnn_operator_t op49 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 32 /* output_channels_per_group */, - 120 /* input pixel stride */, - 32 /* output pixel stride */, - w156.data(), w157.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op49); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #49" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op49, xnn_delete_operator); - - xnn_operator_t op50 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* 
input channels per group */, - 120 /* output_channels_per_group */, - 32 /* input pixel stride */, - 120 /* output pixel stride */, - w158.data(), w159.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op50); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #50" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op50, xnn_delete_operator); - - xnn_operator_t op51 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op51); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #51" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op51, xnn_delete_operator); - - xnn_operator_t op52 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 120 /* input channels per group */, - 48 /* output_channels_per_group */, - 120 /* input pixel stride */, - 48 /* output pixel stride */, - w160.data(), w161.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op52); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #52" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op52, xnn_delete_operator); - - xnn_operator_t op53 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* 
dilation_width */, - 1 /* groups */, - 48 /* input channels per group */, - 144 /* output_channels_per_group */, - 48 /* input pixel stride */, - 144 /* output pixel stride */, - w162.data(), w163.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op53); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #53" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op53, xnn_delete_operator); - - xnn_operator_t op54 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op54); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #54" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op54, xnn_delete_operator); - - xnn_operator_t op55 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 144 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 144 /* input pixel stride */, - 144 /* output pixel stride */, - w164.data(), w165.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op55); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #55" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op55, xnn_delete_operator); - - xnn_operator_t op56 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op56); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #56" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op56, xnn_delete_operator); - - xnn_operator_t op57 = nullptr; - 
status = xnn_create_global_average_pooling_ncw_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op57); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #57" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op57, xnn_delete_operator); - - xnn_operator_t op58 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 40 /* output_channels_per_group */, - 144 /* input pixel stride */, - 40 /* output pixel stride */, - w166.data(), w167.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op58); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #58" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op58, xnn_delete_operator); - - xnn_operator_t op59 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 40 /* input channels per group */, - 144 /* output_channels_per_group */, - 40 /* input pixel stride */, - 144 /* output pixel stride */, - w168.data(), w169.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op59); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #59" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op59, xnn_delete_operator); - - xnn_operator_t op60 = nullptr; - 
status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op60); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #60" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op60, xnn_delete_operator); - - xnn_operator_t op61 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 48 /* output_channels_per_group */, - 144 /* input pixel stride */, - 48 /* output pixel stride */, - w170.data(), w171.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op61); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #61" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op61, xnn_delete_operator); - - xnn_operator_t op62 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op62); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #62" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op62, xnn_delete_operator); - - xnn_operator_t op63 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 48 /* input channels per group */, - 288 /* 
output_channels_per_group */, - 48 /* input pixel stride */, - 288 /* output pixel stride */, - w172.data(), w173.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op63); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #63" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op63, xnn_delete_operator); - - xnn_operator_t op64 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op64); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #64" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op64, xnn_delete_operator); - - xnn_operator_t op65 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 288 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 288 /* input pixel stride */, - 288 /* output pixel stride */, - w174.data(), w175.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op65); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #65" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op65, xnn_delete_operator); - - xnn_operator_t op66 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op66); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #66" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op66, xnn_delete_operator); - - xnn_operator_t op67 = nullptr; - status = xnn_create_global_average_pooling_ncw_f32( - 
-std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op67); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #67" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op67, xnn_delete_operator); - - xnn_operator_t op68 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 288 /* input channels per group */, - 72 /* output_channels_per_group */, - 288 /* input pixel stride */, - 72 /* output pixel stride */, - w176.data(), w177.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op68); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #68" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op68, xnn_delete_operator); - - xnn_operator_t op69 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 72 /* input channels per group */, - 288 /* output_channels_per_group */, - 72 /* input pixel stride */, - 288 /* output pixel stride */, - w178.data(), w179.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op69); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #69" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op69, xnn_delete_operator); - - xnn_operator_t op70 = nullptr; - status = xnn_create_multiply_nd_f32( - 
-std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op70); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #70" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op70, xnn_delete_operator); - - xnn_operator_t op71 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 288 /* input channels per group */, - 96 /* output_channels_per_group */, - 288 /* input pixel stride */, - 96 /* output pixel stride */, - w180.data(), w181.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op71); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #71" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op71, xnn_delete_operator); - - xnn_operator_t op72 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - w182.data(), w183.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op72); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #72" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op72, 
xnn_delete_operator); - - xnn_operator_t op73 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op73); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #73" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op73, xnn_delete_operator); - - xnn_operator_t op74 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - w184.data(), w185.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op74); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #74" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op74, xnn_delete_operator); - - xnn_operator_t op75 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op75); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #75" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op75, xnn_delete_operator); - - xnn_operator_t op76 = nullptr; - status = xnn_create_global_average_pooling_ncw_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op76); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #76" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op76, xnn_delete_operator); - - xnn_operator_t op77 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel 
height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 144 /* output_channels_per_group */, - 576 /* input pixel stride */, - 144 /* output pixel stride */, - w186.data(), w187.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op77); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #77" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op77, xnn_delete_operator); - - xnn_operator_t op78 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 576 /* output_channels_per_group */, - 144 /* input pixel stride */, - 576 /* output pixel stride */, - w188.data(), w189.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op78); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #78" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op78, xnn_delete_operator); - - xnn_operator_t op79 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op79); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #79" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op79, xnn_delete_operator); - - xnn_operator_t op80 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left 
padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 96 /* output_channels_per_group */, - 576 /* input pixel stride */, - 96 /* output pixel stride */, - w190.data(), w191.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op80); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #80" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op80, xnn_delete_operator); - - xnn_operator_t op81 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op81); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #81" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op81, xnn_delete_operator); - - xnn_operator_t op82 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - w192.data(), w193.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op82); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #82" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op82, xnn_delete_operator); - - xnn_operator_t op83 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 
/* flags */, - &op83); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #83" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op83, xnn_delete_operator); - - xnn_operator_t op84 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 2 /* top padding */, 2 /* right padding */, - 2 /* bottom padding */, 2 /* left padding */, - 5 /* kernel height */, 5 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - w194.data(), w195.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op84); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #84" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op84, xnn_delete_operator); - - xnn_operator_t op85 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op85); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #85" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op85, xnn_delete_operator); - - xnn_operator_t op86 = nullptr; - status = xnn_create_global_average_pooling_ncw_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op86); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #86" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op86, xnn_delete_operator); - - xnn_operator_t op87 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* 
dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 144 /* output_channels_per_group */, - 576 /* input pixel stride */, - 144 /* output pixel stride */, - w196.data(), w197.data(), - 0.0f /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op87); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #87" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op87, xnn_delete_operator); - - xnn_operator_t op88 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 576 /* output_channels_per_group */, - 144 /* input pixel stride */, - 576 /* output pixel stride */, - w198.data(), w199.data(), - 0.0f /* output min */, +0x1.00014Fp+0 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op88); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #88" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op88, xnn_delete_operator); - - xnn_operator_t op89 = nullptr; - status = xnn_create_multiply_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op89); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #89" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op89, xnn_delete_operator); - - xnn_operator_t op90 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* 
subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 96 /* output_channels_per_group */, - 576 /* input pixel stride */, - 96 /* output pixel stride */, - w200.data(), w201.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op90); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #90" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op90, xnn_delete_operator); - - xnn_operator_t op91 = nullptr; - status = xnn_create_add_nd_f32( - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - &op91); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #91" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op91, xnn_delete_operator); - - xnn_operator_t op92 = nullptr; - status = xnn_create_convolution2d_nchw_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - w202.data(), w203.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op92); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #92" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op92, xnn_delete_operator); - - xnn_operator_t op93 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op93); - if (status != xnn_status_success) { - std::cerr << "failed to create 
operation #93" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op93, xnn_delete_operator); - - xnn_operator_t op94 = nullptr; - status = xnn_create_global_average_pooling_ncw_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op94); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #94" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op94, xnn_delete_operator); - - xnn_operator_t op95 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 1024 /* output_channels_per_group */, - 576 /* input pixel stride */, - 1024 /* output pixel stride */, - w204.data(), w205.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op95); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #95" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op95, xnn_delete_operator); - - xnn_operator_t op96 = nullptr; - status = xnn_create_hardswish_nc_f32( - 0 /* flags */, - &op96); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #96" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op96, xnn_delete_operator); - - xnn_operator_t op97 = nullptr; - status = xnn_create_global_average_pooling_nwc_f32( - -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0 /* flags */, - &op97); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #97" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op97, xnn_delete_operator); - - 
xnn_operator_t op98 = nullptr; - status = xnn_create_convolution2d_nhwc_f32( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 1024 /* input channels per group */, - 1001 /* output_channels_per_group */, - 1024 /* input pixel stride */, - 1001 /* output pixel stride */, - w206.data(), w207.data(), - -std::numeric_limits::infinity() /* output min */, std::numeric_limits::infinity() /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op98); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #98" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op98, xnn_delete_operator); - - status = xnn_reshape_convolution2d_nchw_f32( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op1, - /*batch_size=*/12544, - 16 /* channels */, - 16 /* input stride */, - 16 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op3, - /*batch_size=*/1, 3136 /* width */, - 16 /* channels */, - /*threadpool=*/threadpool); - 
if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op4, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op5, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 16, 56, 56 }; - const size_t b_shape[] = { 1, 16, 1, 1 }; - status = xnn_reshape_multiply_nd_f32( - op6, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op8, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op9, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - 
/*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op10, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op11, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op12, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op13, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 24, 28, 28 }; - const size_t b_shape[] = { 1, 24, 28, 28 }; - status = xnn_reshape_add_nd_f32( - op14, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op15, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - 
/*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op16, - /*batch_size=*/784, - 96 /* channels */, - 96 /* input stride */, - 96 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op17, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op18, - /*batch_size=*/196, - 96 /* channels */, - 96 /* input stride */, - 96 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op19, - /*batch_size=*/1, 196 /* width */, - 96 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op20, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op21, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - 
/*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 96, 14, 14 }; - const size_t b_shape[] = { 1, 96, 1, 1 }; - status = xnn_reshape_multiply_nd_f32( - op22, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op23, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op24, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op25, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op26, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op27, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - 
/*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op28, - /*batch_size=*/1, 196 /* width */, - 240 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op29, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op30, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #30" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 240, 14, 14 }; - const size_t b_shape[] = { 1, 240, 1, 1 }; - status = xnn_reshape_multiply_nd_f32( - op31, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op32, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #32" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 40, 14, 14 }; - const size_t b_shape[] = { 1, 40, 14, 14 }; - status = xnn_reshape_add_nd_f32( - op33, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); 
- } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #33" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op34, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op35, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op36, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op37, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op38, - /*batch_size=*/1, 196 /* width */, - 240 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op39, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to 
reshape operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op40, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #40" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 240, 14, 14 }; - const size_t b_shape[] = { 1, 240, 1, 1 }; - status = xnn_reshape_multiply_nd_f32( - op41, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #41" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op42, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #42" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 40, 14, 14 }; - const size_t b_shape[] = { 1, 40, 14, 14 }; - status = xnn_reshape_add_nd_f32( - op43, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op44, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op45, - /*batch_size=*/196, - 120 /* channels */, - 120 /* input stride */, - 120 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed 
to reshape operation #45" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op46, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op47, - /*batch_size=*/196, - 120 /* channels */, - 120 /* input stride */, - 120 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op48, - /*batch_size=*/1, 196 /* width */, - 120 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op49, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #49" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op50, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #50" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 120, 14, 14 }; - const size_t b_shape[] = { 1, 120, 1, 1 }; - status = xnn_reshape_multiply_nd_f32( - op51, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #51" << std::endl; - 
return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op52, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op53, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op54, - /*batch_size=*/196, - 144 /* channels */, - 144 /* input stride */, - 144 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op55, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op56, - /*batch_size=*/196, - 144 /* channels */, - 144 /* input stride */, - 144 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #56" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op57, - /*batch_size=*/1, 196 /* width */, - 144 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #57" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_reshape_convolution2d_nchw_f32( - op58, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op59, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #59" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 144, 14, 14 }; - const size_t b_shape[] = { 1, 144, 1, 1 }; - status = xnn_reshape_multiply_nd_f32( - op60, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op61, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #61" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 48, 14, 14 }; - const size_t b_shape[] = { 1, 48, 14, 14 }; - status = xnn_reshape_add_nd_f32( - op62, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #62" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op63, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #63" << std::endl; - return 
ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op64, - /*batch_size=*/196, - 288 /* channels */, - 288 /* input stride */, - 288 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #64" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op65, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #65" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op66, - /*batch_size=*/49, - 288 /* channels */, - 288 /* input stride */, - 288 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #66" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op67, - /*batch_size=*/1, 49 /* width */, - 288 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #67" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op68, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #68" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op69, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #69" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 288, 7, 7 }; - 
const size_t b_shape[] = { 1, 288, 1, 1 }; - status = xnn_reshape_multiply_nd_f32( - op70, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #70" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op71, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #71" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op72, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #72" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op73, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #73" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op74, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #74" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op75, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #75" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op76, - 
/*batch_size=*/1, 49 /* width */, - 576 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #76" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op77, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #77" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op78, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #78" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 576, 7, 7 }; - const size_t b_shape[] = { 1, 576, 1, 1 }; - status = xnn_reshape_multiply_nd_f32( - op79, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #79" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op80, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #80" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 96, 7, 7 }; - const size_t b_shape[] = { 1, 96, 7, 7 }; - status = xnn_reshape_add_nd_f32( - op81, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #81" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op82, - /*batch_size=*/1, /*input_height=*/7, 
/*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #82" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op83, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #83" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op84, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #84" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op85, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #85" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op86, - /*batch_size=*/1, 49 /* width */, - 576 /* channels */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #86" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op87, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #87" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op88, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - /*output_height_out=*/nullptr, 
/*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #88" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 576, 7, 7 }; - const size_t b_shape[] = { 1, 576, 1, 1 }; - status = xnn_reshape_multiply_nd_f32( - op89, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #89" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op90, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #90" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 96, 7, 7 }; - const size_t b_shape[] = { 1, 96, 7, 7 }; - status = xnn_reshape_add_nd_f32( - op91, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #91" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_convolution2d_nchw_f32( - op92, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #92" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op93, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #93" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_global_average_pooling_ncw_f32( - op94, - /*batch_size=*/1, 49 /* width */, - 576 /* channels */, - /*threadpool=*/threadpool); - if 
(status != xnn_status_success) { - std::cerr << "failed to reshape operation #94" << std::endl; - return ExecutionPlan(); - } - - size_t op95_workspace_size = 0; - size_t op95_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op95, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op95_workspace_size, &op95_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op95_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #95" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_hardswish_nc_f32( - op96, - /*batch_size=*/1, - 1024 /* channels */, - 1024 /* input stride */, - 1024 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #96" << std::endl; - return ExecutionPlan(); - } - - size_t op97_workspace_size = 0; - size_t op97_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_f32( - op97, - /*batch_size=*/1, 1 /* width */, - 1024 /* channels */, 1024 /* input stride */, 1024 /* output stride */, - &op97_workspace_size, &op97_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op97_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #97" << std::endl; - return ExecutionPlan(); - } - - size_t op98_workspace_size = 0; - size_t op98_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - op98, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op98_workspace_size, &op98_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op98_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation 
#98" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nchw_f32( - op0, - /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op1, - /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op2, - /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f32( - op3, - /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op4, - /*input=*/v4.data(), /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op5, - /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op6, - v3.data() /* a */, v6.data() /* b */, /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op7, - /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op8, - 
/*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op9, - /*input=*/v9.data(), /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op10, - /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op11, - /*input=*/v11.data(), /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op12, - /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op13, - /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op14, - v14.data() /* a */, v11.data() /* b */, /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op15, - /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op16, - /*input=*/v16.data(), /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - 
return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op17, - /*input=*/v17.data(), /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op18, - /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f32( - op19, - /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op20, - /*input=*/v20.data(), /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op21, - /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op22, - v19.data() /* a */, v22.data() /* b */, /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op23, - /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op24, - /*input=*/v24.data(), /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op25, - /*input=*/v25.data(), /*output=*/v26.data()); - if 
(status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op26, - /*input=*/v26.data(), /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op27, - /*input=*/v27.data(), /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f32( - op28, - /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op29, - /*input=*/v29.data(), /*output=*/v30.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op30, - /*input=*/v30.data(), /*output=*/v31.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #30" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op31, - v28.data() /* a */, v31.data() /* b */, /*output=*/v32.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op32, - /*input=*/v32.data(), /*output=*/v33.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #32" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op33, - v33.data() /* a */, v24.data() /* b */, /*output=*/v34.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #33" << std::endl; - return ExecutionPlan(); - } 
- - status = xnn_setup_convolution2d_nchw_f32( - op34, - /*input=*/v34.data(), /*output=*/v35.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op35, - /*input=*/v35.data(), /*output=*/v36.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op36, - /*input=*/v36.data(), /*output=*/v37.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op37, - /*input=*/v37.data(), /*output=*/v38.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f32( - op38, - /*input=*/v38.data(), /*output=*/v39.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op39, - /*input=*/v39.data(), /*output=*/v40.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op40, - /*input=*/v40.data(), /*output=*/v41.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op41, - v38.data() /* a */, v41.data() /* b */, /*output=*/v42.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #41" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op42, - /*input=*/v42.data(), /*output=*/v43.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op43, - v43.data() /* a */, v34.data() /* b */, /*output=*/v44.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op44, - /*input=*/v44.data(), /*output=*/v45.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op45, - /*input=*/v45.data(), /*output=*/v46.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #45" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op46, - /*input=*/v46.data(), /*output=*/v47.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op47, - /*input=*/v47.data(), /*output=*/v48.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f32( - op48, - /*input=*/v48.data(), /*output=*/v49.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op49, - /*input=*/v49.data(), /*output=*/v50.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #49" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op50, - /*input=*/v50.data(), /*output=*/v51.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #50" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_multiply_nd_f32( - op51, - v48.data() /* a */, v51.data() /* b */, /*output=*/v52.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op52, - /*input=*/v52.data(), /*output=*/v53.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op53, - /*input=*/v53.data(), /*output=*/v54.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op54, - /*input=*/v54.data(), /*output=*/v55.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op55, - /*input=*/v55.data(), /*output=*/v56.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op56, - /*input=*/v56.data(), /*output=*/v57.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #56" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f32( - op57, - /*input=*/v57.data(), /*output=*/v58.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op58, - /*input=*/v58.data(), /*output=*/v59.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op59, - /*input=*/v59.data(), /*output=*/v60.data()); - if (status != xnn_status_success) { - 
std::cerr << "failed to setup operation #59" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op60, - v57.data() /* a */, v60.data() /* b */, /*output=*/v61.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op61, - /*input=*/v61.data(), /*output=*/v62.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #61" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op62, - v62.data() /* a */, v53.data() /* b */, /*output=*/v63.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #62" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op63, - /*input=*/v63.data(), /*output=*/v64.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #63" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op64, - /*input=*/v64.data(), /*output=*/v65.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #64" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op65, - /*input=*/v65.data(), /*output=*/v66.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #65" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op66, - /*input=*/v66.data(), /*output=*/v67.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #66" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f32( - op67, - /*input=*/v67.data(), /*output=*/v68.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #67" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nchw_f32( - op68, - /*input=*/v68.data(), /*output=*/v69.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #68" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op69, - /*input=*/v69.data(), /*output=*/v70.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #69" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op70, - v67.data() /* a */, v70.data() /* b */, /*output=*/v71.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #70" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op71, - /*input=*/v71.data(), /*output=*/v72.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #71" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op72, - /*input=*/v72.data(), /*output=*/v73.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #72" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op73, - /*input=*/v73.data(), /*output=*/v74.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #73" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op74, - /*input=*/v74.data(), /*output=*/v75.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #74" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op75, - /*input=*/v75.data(), /*output=*/v76.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #75" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f32( - op76, - /*input=*/v76.data(), /*output=*/v77.data()); - if (status != xnn_status_success) { - 
std::cerr << "failed to setup operation #76" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op77, - /*input=*/v77.data(), /*output=*/v78.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #77" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op78, - /*input=*/v78.data(), /*output=*/v79.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #78" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op79, - v76.data() /* a */, v79.data() /* b */, /*output=*/v80.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #79" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op80, - /*input=*/v80.data(), /*output=*/v81.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #80" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op81, - v81.data() /* a */, v72.data() /* b */, /*output=*/v82.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #81" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op82, - /*input=*/v82.data(), /*output=*/v83.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #82" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op83, - /*input=*/v83.data(), /*output=*/v84.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #83" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op84, - /*input=*/v84.data(), /*output=*/v85.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #84" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - 
op85, - /*input=*/v85.data(), /*output=*/v86.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #85" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f32( - op86, - /*input=*/v86.data(), /*output=*/v87.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #86" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op87, - /*input=*/v87.data(), /*output=*/v88.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #87" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op88, - /*input=*/v88.data(), /*output=*/v89.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #88" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_f32( - op89, - v86.data() /* a */, v89.data() /* b */, /*output=*/v90.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #89" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op90, - /*input=*/v90.data(), /*output=*/v91.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #90" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_f32( - op91, - v91.data() /* a */, v82.data() /* b */, /*output=*/v92.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #91" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nchw_f32( - op92, - /*input=*/v92.data(), /*output=*/v93.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #92" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op93, - /*input=*/v93.data(), /*output=*/v94.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup 
operation #93" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_ncw_f32( - op94, - /*input=*/v94.data(), /*output=*/v95.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #94" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op95, - workspace.data(), - /*input=*/v95.data(), /*output=*/v96.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #95" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_hardswish_nc_f32( - op96, - /*input=*/v96.data(), /*output=*/v97.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #96" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_f32( - op97, - workspace.data(), - /*input=*/v97.data(), /*output=*/v98.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #97" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_f32( - op98, - workspace.data(), - /*input=*/v98.data(), /*output=*/v99.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #98" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -} // namespace models diff --git a/models/qs8-mobilenet-v1.cc b/models/qs8-mobilenet-v1.cc deleted file mode 100644 index 61ee916549a..00000000000 --- a/models/qs8-mobilenet-v1.cc +++ /dev/null @@ -1,1564 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan QS8MobileNetV1(pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static std::array w30; - alignas(16) static std::array w31; - alignas(16) static std::array w32; - alignas(16) static std::array w33; - alignas(16) static std::array w34; - alignas(16) static std::array w35; - alignas(16) static std::array w36; - alignas(16) static std::array w37; - alignas(16) static std::array w38; - alignas(16) static std::array w39; - alignas(16) static std::array w40; - alignas(16) static std::array w41; - alignas(16) static std::array w42; - alignas(16) static std::array w43; - alignas(16) static std::array w44; - alignas(16) static std::array w45; - alignas(16) static std::array w46; - alignas(16) 
static std::array w47; - alignas(16) static std::array w48; - alignas(16) static std::array w49; - alignas(16) static std::array w50; - alignas(16) static std::array w51; - alignas(16) static std::array w52; - alignas(16) static std::array w53; - alignas(16) static std::array w54; - alignas(16) static std::array w55; - alignas(16) static std::array w56; - alignas(16) static std::array w57; - alignas(16) static std::array w58; - alignas(16) static std::array w59; - alignas(16) static std::array w60; - alignas(16) static std::array w61; - alignas(16) static std::array w62; - alignas(16) static std::array w63; - alignas(16) static std::array w64; - alignas(16) static std::array w65; - alignas(16) static std::array w66; - alignas(16) static std::array w67; - alignas(16) static std::array w68; - alignas(16) static std::array w69; - alignas(16) static std::array w70; - alignas(16) static std::array w71; - alignas(16) static std::array w72; - alignas(16) static std::array w73; - alignas(16) static std::array w74; - alignas(16) static std::array w75; - alignas(16) static std::array w76; - alignas(16) static std::array w77; - alignas(16) static std::array w78; - alignas(16) static std::array w79; - alignas(16) static std::array w80; - alignas(16) static std::array w81; - alignas(16) static std::array w82; - alignas(16) static std::array w83; - alignas(16) static std::array w84; - alignas(16) static std::array w85; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto i8rng = std::bind(std::uniform_int_distribution(-127, 127), std::ref(rng)); - auto i32rng = std::bind(std::uniform_int_distribution(-10000, 10000), std::ref(rng)); - std::generate(v0.begin(), v0.end(), std::ref(i8rng)); - std::generate(v1.begin(), v1.end(), std::ref(i8rng)); - std::generate(v2.begin(), v2.end(), std::ref(i8rng)); - std::generate(v3.begin(), v3.end(), std::ref(i8rng)); - std::generate(v4.begin(), v4.end(), std::ref(i8rng)); - std::generate(v5.begin(), v5.end(), 
std::ref(i8rng)); - std::generate(v6.begin(), v6.end(), std::ref(i8rng)); - std::generate(v7.begin(), v7.end(), std::ref(i8rng)); - std::generate(v8.begin(), v8.end(), std::ref(i8rng)); - std::generate(v9.begin(), v9.end(), std::ref(i8rng)); - std::generate(v10.begin(), v10.end(), std::ref(i8rng)); - std::generate(v11.begin(), v11.end(), std::ref(i8rng)); - std::generate(v12.begin(), v12.end(), std::ref(i8rng)); - std::generate(v13.begin(), v13.end(), std::ref(i8rng)); - std::generate(v14.begin(), v14.end(), std::ref(i8rng)); - std::generate(v15.begin(), v15.end(), std::ref(i8rng)); - std::generate(v16.begin(), v16.end(), std::ref(i8rng)); - std::generate(v17.begin(), v17.end(), std::ref(i8rng)); - std::generate(v18.begin(), v18.end(), std::ref(i8rng)); - std::generate(v19.begin(), v19.end(), std::ref(i8rng)); - std::generate(v20.begin(), v20.end(), std::ref(i8rng)); - std::generate(v21.begin(), v21.end(), std::ref(i8rng)); - std::generate(v22.begin(), v22.end(), std::ref(i8rng)); - std::generate(v23.begin(), v23.end(), std::ref(i8rng)); - std::generate(v24.begin(), v24.end(), std::ref(i8rng)); - std::generate(v25.begin(), v25.end(), std::ref(i8rng)); - std::generate(v26.begin(), v26.end(), std::ref(i8rng)); - std::generate(v27.begin(), v27.end(), std::ref(i8rng)); - std::generate(v28.begin(), v28.end(), std::ref(i8rng)); - std::generate(v29.begin(), v29.end(), std::ref(i8rng)); - std::generate(w30.begin(), w30.end(), std::ref(i8rng)); - std::generate(w31.begin(), w31.end(), std::ref(i32rng)); - std::generate(w32.begin(), w32.end(), std::ref(i8rng)); - std::generate(w33.begin(), w33.end(), std::ref(i32rng)); - std::generate(w34.begin(), w34.end(), std::ref(i8rng)); - std::generate(w35.begin(), w35.end(), std::ref(i32rng)); - std::generate(w36.begin(), w36.end(), std::ref(i8rng)); - std::generate(w37.begin(), w37.end(), std::ref(i32rng)); - std::generate(w38.begin(), w38.end(), std::ref(i8rng)); - std::generate(w39.begin(), w39.end(), std::ref(i32rng)); - 
std::generate(w40.begin(), w40.end(), std::ref(i8rng)); - std::generate(w41.begin(), w41.end(), std::ref(i32rng)); - std::generate(w42.begin(), w42.end(), std::ref(i8rng)); - std::generate(w43.begin(), w43.end(), std::ref(i32rng)); - std::generate(w44.begin(), w44.end(), std::ref(i8rng)); - std::generate(w45.begin(), w45.end(), std::ref(i32rng)); - std::generate(w46.begin(), w46.end(), std::ref(i8rng)); - std::generate(w47.begin(), w47.end(), std::ref(i32rng)); - std::generate(w48.begin(), w48.end(), std::ref(i8rng)); - std::generate(w49.begin(), w49.end(), std::ref(i32rng)); - std::generate(w50.begin(), w50.end(), std::ref(i8rng)); - std::generate(w51.begin(), w51.end(), std::ref(i32rng)); - std::generate(w52.begin(), w52.end(), std::ref(i8rng)); - std::generate(w53.begin(), w53.end(), std::ref(i32rng)); - std::generate(w54.begin(), w54.end(), std::ref(i8rng)); - std::generate(w55.begin(), w55.end(), std::ref(i32rng)); - std::generate(w56.begin(), w56.end(), std::ref(i8rng)); - std::generate(w57.begin(), w57.end(), std::ref(i32rng)); - std::generate(w58.begin(), w58.end(), std::ref(i8rng)); - std::generate(w59.begin(), w59.end(), std::ref(i32rng)); - std::generate(w60.begin(), w60.end(), std::ref(i8rng)); - std::generate(w61.begin(), w61.end(), std::ref(i32rng)); - std::generate(w62.begin(), w62.end(), std::ref(i8rng)); - std::generate(w63.begin(), w63.end(), std::ref(i32rng)); - std::generate(w64.begin(), w64.end(), std::ref(i8rng)); - std::generate(w65.begin(), w65.end(), std::ref(i32rng)); - std::generate(w66.begin(), w66.end(), std::ref(i8rng)); - std::generate(w67.begin(), w67.end(), std::ref(i32rng)); - std::generate(w68.begin(), w68.end(), std::ref(i8rng)); - std::generate(w69.begin(), w69.end(), std::ref(i32rng)); - std::generate(w70.begin(), w70.end(), std::ref(i8rng)); - std::generate(w71.begin(), w71.end(), std::ref(i32rng)); - std::generate(w72.begin(), w72.end(), std::ref(i8rng)); - std::generate(w73.begin(), w73.end(), std::ref(i32rng)); - 
std::generate(w74.begin(), w74.end(), std::ref(i8rng)); - std::generate(w75.begin(), w75.end(), std::ref(i32rng)); - std::generate(w76.begin(), w76.end(), std::ref(i8rng)); - std::generate(w77.begin(), w77.end(), std::ref(i32rng)); - std::generate(w78.begin(), w78.end(), std::ref(i8rng)); - std::generate(w79.begin(), w79.end(), std::ref(i32rng)); - std::generate(w80.begin(), w80.end(), std::ref(i8rng)); - std::generate(w81.begin(), w81.end(), std::ref(i32rng)); - std::generate(w82.begin(), w82.end(), std::ref(i8rng)); - std::generate(w83.begin(), w83.end(), std::ref(i32rng)); - std::generate(w84.begin(), w84.end(), std::ref(i8rng)); - std::generate(w85.begin(), w85.end(), std::ref(i32rng)); - - Operators operators; - xnn_status status; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 3 /* input channels per group */, - 32 /* output_channels_per_group */, - 3 /* input pixel stride */, - 32 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w30.data(), w31.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 
1 /* dilation_width */, - 32 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 32 /* input pixel stride */, - 32 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w32.data(), w33.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 64 /* output_channels_per_group */, - 32 /* input pixel stride */, - 64 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w34.data(), w35.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 64 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 64 /* input pixel stride */, - 64 /* output 
pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w36.data(), w37.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 128 /* output_channels_per_group */, - 64 /* input pixel stride */, - 128 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w38.data(), w39.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 128 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 128 /* input pixel stride */, - 128 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w40.data(), w41.data(), - -1 /* output zero point */, 0.5f /* 
output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 128 /* input channels per group */, - 128 /* output_channels_per_group */, - 128 /* input pixel stride */, - 128 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w42.data(), w43.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 128 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 128 /* input pixel stride */, - 128 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w44.data(), w45.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op7); - if (status != xnn_status_success) { - 
std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 128 /* input channels per group */, - 256 /* output_channels_per_group */, - 128 /* input pixel stride */, - 256 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w46.data(), w47.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 256 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 256 /* input pixel stride */, - 256 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w48.data(), w49.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - 
xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 256 /* input channels per group */, - 256 /* output_channels_per_group */, - 256 /* input pixel stride */, - 256 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w50.data(), w51.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 256 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 256 /* input pixel stride */, - 256 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w52.data(), w53.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 
0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 256 /* input channels per group */, - 512 /* output_channels_per_group */, - 256 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w54.data(), w55.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w56.data(), w57.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height 
*/, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w58.data(), w59.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w60.data(), w61.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel 
stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w62.data(), w63.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w64.data(), w65.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w66.data(), w67.data(), - 
-1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w68.data(), w69.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w70.data(), w71.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op20); - 
if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w72.data(), w73.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w74.data(), w75.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - 
operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w76.data(), w77.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 1024 /* output_channels_per_group */, - 512 /* input pixel stride */, - 1024 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w78.data(), w79.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top 
padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1024 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 1024 /* input pixel stride */, - 1024 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w80.data(), w81.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 1024 /* input channels per group */, - 1024 /* output_channels_per_group */, - 1024 /* input pixel stride */, - 1024 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w82.data(), w83.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_global_average_pooling_nwc_qs8( - -1 /* input zero point */, 0.5f /* input scale */, - -1 /* output zero point */, 0.5f /* output scale */, - -126 /* output min */, 126 /* 
output max */, - 0 /* flags */, - &op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 1024 /* input channels per group */, - 1001 /* output_channels_per_group */, - 1024 /* input pixel stride */, - 1001 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w84.data(), w85.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - size_t op0_workspace_size = 0; - size_t op0_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - &op0_workspace_size, &op0_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op0_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - size_t op1_workspace_size = 0; - size_t op1_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op1, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op1_workspace_size, &op1_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - 
/*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op1_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - size_t op2_workspace_size = 0; - size_t op2_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op2_workspace_size, &op2_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op2_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #2" << std::endl; - return ExecutionPlan(); - } - - size_t op3_workspace_size = 0; - size_t op3_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op3, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op3_workspace_size, &op3_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op3_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - size_t op4_workspace_size = 0; - size_t op4_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op4, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op4_workspace_size, &op4_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op4_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - size_t op5_workspace_size = 0; - size_t op5_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op5, - /*batch_size=*/1, 
/*input_height=*/56, /*input_width=*/56, - &op5_workspace_size, &op5_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op5_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - size_t op6_workspace_size = 0; - size_t op6_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op6, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op6_workspace_size, &op6_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op6_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - size_t op7_workspace_size = 0; - size_t op7_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op7_workspace_size, &op7_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op7_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - size_t op8_workspace_size = 0; - size_t op8_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op8, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op8_workspace_size, &op8_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op8_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - size_t 
op9_workspace_size = 0; - size_t op9_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op9, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op9_workspace_size, &op9_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op9_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - size_t op10_workspace_size = 0; - size_t op10_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op10, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op10_workspace_size, &op10_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op10_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - size_t op11_workspace_size = 0; - size_t op11_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op11, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op11_workspace_size, &op11_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op11_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - size_t op12_workspace_size = 0; - size_t op12_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op12, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op12_workspace_size, &op12_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, 
op12_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - size_t op13_workspace_size = 0; - size_t op13_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op13, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op13_workspace_size, &op13_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op13_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - size_t op14_workspace_size = 0; - size_t op14_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op14, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op14_workspace_size, &op14_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op14_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - size_t op15_workspace_size = 0; - size_t op15_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op15, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op15_workspace_size, &op15_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op15_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - size_t op16_workspace_size = 0; - size_t op16_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op16, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op16_workspace_size, 
&op16_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op16_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - size_t op17_workspace_size = 0; - size_t op17_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op17, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op17_workspace_size, &op17_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op17_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - size_t op18_workspace_size = 0; - size_t op18_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op18, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op18_workspace_size, &op18_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op18_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - size_t op19_workspace_size = 0; - size_t op19_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op19, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op19_workspace_size, &op19_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op19_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - size_t op20_workspace_size = 0; - size_t 
op20_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op20, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op20_workspace_size, &op20_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op20_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - size_t op21_workspace_size = 0; - size_t op21_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op21, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op21_workspace_size, &op21_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op21_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - size_t op22_workspace_size = 0; - size_t op22_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op22, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op22_workspace_size, &op22_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op22_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - size_t op23_workspace_size = 0; - size_t op23_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op23, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op23_workspace_size, &op23_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op23_workspace_size); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - size_t op24_workspace_size = 0; - size_t op24_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op24, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op24_workspace_size, &op24_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op24_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - size_t op25_workspace_size = 0; - size_t op25_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op25, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op25_workspace_size, &op25_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op25_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - size_t op26_workspace_size = 0; - size_t op26_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op26, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op26_workspace_size, &op26_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op26_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - size_t op27_workspace_size = 0; - size_t op27_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qs8( - op27, - /*batch_size=*/1, 49 /* width */, - 1024 /* channels */, 1024 /* input stride */, 1024 /* output stride */, - 
&op27_workspace_size, &op27_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op27_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - size_t op28_workspace_size = 0; - size_t op28_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op28, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op28_workspace_size, &op28_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op28_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nhwc_qs8( - op0, - workspace.data(), /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op1, - workspace.data(), /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op2, - workspace.data(), /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op3, - workspace.data(), /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op4, - workspace.data(), /*input=*/v4.data(), /*output=*/v5.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op5, - workspace.data(), /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op6, - workspace.data(), /*input=*/v6.data(), /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op7, - workspace.data(), /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op8, - workspace.data(), /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op9, - workspace.data(), /*input=*/v9.data(), /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op10, - workspace.data(), /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op11, - workspace.data(), /*input=*/v11.data(), /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op12, - workspace.data(), /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) 
{ - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op13, - workspace.data(), /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op14, - workspace.data(), /*input=*/v14.data(), /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op15, - workspace.data(), /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op16, - workspace.data(), /*input=*/v16.data(), /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op17, - workspace.data(), /*input=*/v17.data(), /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op18, - workspace.data(), /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op19, - workspace.data(), /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op20, - workspace.data(), /*input=*/v20.data(), /*output=*/v21.data()); - if (status != xnn_status_success) 
{ - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op21, - workspace.data(), /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op22, - workspace.data(), /*input=*/v22.data(), /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op23, - workspace.data(), /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op24, - workspace.data(), /*input=*/v24.data(), /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op25, - workspace.data(), /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op26, - workspace.data(), /*input=*/v26.data(), /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qs8( - op27, - workspace.data(), - /*input=*/v27.data(), /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op28, - workspace.data(), /*input=*/v28.data(), /*output=*/v29.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -} // namespace models diff --git a/models/qs8-mobilenet-v2.cc b/models/qs8-mobilenet-v2.cc deleted file mode 100644 index abc4d88b6b4..00000000000 --- a/models/qs8-mobilenet-v2.cc +++ /dev/null @@ -1,3248 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan QS8MobileNetV2(pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - 
alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static std::array v30; - alignas(16) static std::array v31; - alignas(16) static std::array v32; - alignas(16) static std::array v33; - alignas(16) static std::array v34; - alignas(16) static std::array v35; - alignas(16) static std::array v36; - alignas(16) static std::array v37; - alignas(16) static std::array v38; - alignas(16) static std::array v39; - alignas(16) static std::array v40; - alignas(16) static std::array v41; - alignas(16) static std::array v42; - alignas(16) static std::array v43; - alignas(16) static std::array v44; - alignas(16) static std::array v45; - alignas(16) static std::array v46; - alignas(16) static std::array v47; - alignas(16) static std::array v48; - alignas(16) static std::array v49; - alignas(16) static std::array v50; - alignas(16) static std::array v51; - alignas(16) static std::array v52; - alignas(16) static std::array v53; - alignas(16) static std::array v54; - alignas(16) static std::array v55; - alignas(16) static std::array v56; - alignas(16) static std::array v57; - alignas(16) static std::array v58; - alignas(16) static std::array v59; - alignas(16) static std::array v60; - alignas(16) static std::array v61; - alignas(16) static std::array v62; - alignas(16) static std::array v63; - alignas(16) static std::array v64; - alignas(16) static std::array w65; - alignas(16) static std::array w66; - alignas(16) static std::array w67; - alignas(16) static std::array w68; - alignas(16) static std::array w69; - alignas(16) static std::array w70; - alignas(16) static std::array w71; - alignas(16) static std::array w72; - alignas(16) static std::array w73; - alignas(16) static std::array w74; - alignas(16) static std::array w75; - alignas(16) static std::array w76; - alignas(16) static std::array w77; - alignas(16) static std::array w78; - alignas(16) static std::array w79; - alignas(16) static std::array w80; - alignas(16) static std::array w81; - 
alignas(16) static std::array w82; - alignas(16) static std::array w83; - alignas(16) static std::array w84; - alignas(16) static std::array w85; - alignas(16) static std::array w86; - alignas(16) static std::array w87; - alignas(16) static std::array w88; - alignas(16) static std::array w89; - alignas(16) static std::array w90; - alignas(16) static std::array w91; - alignas(16) static std::array w92; - alignas(16) static std::array w93; - alignas(16) static std::array w94; - alignas(16) static std::array w95; - alignas(16) static std::array w96; - alignas(16) static std::array w97; - alignas(16) static std::array w98; - alignas(16) static std::array w99; - alignas(16) static std::array w100; - alignas(16) static std::array w101; - alignas(16) static std::array w102; - alignas(16) static std::array w103; - alignas(16) static std::array w104; - alignas(16) static std::array w105; - alignas(16) static std::array w106; - alignas(16) static std::array w107; - alignas(16) static std::array w108; - alignas(16) static std::array w109; - alignas(16) static std::array w110; - alignas(16) static std::array w111; - alignas(16) static std::array w112; - alignas(16) static std::array w113; - alignas(16) static std::array w114; - alignas(16) static std::array w115; - alignas(16) static std::array w116; - alignas(16) static std::array w117; - alignas(16) static std::array w118; - alignas(16) static std::array w119; - alignas(16) static std::array w120; - alignas(16) static std::array w121; - alignas(16) static std::array w122; - alignas(16) static std::array w123; - alignas(16) static std::array w124; - alignas(16) static std::array w125; - alignas(16) static std::array w126; - alignas(16) static std::array w127; - alignas(16) static std::array w128; - alignas(16) static std::array w129; - alignas(16) static std::array w130; - alignas(16) static std::array w131; - alignas(16) static std::array w132; - alignas(16) static std::array w133; - alignas(16) static std::array w134; - 
alignas(16) static std::array w135; - alignas(16) static std::array w136; - alignas(16) static std::array w137; - alignas(16) static std::array w138; - alignas(16) static std::array w139; - alignas(16) static std::array w140; - alignas(16) static std::array w141; - alignas(16) static std::array w142; - alignas(16) static std::array w143; - alignas(16) static std::array w144; - alignas(16) static std::array w145; - alignas(16) static std::array w146; - alignas(16) static std::array w147; - alignas(16) static std::array w148; - alignas(16) static std::array w149; - alignas(16) static std::array w150; - alignas(16) static std::array w151; - alignas(16) static std::array w152; - alignas(16) static std::array w153; - alignas(16) static std::array w154; - alignas(16) static std::array w155; - alignas(16) static std::array w156; - alignas(16) static std::array w157; - alignas(16) static std::array w158; - alignas(16) static std::array w159; - alignas(16) static std::array w160; - alignas(16) static std::array w161; - alignas(16) static std::array w162; - alignas(16) static std::array w163; - alignas(16) static std::array w164; - alignas(16) static std::array w165; - alignas(16) static std::array w166; - alignas(16) static std::array w167; - alignas(16) static std::array w168; - alignas(16) static std::array w169; - alignas(16) static std::array w170; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto i8rng = std::bind(std::uniform_int_distribution(-127, 127), std::ref(rng)); - auto i32rng = std::bind(std::uniform_int_distribution(-10000, 10000), std::ref(rng)); - std::generate(v0.begin(), v0.end(), std::ref(i8rng)); - std::generate(v1.begin(), v1.end(), std::ref(i8rng)); - std::generate(v2.begin(), v2.end(), std::ref(i8rng)); - std::generate(v3.begin(), v3.end(), std::ref(i8rng)); - std::generate(v4.begin(), v4.end(), std::ref(i8rng)); - std::generate(v5.begin(), v5.end(), std::ref(i8rng)); - std::generate(v6.begin(), v6.end(), 
std::ref(i8rng)); - std::generate(v7.begin(), v7.end(), std::ref(i8rng)); - std::generate(v8.begin(), v8.end(), std::ref(i8rng)); - std::generate(v9.begin(), v9.end(), std::ref(i8rng)); - std::generate(v10.begin(), v10.end(), std::ref(i8rng)); - std::generate(v11.begin(), v11.end(), std::ref(i8rng)); - std::generate(v12.begin(), v12.end(), std::ref(i8rng)); - std::generate(v13.begin(), v13.end(), std::ref(i8rng)); - std::generate(v14.begin(), v14.end(), std::ref(i8rng)); - std::generate(v15.begin(), v15.end(), std::ref(i8rng)); - std::generate(v16.begin(), v16.end(), std::ref(i8rng)); - std::generate(v17.begin(), v17.end(), std::ref(i8rng)); - std::generate(v18.begin(), v18.end(), std::ref(i8rng)); - std::generate(v19.begin(), v19.end(), std::ref(i8rng)); - std::generate(v20.begin(), v20.end(), std::ref(i8rng)); - std::generate(v21.begin(), v21.end(), std::ref(i8rng)); - std::generate(v22.begin(), v22.end(), std::ref(i8rng)); - std::generate(v23.begin(), v23.end(), std::ref(i8rng)); - std::generate(v24.begin(), v24.end(), std::ref(i8rng)); - std::generate(v25.begin(), v25.end(), std::ref(i8rng)); - std::generate(v26.begin(), v26.end(), std::ref(i8rng)); - std::generate(v27.begin(), v27.end(), std::ref(i8rng)); - std::generate(v28.begin(), v28.end(), std::ref(i8rng)); - std::generate(v29.begin(), v29.end(), std::ref(i8rng)); - std::generate(v30.begin(), v30.end(), std::ref(i8rng)); - std::generate(v31.begin(), v31.end(), std::ref(i8rng)); - std::generate(v32.begin(), v32.end(), std::ref(i8rng)); - std::generate(v33.begin(), v33.end(), std::ref(i8rng)); - std::generate(v34.begin(), v34.end(), std::ref(i8rng)); - std::generate(v35.begin(), v35.end(), std::ref(i8rng)); - std::generate(v36.begin(), v36.end(), std::ref(i8rng)); - std::generate(v37.begin(), v37.end(), std::ref(i8rng)); - std::generate(v38.begin(), v38.end(), std::ref(i8rng)); - std::generate(v39.begin(), v39.end(), std::ref(i8rng)); - std::generate(v40.begin(), v40.end(), std::ref(i8rng)); - 
std::generate(v41.begin(), v41.end(), std::ref(i8rng)); - std::generate(v42.begin(), v42.end(), std::ref(i8rng)); - std::generate(v43.begin(), v43.end(), std::ref(i8rng)); - std::generate(v44.begin(), v44.end(), std::ref(i8rng)); - std::generate(v45.begin(), v45.end(), std::ref(i8rng)); - std::generate(v46.begin(), v46.end(), std::ref(i8rng)); - std::generate(v47.begin(), v47.end(), std::ref(i8rng)); - std::generate(v48.begin(), v48.end(), std::ref(i8rng)); - std::generate(v49.begin(), v49.end(), std::ref(i8rng)); - std::generate(v50.begin(), v50.end(), std::ref(i8rng)); - std::generate(v51.begin(), v51.end(), std::ref(i8rng)); - std::generate(v52.begin(), v52.end(), std::ref(i8rng)); - std::generate(v53.begin(), v53.end(), std::ref(i8rng)); - std::generate(v54.begin(), v54.end(), std::ref(i8rng)); - std::generate(v55.begin(), v55.end(), std::ref(i8rng)); - std::generate(v56.begin(), v56.end(), std::ref(i8rng)); - std::generate(v57.begin(), v57.end(), std::ref(i8rng)); - std::generate(v58.begin(), v58.end(), std::ref(i8rng)); - std::generate(v59.begin(), v59.end(), std::ref(i8rng)); - std::generate(v60.begin(), v60.end(), std::ref(i8rng)); - std::generate(v61.begin(), v61.end(), std::ref(i8rng)); - std::generate(v62.begin(), v62.end(), std::ref(i8rng)); - std::generate(v63.begin(), v63.end(), std::ref(i8rng)); - std::generate(v64.begin(), v64.end(), std::ref(i8rng)); - std::generate(w65.begin(), w65.end(), std::ref(i8rng)); - std::generate(w66.begin(), w66.end(), std::ref(i32rng)); - std::generate(w67.begin(), w67.end(), std::ref(i8rng)); - std::generate(w68.begin(), w68.end(), std::ref(i32rng)); - std::generate(w69.begin(), w69.end(), std::ref(i8rng)); - std::generate(w70.begin(), w70.end(), std::ref(i32rng)); - std::generate(w71.begin(), w71.end(), std::ref(i8rng)); - std::generate(w72.begin(), w72.end(), std::ref(i32rng)); - std::generate(w73.begin(), w73.end(), std::ref(i8rng)); - std::generate(w74.begin(), w74.end(), std::ref(i32rng)); - 
std::generate(w75.begin(), w75.end(), std::ref(i8rng)); - std::generate(w76.begin(), w76.end(), std::ref(i32rng)); - std::generate(w77.begin(), w77.end(), std::ref(i8rng)); - std::generate(w78.begin(), w78.end(), std::ref(i32rng)); - std::generate(w79.begin(), w79.end(), std::ref(i8rng)); - std::generate(w80.begin(), w80.end(), std::ref(i32rng)); - std::generate(w81.begin(), w81.end(), std::ref(i8rng)); - std::generate(w82.begin(), w82.end(), std::ref(i32rng)); - std::generate(w83.begin(), w83.end(), std::ref(i8rng)); - std::generate(w84.begin(), w84.end(), std::ref(i32rng)); - std::generate(w85.begin(), w85.end(), std::ref(i8rng)); - std::generate(w86.begin(), w86.end(), std::ref(i32rng)); - std::generate(w87.begin(), w87.end(), std::ref(i8rng)); - std::generate(w88.begin(), w88.end(), std::ref(i32rng)); - std::generate(w89.begin(), w89.end(), std::ref(i8rng)); - std::generate(w90.begin(), w90.end(), std::ref(i32rng)); - std::generate(w91.begin(), w91.end(), std::ref(i8rng)); - std::generate(w92.begin(), w92.end(), std::ref(i32rng)); - std::generate(w93.begin(), w93.end(), std::ref(i8rng)); - std::generate(w94.begin(), w94.end(), std::ref(i32rng)); - std::generate(w95.begin(), w95.end(), std::ref(i8rng)); - std::generate(w96.begin(), w96.end(), std::ref(i32rng)); - std::generate(w97.begin(), w97.end(), std::ref(i8rng)); - std::generate(w98.begin(), w98.end(), std::ref(i32rng)); - std::generate(w99.begin(), w99.end(), std::ref(i8rng)); - std::generate(w100.begin(), w100.end(), std::ref(i32rng)); - std::generate(w101.begin(), w101.end(), std::ref(i8rng)); - std::generate(w102.begin(), w102.end(), std::ref(i32rng)); - std::generate(w103.begin(), w103.end(), std::ref(i8rng)); - std::generate(w104.begin(), w104.end(), std::ref(i32rng)); - std::generate(w105.begin(), w105.end(), std::ref(i8rng)); - std::generate(w106.begin(), w106.end(), std::ref(i32rng)); - std::generate(w107.begin(), w107.end(), std::ref(i8rng)); - std::generate(w108.begin(), w108.end(), 
std::ref(i32rng)); - std::generate(w109.begin(), w109.end(), std::ref(i8rng)); - std::generate(w110.begin(), w110.end(), std::ref(i32rng)); - std::generate(w111.begin(), w111.end(), std::ref(i8rng)); - std::generate(w112.begin(), w112.end(), std::ref(i32rng)); - std::generate(w113.begin(), w113.end(), std::ref(i8rng)); - std::generate(w114.begin(), w114.end(), std::ref(i32rng)); - std::generate(w115.begin(), w115.end(), std::ref(i8rng)); - std::generate(w116.begin(), w116.end(), std::ref(i32rng)); - std::generate(w117.begin(), w117.end(), std::ref(i8rng)); - std::generate(w118.begin(), w118.end(), std::ref(i32rng)); - std::generate(w119.begin(), w119.end(), std::ref(i8rng)); - std::generate(w120.begin(), w120.end(), std::ref(i32rng)); - std::generate(w121.begin(), w121.end(), std::ref(i8rng)); - std::generate(w122.begin(), w122.end(), std::ref(i32rng)); - std::generate(w123.begin(), w123.end(), std::ref(i8rng)); - std::generate(w124.begin(), w124.end(), std::ref(i32rng)); - std::generate(w125.begin(), w125.end(), std::ref(i8rng)); - std::generate(w126.begin(), w126.end(), std::ref(i32rng)); - std::generate(w127.begin(), w127.end(), std::ref(i8rng)); - std::generate(w128.begin(), w128.end(), std::ref(i32rng)); - std::generate(w129.begin(), w129.end(), std::ref(i8rng)); - std::generate(w130.begin(), w130.end(), std::ref(i32rng)); - std::generate(w131.begin(), w131.end(), std::ref(i8rng)); - std::generate(w132.begin(), w132.end(), std::ref(i32rng)); - std::generate(w133.begin(), w133.end(), std::ref(i8rng)); - std::generate(w134.begin(), w134.end(), std::ref(i32rng)); - std::generate(w135.begin(), w135.end(), std::ref(i8rng)); - std::generate(w136.begin(), w136.end(), std::ref(i32rng)); - std::generate(w137.begin(), w137.end(), std::ref(i8rng)); - std::generate(w138.begin(), w138.end(), std::ref(i32rng)); - std::generate(w139.begin(), w139.end(), std::ref(i8rng)); - std::generate(w140.begin(), w140.end(), std::ref(i32rng)); - std::generate(w141.begin(), w141.end(), 
std::ref(i8rng)); - std::generate(w142.begin(), w142.end(), std::ref(i32rng)); - std::generate(w143.begin(), w143.end(), std::ref(i8rng)); - std::generate(w144.begin(), w144.end(), std::ref(i32rng)); - std::generate(w145.begin(), w145.end(), std::ref(i8rng)); - std::generate(w146.begin(), w146.end(), std::ref(i32rng)); - std::generate(w147.begin(), w147.end(), std::ref(i8rng)); - std::generate(w148.begin(), w148.end(), std::ref(i32rng)); - std::generate(w149.begin(), w149.end(), std::ref(i8rng)); - std::generate(w150.begin(), w150.end(), std::ref(i32rng)); - std::generate(w151.begin(), w151.end(), std::ref(i8rng)); - std::generate(w152.begin(), w152.end(), std::ref(i32rng)); - std::generate(w153.begin(), w153.end(), std::ref(i8rng)); - std::generate(w154.begin(), w154.end(), std::ref(i32rng)); - std::generate(w155.begin(), w155.end(), std::ref(i8rng)); - std::generate(w156.begin(), w156.end(), std::ref(i32rng)); - std::generate(w157.begin(), w157.end(), std::ref(i8rng)); - std::generate(w158.begin(), w158.end(), std::ref(i32rng)); - std::generate(w159.begin(), w159.end(), std::ref(i8rng)); - std::generate(w160.begin(), w160.end(), std::ref(i32rng)); - std::generate(w161.begin(), w161.end(), std::ref(i8rng)); - std::generate(w162.begin(), w162.end(), std::ref(i32rng)); - std::generate(w163.begin(), w163.end(), std::ref(i8rng)); - std::generate(w164.begin(), w164.end(), std::ref(i32rng)); - std::generate(w165.begin(), w165.end(), std::ref(i8rng)); - std::generate(w166.begin(), w166.end(), std::ref(i32rng)); - std::generate(w167.begin(), w167.end(), std::ref(i8rng)); - std::generate(w168.begin(), w168.end(), std::ref(i32rng)); - std::generate(w169.begin(), w169.end(), std::ref(i8rng)); - std::generate(w170.begin(), w170.end(), std::ref(i32rng)); - - Operators operators; - xnn_status status; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding 
*/, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 3 /* input channels per group */, - 32 /* output_channels_per_group */, - 3 /* input pixel stride */, - 32 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w65.data(), w66.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 32 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 32 /* input pixel stride */, - 32 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w67.data(), w68.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* 
dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 16 /* output_channels_per_group */, - 32 /* input pixel stride */, - 16 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w69.data(), w70.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 16 /* input channels per group */, - 96 /* output_channels_per_group */, - 16 /* input pixel stride */, - 96 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w71.data(), w72.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 96 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 96 /* input pixel stride */, - 96 /* output pixel 
stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w73.data(), w74.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 24 /* output_channels_per_group */, - 96 /* input pixel stride */, - 24 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w75.data(), w76.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 144 /* output_channels_per_group */, - 24 /* input pixel stride */, - 144 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w77.data(), w78.data(), - -1 /* output zero point */, 0.5f /* output 
scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 144 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 144 /* input pixel stride */, - 144 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w79.data(), w80.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 24 /* output_channels_per_group */, - 144 /* input pixel stride */, - 24 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w81.data(), w82.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << 
"failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_add_nd_qs8( - -1 /* input1 zero point */, 0.5f /* input1 scale */, - -1 /* input2 zero point */, 0.5f /* input2 scale */, - -1 /* output zero point */, 1.0f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 144 /* output_channels_per_group */, - 24 /* input pixel stride */, - 144 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w83.data(), w84.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 144 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 144 /* input pixel stride */, - 144 
/* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w85.data(), w86.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 32 /* output_channels_per_group */, - 144 /* input pixel stride */, - 32 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w87.data(), w88.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 192 /* output_channels_per_group */, - 32 /* input pixel stride */, - 192 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w89.data(), w90.data(), - -1 /* output zero 
point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 192 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 192 /* input pixel stride */, - 192 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w91.data(), w92.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 192 /* input channels per group */, - 32 /* output_channels_per_group */, - 192 /* input pixel stride */, - 32 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w93.data(), w94.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op15); - if (status != 
xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_add_nd_qs8( - -1 /* input1 zero point */, 0.5f /* input1 scale */, - -1 /* input2 zero point */, 0.5f /* input2 scale */, - -1 /* output zero point */, 1.0f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 192 /* output_channels_per_group */, - 32 /* input pixel stride */, - 192 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w95.data(), w96.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 192 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group 
*/, - 192 /* input pixel stride */, - 192 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w97.data(), w98.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 192 /* input channels per group */, - 32 /* output_channels_per_group */, - 192 /* input pixel stride */, - 32 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w99.data(), w100.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_add_nd_qs8( - -1 /* input1 zero point */, 0.5f /* input1 scale */, - -1 /* input2 zero point */, 0.5f /* input2 scale */, - -1 /* output zero point */, 1.0f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( 
- 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 192 /* output_channels_per_group */, - 32 /* input pixel stride */, - 192 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w101.data(), w102.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 192 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 192 /* input pixel stride */, - 192 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w103.data(), w104.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* 
subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 192 /* input channels per group */, - 64 /* output_channels_per_group */, - 192 /* input pixel stride */, - 64 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w105.data(), w106.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 384 /* output_channels_per_group */, - 64 /* input pixel stride */, - 384 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w107.data(), w108.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 384 /* groups */, - 1 /* input channels per 
group */, - 1 /* output_channels_per_group */, - 384 /* input pixel stride */, - 384 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w109.data(), w110.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 384 /* input channels per group */, - 64 /* output_channels_per_group */, - 384 /* input pixel stride */, - 64 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w111.data(), w112.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_add_nd_qs8( - -1 /* input1 zero point */, 0.5f /* input1 scale */, - -1 /* input2 zero point */, 0.5f /* input2 scale */, - -1 /* output zero point */, 1.0f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; 
- status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 384 /* output_channels_per_group */, - 64 /* input pixel stride */, - 384 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w113.data(), w114.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - xnn_operator_t op29 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 384 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 384 /* input pixel stride */, - 384 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w115.data(), w116.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op29); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #29" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op29, xnn_delete_operator); - - xnn_operator_t op30 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* 
kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 384 /* input channels per group */, - 64 /* output_channels_per_group */, - 384 /* input pixel stride */, - 64 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w117.data(), w118.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op30); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #30" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op30, xnn_delete_operator); - - xnn_operator_t op31 = nullptr; - status = xnn_create_add_nd_qs8( - -1 /* input1 zero point */, 0.5f /* input1 scale */, - -1 /* input2 zero point */, 0.5f /* input2 scale */, - -1 /* output zero point */, 1.0f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op31); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #31" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op31, xnn_delete_operator); - - xnn_operator_t op32 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 384 /* output_channels_per_group */, - 64 /* input pixel stride */, - 384 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w119.data(), w120.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op32); - if (status != 
xnn_status_success) { - std::cerr << "failed to create operation #32" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op32, xnn_delete_operator); - - xnn_operator_t op33 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 384 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 384 /* input pixel stride */, - 384 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w121.data(), w122.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op33); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #33" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op33, xnn_delete_operator); - - xnn_operator_t op34 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 384 /* input channels per group */, - 64 /* output_channels_per_group */, - 384 /* input pixel stride */, - 64 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w123.data(), w124.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op34); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #34" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op34, 
xnn_delete_operator); - - xnn_operator_t op35 = nullptr; - status = xnn_create_add_nd_qs8( - -1 /* input1 zero point */, 0.5f /* input1 scale */, - -1 /* input2 zero point */, 0.5f /* input2 scale */, - -1 /* output zero point */, 1.0f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op35); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #35" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op35, xnn_delete_operator); - - xnn_operator_t op36 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 384 /* output_channels_per_group */, - 64 /* input pixel stride */, - 384 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w125.data(), w126.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op36); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #36" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op36, xnn_delete_operator); - - xnn_operator_t op37 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 384 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 384 /* input pixel stride */, - 384 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale 
*/, - w127.data(), w128.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op37); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #37" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op37, xnn_delete_operator); - - xnn_operator_t op38 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 384 /* input channels per group */, - 96 /* output_channels_per_group */, - 384 /* input pixel stride */, - 96 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w129.data(), w130.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op38); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #38" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op38, xnn_delete_operator); - - xnn_operator_t op39 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w131.data(), w132.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags 
*/, - nullptr, - nullptr, - &op39); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #39" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op39, xnn_delete_operator); - - xnn_operator_t op40 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w133.data(), w134.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op40); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #40" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op40, xnn_delete_operator); - - xnn_operator_t op41 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 96 /* output_channels_per_group */, - 576 /* input pixel stride */, - 96 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w135.data(), w136.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op41); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #41" << std::endl; - return 
ExecutionPlan(); - } - operators.emplace_back(op41, xnn_delete_operator); - - xnn_operator_t op42 = nullptr; - status = xnn_create_add_nd_qs8( - -1 /* input1 zero point */, 0.5f /* input1 scale */, - -1 /* input2 zero point */, 0.5f /* input2 scale */, - -1 /* output zero point */, 1.0f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op42); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #42" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op42, xnn_delete_operator); - - xnn_operator_t op43 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w137.data(), w138.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op43); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #43" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op43, xnn_delete_operator); - - xnn_operator_t op44 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - -1 /* input zero 
point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w139.data(), w140.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op44); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #44" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op44, xnn_delete_operator); - - xnn_operator_t op45 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 96 /* output_channels_per_group */, - 576 /* input pixel stride */, - 96 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w141.data(), w142.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op45); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #45" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op45, xnn_delete_operator); - - xnn_operator_t op46 = nullptr; - status = xnn_create_add_nd_qs8( - -1 /* input1 zero point */, 0.5f /* input1 scale */, - -1 /* input2 zero point */, 0.5f /* input2 scale */, - -1 /* output zero point */, 1.0f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op46); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #46" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op46, xnn_delete_operator); - - xnn_operator_t op47 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left 
padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w143.data(), w144.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op47); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #47" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op47, xnn_delete_operator); - - xnn_operator_t op48 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w145.data(), w146.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op48); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #48" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op48, xnn_delete_operator); - - xnn_operator_t op49 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* 
dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 160 /* output_channels_per_group */, - 576 /* input pixel stride */, - 160 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w147.data(), w148.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op49); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #49" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op49, xnn_delete_operator); - - xnn_operator_t op50 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w149.data(), w150.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op50); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #50" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op50, xnn_delete_operator); - - xnn_operator_t op51 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, 
- 960 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w151.data(), w152.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op51); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #51" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op51, xnn_delete_operator); - - xnn_operator_t op52 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 160 /* output_channels_per_group */, - 960 /* input pixel stride */, - 160 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w153.data(), w154.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op52); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #52" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op52, xnn_delete_operator); - - xnn_operator_t op53 = nullptr; - status = xnn_create_add_nd_qs8( - -1 /* input1 zero point */, 0.5f /* input1 scale */, - -1 /* input2 zero point */, 0.5f /* input2 scale */, - -1 /* output zero point */, 1.0f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op53); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #53" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op53, xnn_delete_operator); - - xnn_operator_t op54 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* 
right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w155.data(), w156.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op54); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #54" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op54, xnn_delete_operator); - - xnn_operator_t op55 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, - 960 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w157.data(), w158.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op55); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #55" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op55, xnn_delete_operator); - - xnn_operator_t op56 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 
/* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 160 /* output_channels_per_group */, - 960 /* input pixel stride */, - 160 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w159.data(), w160.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op56); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #56" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op56, xnn_delete_operator); - - xnn_operator_t op57 = nullptr; - status = xnn_create_add_nd_qs8( - -1 /* input1 zero point */, 0.5f /* input1 scale */, - -1 /* input2 zero point */, 0.5f /* input2 scale */, - -1 /* output zero point */, 1.0f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op57); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #57" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op57, xnn_delete_operator); - - xnn_operator_t op58 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w161.data(), w162.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op58); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #58" << 
std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op58, xnn_delete_operator); - - xnn_operator_t op59 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, - 960 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w163.data(), w164.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op59); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #59" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op59, xnn_delete_operator); - - xnn_operator_t op60 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 320 /* output_channels_per_group */, - 960 /* input pixel stride */, - 320 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w165.data(), w166.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op60); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #60" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op60, xnn_delete_operator); - - xnn_operator_t op61 = nullptr; - status = 
xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 320 /* input channels per group */, - 1280 /* output_channels_per_group */, - 320 /* input pixel stride */, - 1280 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w167.data(), w168.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op61); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #61" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op61, xnn_delete_operator); - - xnn_operator_t op62 = nullptr; - status = xnn_create_global_average_pooling_nwc_qs8( - -1 /* input zero point */, 0.5f /* input scale */, - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op62); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #62" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op62, xnn_delete_operator); - - xnn_operator_t op63 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 1280 /* input channels per group */, - 1001 /* output_channels_per_group */, - 1280 /* input pixel stride */, - 1001 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */, - w169.data(), w170.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* 
output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op63); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #63" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op63, xnn_delete_operator); - - size_t op0_workspace_size = 0; - size_t op0_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - &op0_workspace_size, &op0_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op0_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - size_t op1_workspace_size = 0; - size_t op1_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op1, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op1_workspace_size, &op1_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op1_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - size_t op2_workspace_size = 0; - size_t op2_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op2_workspace_size, &op2_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op2_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #2" << std::endl; - return ExecutionPlan(); - } - - size_t op3_workspace_size = 0; - size_t op3_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - 
op3, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op3_workspace_size, &op3_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op3_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - size_t op4_workspace_size = 0; - size_t op4_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op4, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op4_workspace_size, &op4_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op4_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - size_t op5_workspace_size = 0; - size_t op5_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op5, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op5_workspace_size, &op5_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op5_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - size_t op6_workspace_size = 0; - size_t op6_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op6, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op6_workspace_size, &op6_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op6_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return 
ExecutionPlan(); - } - - size_t op7_workspace_size = 0; - size_t op7_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op7_workspace_size, &op7_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op7_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - size_t op8_workspace_size = 0; - size_t op8_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op8, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op8_workspace_size, &op8_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op8_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 56, 56, 24 }; - const size_t b_shape[] = { 1, 56, 56, 24 }; - status = xnn_reshape_add_nd_qs8( - op9, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - size_t op10_workspace_size = 0; - size_t op10_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op10, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op10_workspace_size, &op10_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op10_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - size_t op11_workspace_size = 0; - 
size_t op11_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op11, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op11_workspace_size, &op11_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op11_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - size_t op12_workspace_size = 0; - size_t op12_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op12, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op12_workspace_size, &op12_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op12_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - size_t op13_workspace_size = 0; - size_t op13_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op13, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op13_workspace_size, &op13_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op13_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - size_t op14_workspace_size = 0; - size_t op14_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op14, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op14_workspace_size, &op14_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op14_workspace_size); - if 
(status != xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - size_t op15_workspace_size = 0; - size_t op15_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op15, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op15_workspace_size, &op15_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op15_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 32 }; - const size_t b_shape[] = { 1, 28, 28, 32 }; - status = xnn_reshape_add_nd_qs8( - op16, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - size_t op17_workspace_size = 0; - size_t op17_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op17, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op17_workspace_size, &op17_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op17_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - size_t op18_workspace_size = 0; - size_t op18_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op18, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op18_workspace_size, &op18_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op18_workspace_size); - if (status != xnn_status_success) { - std::cerr << 
"failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - size_t op19_workspace_size = 0; - size_t op19_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op19, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op19_workspace_size, &op19_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op19_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 32 }; - const size_t b_shape[] = { 1, 28, 28, 32 }; - status = xnn_reshape_add_nd_qs8( - op20, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - size_t op21_workspace_size = 0; - size_t op21_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op21, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op21_workspace_size, &op21_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op21_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - size_t op22_workspace_size = 0; - size_t op22_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op22, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op22_workspace_size, &op22_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op22_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; 
- return ExecutionPlan(); - } - - size_t op23_workspace_size = 0; - size_t op23_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op23, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op23_workspace_size, &op23_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op23_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - size_t op24_workspace_size = 0; - size_t op24_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op24, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op24_workspace_size, &op24_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op24_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - size_t op25_workspace_size = 0; - size_t op25_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op25, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op25_workspace_size, &op25_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op25_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - size_t op26_workspace_size = 0; - size_t op26_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op26, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op26_workspace_size, &op26_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - 
max_workspace_size = std::max(max_workspace_size, op26_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 64 }; - const size_t b_shape[] = { 1, 14, 14, 64 }; - status = xnn_reshape_add_nd_qs8( - op27, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - size_t op28_workspace_size = 0; - size_t op28_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op28, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op28_workspace_size, &op28_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op28_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - size_t op29_workspace_size = 0; - size_t op29_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op29, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op29_workspace_size, &op29_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op29_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #29" << std::endl; - return ExecutionPlan(); - } - - size_t op30_workspace_size = 0; - size_t op30_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op30, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op30_workspace_size, &op30_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = 
std::max(max_workspace_size, op30_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #30" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 64 }; - const size_t b_shape[] = { 1, 14, 14, 64 }; - status = xnn_reshape_add_nd_qs8( - op31, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #31" << std::endl; - return ExecutionPlan(); - } - - size_t op32_workspace_size = 0; - size_t op32_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op32, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op32_workspace_size, &op32_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op32_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #32" << std::endl; - return ExecutionPlan(); - } - - size_t op33_workspace_size = 0; - size_t op33_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op33, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op33_workspace_size, &op33_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op33_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #33" << std::endl; - return ExecutionPlan(); - } - - size_t op34_workspace_size = 0; - size_t op34_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op34, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op34_workspace_size, &op34_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, 
op34_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #34" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 64 }; - const size_t b_shape[] = { 1, 14, 14, 64 }; - status = xnn_reshape_add_nd_qs8( - op35, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #35" << std::endl; - return ExecutionPlan(); - } - - size_t op36_workspace_size = 0; - size_t op36_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op36, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op36_workspace_size, &op36_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op36_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #36" << std::endl; - return ExecutionPlan(); - } - - size_t op37_workspace_size = 0; - size_t op37_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op37, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op37_workspace_size, &op37_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op37_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #37" << std::endl; - return ExecutionPlan(); - } - - size_t op38_workspace_size = 0; - size_t op38_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op38, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op38_workspace_size, &op38_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op38_workspace_size); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #38" << std::endl; - return ExecutionPlan(); - } - - size_t op39_workspace_size = 0; - size_t op39_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op39, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op39_workspace_size, &op39_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op39_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #39" << std::endl; - return ExecutionPlan(); - } - - size_t op40_workspace_size = 0; - size_t op40_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op40, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op40_workspace_size, &op40_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op40_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #40" << std::endl; - return ExecutionPlan(); - } - - size_t op41_workspace_size = 0; - size_t op41_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op41, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op41_workspace_size, &op41_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op41_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #41" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 96 }; - const size_t b_shape[] = { 1, 14, 14, 96 }; - status = xnn_reshape_add_nd_qs8( - op42, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to 
reshape operation #42" << std::endl; - return ExecutionPlan(); - } - - size_t op43_workspace_size = 0; - size_t op43_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op43, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op43_workspace_size, &op43_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op43_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #43" << std::endl; - return ExecutionPlan(); - } - - size_t op44_workspace_size = 0; - size_t op44_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op44, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op44_workspace_size, &op44_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op44_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #44" << std::endl; - return ExecutionPlan(); - } - - size_t op45_workspace_size = 0; - size_t op45_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op45, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op45_workspace_size, &op45_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op45_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #45" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 96 }; - const size_t b_shape[] = { 1, 14, 14, 96 }; - status = xnn_reshape_add_nd_qs8( - op46, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #46" << std::endl; - return 
ExecutionPlan(); - } - - size_t op47_workspace_size = 0; - size_t op47_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op47, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op47_workspace_size, &op47_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op47_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #47" << std::endl; - return ExecutionPlan(); - } - - size_t op48_workspace_size = 0; - size_t op48_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op48, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op48_workspace_size, &op48_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op48_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #48" << std::endl; - return ExecutionPlan(); - } - - size_t op49_workspace_size = 0; - size_t op49_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op49, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op49_workspace_size, &op49_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op49_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #49" << std::endl; - return ExecutionPlan(); - } - - size_t op50_workspace_size = 0; - size_t op50_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op50, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op50_workspace_size, &op50_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = 
std::max(max_workspace_size, op50_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #50" << std::endl; - return ExecutionPlan(); - } - - size_t op51_workspace_size = 0; - size_t op51_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op51, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op51_workspace_size, &op51_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op51_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #51" << std::endl; - return ExecutionPlan(); - } - - size_t op52_workspace_size = 0; - size_t op52_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op52, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op52_workspace_size, &op52_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op52_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #52" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 160 }; - const size_t b_shape[] = { 1, 7, 7, 160 }; - status = xnn_reshape_add_nd_qs8( - op53, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #53" << std::endl; - return ExecutionPlan(); - } - - size_t op54_workspace_size = 0; - size_t op54_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op54, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op54_workspace_size, &op54_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op54_workspace_size); - if 
(status != xnn_status_success) { - std::cerr << "failed to reshape operation #54" << std::endl; - return ExecutionPlan(); - } - - size_t op55_workspace_size = 0; - size_t op55_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op55, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op55_workspace_size, &op55_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op55_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #55" << std::endl; - return ExecutionPlan(); - } - - size_t op56_workspace_size = 0; - size_t op56_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op56, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op56_workspace_size, &op56_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op56_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #56" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 160 }; - const size_t b_shape[] = { 1, 7, 7, 160 }; - status = xnn_reshape_add_nd_qs8( - op57, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #57" << std::endl; - return ExecutionPlan(); - } - - size_t op58_workspace_size = 0; - size_t op58_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op58, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op58_workspace_size, &op58_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op58_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed 
to reshape operation #58" << std::endl; - return ExecutionPlan(); - } - - size_t op59_workspace_size = 0; - size_t op59_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op59, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op59_workspace_size, &op59_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op59_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #59" << std::endl; - return ExecutionPlan(); - } - - size_t op60_workspace_size = 0; - size_t op60_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op60, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op60_workspace_size, &op60_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op60_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #60" << std::endl; - return ExecutionPlan(); - } - - size_t op61_workspace_size = 0; - size_t op61_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op61, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op61_workspace_size, &op61_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op61_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #61" << std::endl; - return ExecutionPlan(); - } - - size_t op62_workspace_size = 0; - size_t op62_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qs8( - op62, - /*batch_size=*/1, 49 /* width */, - 1280 /* channels */, 1280 /* input stride */, 1280 /* output stride */, - &op62_workspace_size, &op62_workspace_alignment, - 
/*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op62_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #62" << std::endl; - return ExecutionPlan(); - } - - size_t op63_workspace_size = 0; - size_t op63_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - op63, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op63_workspace_size, &op63_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op63_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #63" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nhwc_qs8( - op0, - workspace.data(), /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op1, - workspace.data(), /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op2, - workspace.data(), /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op3, - workspace.data(), /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op4, - workspace.data(), /*input=*/v4.data(), /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation 
#4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op5, - workspace.data(), /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op6, - workspace.data(), /*input=*/v6.data(), /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op7, - workspace.data(), /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op8, - workspace.data(), /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qs8( - op9, - v9.data() /* a */, v6.data() /* b */, /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op10, - workspace.data(), /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op11, - workspace.data(), /*input=*/v11.data(), /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op12, - workspace.data(), /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return 
ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op13, - workspace.data(), /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op14, - workspace.data(), /*input=*/v14.data(), /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op15, - workspace.data(), /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qs8( - op16, - v16.data() /* a */, v13.data() /* b */, /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op17, - workspace.data(), /*input=*/v17.data(), /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op18, - workspace.data(), /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op19, - workspace.data(), /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qs8( - op20, - v20.data() /* a */, v17.data() /* b */, /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - 
- status = xnn_setup_convolution2d_nhwc_qs8( - op21, - workspace.data(), /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op22, - workspace.data(), /*input=*/v22.data(), /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op23, - workspace.data(), /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op24, - workspace.data(), /*input=*/v24.data(), /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op25, - workspace.data(), /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op26, - workspace.data(), /*input=*/v26.data(), /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qs8( - op27, - v27.data() /* a */, v24.data() /* b */, /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op28, - workspace.data(), /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nhwc_qs8( - op29, - workspace.data(), /*input=*/v29.data(), /*output=*/v30.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op30, - workspace.data(), /*input=*/v30.data(), /*output=*/v31.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #30" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qs8( - op31, - v31.data() /* a */, v28.data() /* b */, /*output=*/v32.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op32, - workspace.data(), /*input=*/v32.data(), /*output=*/v33.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #32" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op33, - workspace.data(), /*input=*/v33.data(), /*output=*/v34.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #33" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op34, - workspace.data(), /*input=*/v34.data(), /*output=*/v35.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qs8( - op35, - v35.data() /* a */, v32.data() /* b */, /*output=*/v36.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op36, - workspace.data(), /*input=*/v36.data(), /*output=*/v37.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #36" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nhwc_qs8( - op37, - workspace.data(), /*input=*/v37.data(), /*output=*/v38.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op38, - workspace.data(), /*input=*/v38.data(), /*output=*/v39.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op39, - workspace.data(), /*input=*/v39.data(), /*output=*/v40.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op40, - workspace.data(), /*input=*/v40.data(), /*output=*/v41.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op41, - workspace.data(), /*input=*/v41.data(), /*output=*/v42.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #41" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qs8( - op42, - v42.data() /* a */, v39.data() /* b */, /*output=*/v43.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op43, - workspace.data(), /*input=*/v43.data(), /*output=*/v44.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op44, - workspace.data(), /*input=*/v44.data(), /*output=*/v45.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #44" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nhwc_qs8( - op45, - workspace.data(), /*input=*/v45.data(), /*output=*/v46.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #45" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qs8( - op46, - v46.data() /* a */, v43.data() /* b */, /*output=*/v47.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op47, - workspace.data(), /*input=*/v47.data(), /*output=*/v48.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op48, - workspace.data(), /*input=*/v48.data(), /*output=*/v49.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op49, - workspace.data(), /*input=*/v49.data(), /*output=*/v50.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #49" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op50, - workspace.data(), /*input=*/v50.data(), /*output=*/v51.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #50" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op51, - workspace.data(), /*input=*/v51.data(), /*output=*/v52.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op52, - workspace.data(), /*input=*/v52.data(), /*output=*/v53.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #52" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_add_nd_qs8( - op53, - v53.data() /* a */, v50.data() /* b */, /*output=*/v54.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op54, - workspace.data(), /*input=*/v54.data(), /*output=*/v55.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op55, - workspace.data(), /*input=*/v55.data(), /*output=*/v56.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op56, - workspace.data(), /*input=*/v56.data(), /*output=*/v57.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #56" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qs8( - op57, - v57.data() /* a */, v54.data() /* b */, /*output=*/v58.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op58, - workspace.data(), /*input=*/v58.data(), /*output=*/v59.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op59, - workspace.data(), /*input=*/v59.data(), /*output=*/v60.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #59" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op60, - workspace.data(), /*input=*/v60.data(), /*output=*/v61.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #60" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nhwc_qs8( - op61, - workspace.data(), /*input=*/v61.data(), /*output=*/v62.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #61" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qs8( - op62, - workspace.data(), - /*input=*/v62.data(), /*output=*/v63.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #62" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8( - op63, - workspace.data(), /*input=*/v63.data(), /*output=*/v64.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #63" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -} // namespace models diff --git a/models/qs8-qc8w-mobilenet-v1.cc b/models/qs8-qc8w-mobilenet-v1.cc deleted file mode 100644 index 3d1c6f44bb1..00000000000 --- a/models/qs8-qc8w-mobilenet-v1.cc +++ /dev/null @@ -1,1621 +0,0 @@ -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan QC8MobileNetV1(pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static std::array w30; - alignas(16) static std::array s30; - alignas(16) static std::array w31; - alignas(16) static std::array w32; - alignas(16) static std::array s32; - alignas(16) static std::array w33; - alignas(16) static std::array w34; - alignas(16) static std::array s34; - alignas(16) static std::array w35; - alignas(16) static std::array w36; - alignas(16) static std::array s36; - alignas(16) static std::array w37; - alignas(16) static std::array w38; - alignas(16) static std::array s38; - alignas(16) static std::array w39; - alignas(16) static std::array w40; - alignas(16) static std::array s40; - alignas(16) 
static std::array w41; - alignas(16) static std::array w42; - alignas(16) static std::array s42; - alignas(16) static std::array w43; - alignas(16) static std::array w44; - alignas(16) static std::array s44; - alignas(16) static std::array w45; - alignas(16) static std::array w46; - alignas(16) static std::array s46; - alignas(16) static std::array w47; - alignas(16) static std::array w48; - alignas(16) static std::array s48; - alignas(16) static std::array w49; - alignas(16) static std::array w50; - alignas(16) static std::array s50; - alignas(16) static std::array w51; - alignas(16) static std::array w52; - alignas(16) static std::array s52; - alignas(16) static std::array w53; - alignas(16) static std::array w54; - alignas(16) static std::array s54; - alignas(16) static std::array w55; - alignas(16) static std::array w56; - alignas(16) static std::array s56; - alignas(16) static std::array w57; - alignas(16) static std::array w58; - alignas(16) static std::array s58; - alignas(16) static std::array w59; - alignas(16) static std::array w60; - alignas(16) static std::array s60; - alignas(16) static std::array w61; - alignas(16) static std::array w62; - alignas(16) static std::array s62; - alignas(16) static std::array w63; - alignas(16) static std::array w64; - alignas(16) static std::array s64; - alignas(16) static std::array w65; - alignas(16) static std::array w66; - alignas(16) static std::array s66; - alignas(16) static std::array w67; - alignas(16) static std::array w68; - alignas(16) static std::array s68; - alignas(16) static std::array w69; - alignas(16) static std::array w70; - alignas(16) static std::array s70; - alignas(16) static std::array w71; - alignas(16) static std::array w72; - alignas(16) static std::array s72; - alignas(16) static std::array w73; - alignas(16) static std::array w74; - alignas(16) static std::array s74; - alignas(16) static std::array w75; - alignas(16) static std::array w76; - alignas(16) static std::array s76; - alignas(16) 
static std::array w77; - alignas(16) static std::array w78; - alignas(16) static std::array s78; - alignas(16) static std::array w79; - alignas(16) static std::array w80; - alignas(16) static std::array s80; - alignas(16) static std::array w81; - alignas(16) static std::array w82; - alignas(16) static std::array s82; - alignas(16) static std::array w83; - alignas(16) static std::array w84; - alignas(16) static std::array s84; - alignas(16) static std::array w85; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto i8rng = std::bind(std::uniform_int_distribution(-127, 127), std::ref(rng)); - auto i32rng = std::bind(std::uniform_int_distribution(-10000, 10000), std::ref(rng)); - auto srng = std::bind(std::uniform_real_distribution(0.5f, 0.75f), std::ref(rng)); - std::generate(v0.begin(), v0.end(), std::ref(i8rng)); - std::generate(v1.begin(), v1.end(), std::ref(i8rng)); - std::generate(v2.begin(), v2.end(), std::ref(i8rng)); - std::generate(v3.begin(), v3.end(), std::ref(i8rng)); - std::generate(v4.begin(), v4.end(), std::ref(i8rng)); - std::generate(v5.begin(), v5.end(), std::ref(i8rng)); - std::generate(v6.begin(), v6.end(), std::ref(i8rng)); - std::generate(v7.begin(), v7.end(), std::ref(i8rng)); - std::generate(v8.begin(), v8.end(), std::ref(i8rng)); - std::generate(v9.begin(), v9.end(), std::ref(i8rng)); - std::generate(v10.begin(), v10.end(), std::ref(i8rng)); - std::generate(v11.begin(), v11.end(), std::ref(i8rng)); - std::generate(v12.begin(), v12.end(), std::ref(i8rng)); - std::generate(v13.begin(), v13.end(), std::ref(i8rng)); - std::generate(v14.begin(), v14.end(), std::ref(i8rng)); - std::generate(v15.begin(), v15.end(), std::ref(i8rng)); - std::generate(v16.begin(), v16.end(), std::ref(i8rng)); - std::generate(v17.begin(), v17.end(), std::ref(i8rng)); - std::generate(v18.begin(), v18.end(), std::ref(i8rng)); - std::generate(v19.begin(), v19.end(), std::ref(i8rng)); - std::generate(v20.begin(), v20.end(), 
std::ref(i8rng)); - std::generate(v21.begin(), v21.end(), std::ref(i8rng)); - std::generate(v22.begin(), v22.end(), std::ref(i8rng)); - std::generate(v23.begin(), v23.end(), std::ref(i8rng)); - std::generate(v24.begin(), v24.end(), std::ref(i8rng)); - std::generate(v25.begin(), v25.end(), std::ref(i8rng)); - std::generate(v26.begin(), v26.end(), std::ref(i8rng)); - std::generate(v27.begin(), v27.end(), std::ref(i8rng)); - std::generate(v28.begin(), v28.end(), std::ref(i8rng)); - std::generate(v29.begin(), v29.end(), std::ref(i8rng)); - std::generate(w30.begin(), w30.end(), std::ref(i8rng)); - std::generate(s30.begin(), s30.end(), std::ref(srng)); - std::generate(w31.begin(), w31.end(), std::ref(i32rng)); - std::generate(w32.begin(), w32.end(), std::ref(i8rng)); - std::generate(s32.begin(), s32.end(), std::ref(srng)); - std::generate(w33.begin(), w33.end(), std::ref(i32rng)); - std::generate(w34.begin(), w34.end(), std::ref(i8rng)); - std::generate(s34.begin(), s34.end(), std::ref(srng)); - std::generate(w35.begin(), w35.end(), std::ref(i32rng)); - std::generate(w36.begin(), w36.end(), std::ref(i8rng)); - std::generate(s36.begin(), s36.end(), std::ref(srng)); - std::generate(w37.begin(), w37.end(), std::ref(i32rng)); - std::generate(w38.begin(), w38.end(), std::ref(i8rng)); - std::generate(s38.begin(), s38.end(), std::ref(srng)); - std::generate(w39.begin(), w39.end(), std::ref(i32rng)); - std::generate(w40.begin(), w40.end(), std::ref(i8rng)); - std::generate(s40.begin(), s40.end(), std::ref(srng)); - std::generate(w41.begin(), w41.end(), std::ref(i32rng)); - std::generate(w42.begin(), w42.end(), std::ref(i8rng)); - std::generate(s42.begin(), s42.end(), std::ref(srng)); - std::generate(w43.begin(), w43.end(), std::ref(i32rng)); - std::generate(w44.begin(), w44.end(), std::ref(i8rng)); - std::generate(s44.begin(), s44.end(), std::ref(srng)); - std::generate(w45.begin(), w45.end(), std::ref(i32rng)); - std::generate(w46.begin(), w46.end(), std::ref(i8rng)); - 
std::generate(s46.begin(), s46.end(), std::ref(srng)); - std::generate(w47.begin(), w47.end(), std::ref(i32rng)); - std::generate(w48.begin(), w48.end(), std::ref(i8rng)); - std::generate(s48.begin(), s48.end(), std::ref(srng)); - std::generate(w49.begin(), w49.end(), std::ref(i32rng)); - std::generate(w50.begin(), w50.end(), std::ref(i8rng)); - std::generate(s50.begin(), s50.end(), std::ref(srng)); - std::generate(w51.begin(), w51.end(), std::ref(i32rng)); - std::generate(w52.begin(), w52.end(), std::ref(i8rng)); - std::generate(s52.begin(), s52.end(), std::ref(srng)); - std::generate(w53.begin(), w53.end(), std::ref(i32rng)); - std::generate(w54.begin(), w54.end(), std::ref(i8rng)); - std::generate(s54.begin(), s54.end(), std::ref(srng)); - std::generate(w55.begin(), w55.end(), std::ref(i32rng)); - std::generate(w56.begin(), w56.end(), std::ref(i8rng)); - std::generate(s56.begin(), s56.end(), std::ref(srng)); - std::generate(w57.begin(), w57.end(), std::ref(i32rng)); - std::generate(w58.begin(), w58.end(), std::ref(i8rng)); - std::generate(s58.begin(), s58.end(), std::ref(srng)); - std::generate(w59.begin(), w59.end(), std::ref(i32rng)); - std::generate(w60.begin(), w60.end(), std::ref(i8rng)); - std::generate(s60.begin(), s60.end(), std::ref(srng)); - std::generate(w61.begin(), w61.end(), std::ref(i32rng)); - std::generate(w62.begin(), w62.end(), std::ref(i8rng)); - std::generate(s62.begin(), s62.end(), std::ref(srng)); - std::generate(w63.begin(), w63.end(), std::ref(i32rng)); - std::generate(w64.begin(), w64.end(), std::ref(i8rng)); - std::generate(s64.begin(), s64.end(), std::ref(srng)); - std::generate(w65.begin(), w65.end(), std::ref(i32rng)); - std::generate(w66.begin(), w66.end(), std::ref(i8rng)); - std::generate(s66.begin(), s66.end(), std::ref(srng)); - std::generate(w67.begin(), w67.end(), std::ref(i32rng)); - std::generate(w68.begin(), w68.end(), std::ref(i8rng)); - std::generate(s68.begin(), s68.end(), std::ref(srng)); - std::generate(w69.begin(), 
w69.end(), std::ref(i32rng)); - std::generate(w70.begin(), w70.end(), std::ref(i8rng)); - std::generate(s70.begin(), s70.end(), std::ref(srng)); - std::generate(w71.begin(), w71.end(), std::ref(i32rng)); - std::generate(w72.begin(), w72.end(), std::ref(i8rng)); - std::generate(s72.begin(), s72.end(), std::ref(srng)); - std::generate(w73.begin(), w73.end(), std::ref(i32rng)); - std::generate(w74.begin(), w74.end(), std::ref(i8rng)); - std::generate(s74.begin(), s74.end(), std::ref(srng)); - std::generate(w75.begin(), w75.end(), std::ref(i32rng)); - std::generate(w76.begin(), w76.end(), std::ref(i8rng)); - std::generate(s76.begin(), s76.end(), std::ref(srng)); - std::generate(w77.begin(), w77.end(), std::ref(i32rng)); - std::generate(w78.begin(), w78.end(), std::ref(i8rng)); - std::generate(s78.begin(), s78.end(), std::ref(srng)); - std::generate(w79.begin(), w79.end(), std::ref(i32rng)); - std::generate(w80.begin(), w80.end(), std::ref(i8rng)); - std::generate(s80.begin(), s80.end(), std::ref(srng)); - std::generate(w81.begin(), w81.end(), std::ref(i32rng)); - std::generate(w82.begin(), w82.end(), std::ref(i8rng)); - std::generate(s82.begin(), s82.end(), std::ref(srng)); - std::generate(w83.begin(), w83.end(), std::ref(i32rng)); - std::generate(w84.begin(), w84.end(), std::ref(i8rng)); - std::generate(s84.begin(), s84.end(), std::ref(srng)); - std::generate(w85.begin(), w85.end(), std::ref(i32rng)); - - Operators operators; - xnn_status status; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 3 /* input channels per group */, - 32 /* output_channels_per_group */, - 3 /* input pixel stride */, - 32 /* output pixel stride */, - -1 /* 
input zero point */, 0.5f /* input scale */, - s30.data(), w30.data(), w31.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 32 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 32 /* input pixel stride */, - 32 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s32.data(), w32.data(), w33.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 64 /* output_channels_per_group */, - 32 /* input pixel stride */, - 64 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s34.data(), w34.data(), w35.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max 
*/, - 0 /* flags */, - nullptr, - nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 64 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 64 /* input pixel stride */, - 64 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s36.data(), w36.data(), w37.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 128 /* output_channels_per_group */, - 64 /* input pixel stride */, - 128 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s38.data(), w38.data(), w39.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - 
} - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 128 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 128 /* input pixel stride */, - 128 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s40.data(), w40.data(), w41.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 128 /* input channels per group */, - 128 /* output_channels_per_group */, - 128 /* input pixel stride */, - 128 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s42.data(), w42.data(), w43.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 1 /* right 
padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 128 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 128 /* input pixel stride */, - 128 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s44.data(), w44.data(), w45.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 128 /* input channels per group */, - 256 /* output_channels_per_group */, - 128 /* input pixel stride */, - 256 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s46.data(), w46.data(), w47.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* 
dilation_height */, 1 /* dilation_width */, - 256 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 256 /* input pixel stride */, - 256 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s48.data(), w48.data(), w49.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 256 /* input channels per group */, - 256 /* output_channels_per_group */, - 256 /* input pixel stride */, - 256 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s50.data(), w50.data(), w51.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 256 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 256 /* input pixel stride */, 
- 256 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s52.data(), w52.data(), w53.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 256 /* input channels per group */, - 512 /* output_channels_per_group */, - 256 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s54.data(), w54.data(), w55.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s56.data(), w56.data(), w57.data(), - -1 /* output zero point */, 0.5f /* 
output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s58.data(), w58.data(), w59.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s60.data(), w60.data(), w61.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << 
"failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s62.data(), w62.data(), w63.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s64.data(), w64.data(), w65.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; 
- status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s66.data(), w66.data(), w67.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s68.data(), w68.data(), w69.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height 
*/, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s70.data(), w70.data(), w71.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s72.data(), w72.data(), w73.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* 
input channels per group */, - 512 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s74.data(), w74.data(), w75.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 512 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 512 /* input pixel stride */, - 512 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s76.data(), w76.data(), w77.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 512 /* input channels per group */, - 1024 /* output_channels_per_group */, - 512 /* input pixel stride */, - 1024 /* output pixel stride */, - -1 /* input zero point */, 
0.5f /* input scale */, - s78.data(), w78.data(), w79.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1024 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 1024 /* input pixel stride */, - 1024 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s80.data(), w80.data(), w81.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 1024 /* input channels per group */, - 1024 /* output_channels_per_group */, - 1024 /* input pixel stride */, - 1024 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s82.data(), w82.data(), w83.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max 
*/, - 0 /* flags */, - nullptr, - nullptr, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_global_average_pooling_nwc_qs8( - -1 /* input zero point */, 0.5f /* input scale */, - -1 /* output zero point */, 0.5f /* output scale */, - -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 1024 /* input channels per group */, - 1001 /* output_channels_per_group */, - 1024 /* input pixel stride */, - 1001 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s84.data(), w84.data(), w85.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - size_t op0_workspace_size = 0; - size_t op0_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - &op0_workspace_size, &op0_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = 
std::max(max_workspace_size, op0_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - size_t op1_workspace_size = 0; - size_t op1_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op1, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op1_workspace_size, &op1_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op1_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - size_t op2_workspace_size = 0; - size_t op2_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op2_workspace_size, &op2_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op2_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #2" << std::endl; - return ExecutionPlan(); - } - - size_t op3_workspace_size = 0; - size_t op3_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op3, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op3_workspace_size, &op3_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op3_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - size_t op4_workspace_size = 0; - size_t op4_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op4, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - 
&op4_workspace_size, &op4_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op4_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - size_t op5_workspace_size = 0; - size_t op5_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op5, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op5_workspace_size, &op5_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op5_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - size_t op6_workspace_size = 0; - size_t op6_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op6, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op6_workspace_size, &op6_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op6_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - size_t op7_workspace_size = 0; - size_t op7_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op7_workspace_size, &op7_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op7_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - size_t op8_workspace_size = 0; - 
size_t op8_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op8, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op8_workspace_size, &op8_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op8_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - size_t op9_workspace_size = 0; - size_t op9_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op9, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op9_workspace_size, &op9_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op9_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - size_t op10_workspace_size = 0; - size_t op10_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op10, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op10_workspace_size, &op10_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op10_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - size_t op11_workspace_size = 0; - size_t op11_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op11, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op11_workspace_size, &op11_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op11_workspace_size); - 
if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - size_t op12_workspace_size = 0; - size_t op12_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op12, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op12_workspace_size, &op12_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op12_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - size_t op13_workspace_size = 0; - size_t op13_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op13, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op13_workspace_size, &op13_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op13_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - size_t op14_workspace_size = 0; - size_t op14_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op14, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op14_workspace_size, &op14_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op14_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - size_t op15_workspace_size = 0; - size_t op15_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op15, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op15_workspace_size, 
&op15_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op15_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - size_t op16_workspace_size = 0; - size_t op16_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op16, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op16_workspace_size, &op16_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op16_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - size_t op17_workspace_size = 0; - size_t op17_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op17, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op17_workspace_size, &op17_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op17_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - size_t op18_workspace_size = 0; - size_t op18_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op18, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op18_workspace_size, &op18_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op18_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - size_t op19_workspace_size = 0; - 
size_t op19_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op19, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op19_workspace_size, &op19_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op19_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - size_t op20_workspace_size = 0; - size_t op20_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op20, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op20_workspace_size, &op20_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op20_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - size_t op21_workspace_size = 0; - size_t op21_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op21, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op21_workspace_size, &op21_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op21_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - size_t op22_workspace_size = 0; - size_t op22_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op22, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op22_workspace_size, &op22_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, 
op22_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - size_t op23_workspace_size = 0; - size_t op23_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op23, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op23_workspace_size, &op23_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op23_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - size_t op24_workspace_size = 0; - size_t op24_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op24, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op24_workspace_size, &op24_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op24_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - size_t op25_workspace_size = 0; - size_t op25_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op25, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op25_workspace_size, &op25_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op25_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - size_t op26_workspace_size = 0; - size_t op26_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op26, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - 
&op26_workspace_size, &op26_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op26_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - size_t op27_workspace_size = 0; - size_t op27_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qs8( - op27, - /*batch_size=*/1, 49 /* width */, - 1024 /* channels */, 1024 /* input stride */, 1024 /* output stride */, - &op27_workspace_size, &op27_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op27_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - size_t op28_workspace_size = 0; - size_t op28_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op28, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op28_workspace_size, &op28_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op28_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op0, - workspace.data(), /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op1, - workspace.data(), /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nhwc_qs8_qc8w( - op2, - workspace.data(), /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op3, - workspace.data(), /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op4, - workspace.data(), /*input=*/v4.data(), /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op5, - workspace.data(), /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op6, - workspace.data(), /*input=*/v6.data(), /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op7, - workspace.data(), /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op8, - workspace.data(), /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op9, - workspace.data(), /*input=*/v9.data(), /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - 
status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op10, - workspace.data(), /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op11, - workspace.data(), /*input=*/v11.data(), /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op12, - workspace.data(), /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op13, - workspace.data(), /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op14, - workspace.data(), /*input=*/v14.data(), /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op15, - workspace.data(), /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op16, - workspace.data(), /*input=*/v16.data(), /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op17, - workspace.data(), /*input=*/v17.data(), /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << 
std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op18, - workspace.data(), /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op19, - workspace.data(), /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op20, - workspace.data(), /*input=*/v20.data(), /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op21, - workspace.data(), /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op22, - workspace.data(), /*input=*/v22.data(), /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op23, - workspace.data(), /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op24, - workspace.data(), /*input=*/v24.data(), /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op25, - workspace.data(), /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - 
std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op26, - workspace.data(), /*input=*/v26.data(), /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qs8( - op27, - workspace.data(), - /*input=*/v27.data(), /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op28, - workspace.data(), /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -} // namespace models diff --git a/models/qs8-qc8w-mobilenet-v2.cc b/models/qs8-qc8w-mobilenet-v2.cc deleted file mode 100644 index 88ce865dcbc..00000000000 --- a/models/qs8-qc8w-mobilenet-v2.cc +++ /dev/null @@ -1,3355 +0,0 @@ -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan QC8MobileNetV2(pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static std::array v30; - alignas(16) static std::array v31; - alignas(16) static std::array v32; - alignas(16) static std::array v33; - alignas(16) static std::array v34; - alignas(16) static std::array v35; - alignas(16) static std::array v36; - alignas(16) static std::array v37; - alignas(16) static std::array v38; - alignas(16) static std::array v39; - alignas(16) static std::array v40; - alignas(16) static std::array v41; - alignas(16) static std::array v42; - alignas(16) static std::array v43; - alignas(16) static std::array v44; - alignas(16) static std::array v45; - alignas(16) static std::array v46; - alignas(16) 
static std::array v47; - alignas(16) static std::array v48; - alignas(16) static std::array v49; - alignas(16) static std::array v50; - alignas(16) static std::array v51; - alignas(16) static std::array v52; - alignas(16) static std::array v53; - alignas(16) static std::array v54; - alignas(16) static std::array v55; - alignas(16) static std::array v56; - alignas(16) static std::array v57; - alignas(16) static std::array v58; - alignas(16) static std::array v59; - alignas(16) static std::array v60; - alignas(16) static std::array v61; - alignas(16) static std::array v62; - alignas(16) static std::array v63; - alignas(16) static std::array v64; - alignas(16) static std::array w65; - alignas(16) static std::array s65; - alignas(16) static std::array w66; - alignas(16) static std::array w67; - alignas(16) static std::array s67; - alignas(16) static std::array w68; - alignas(16) static std::array w69; - alignas(16) static std::array s69; - alignas(16) static std::array w70; - alignas(16) static std::array w71; - alignas(16) static std::array s71; - alignas(16) static std::array w72; - alignas(16) static std::array w73; - alignas(16) static std::array s73; - alignas(16) static std::array w74; - alignas(16) static std::array w75; - alignas(16) static std::array s75; - alignas(16) static std::array w76; - alignas(16) static std::array w77; - alignas(16) static std::array s77; - alignas(16) static std::array w78; - alignas(16) static std::array w79; - alignas(16) static std::array s79; - alignas(16) static std::array w80; - alignas(16) static std::array w81; - alignas(16) static std::array s81; - alignas(16) static std::array w82; - alignas(16) static std::array w83; - alignas(16) static std::array s83; - alignas(16) static std::array w84; - alignas(16) static std::array w85; - alignas(16) static std::array s85; - alignas(16) static std::array w86; - alignas(16) static std::array w87; - alignas(16) static std::array s87; - alignas(16) static std::array w88; - alignas(16) 
static std::array w89; - alignas(16) static std::array s89; - alignas(16) static std::array w90; - alignas(16) static std::array w91; - alignas(16) static std::array s91; - alignas(16) static std::array w92; - alignas(16) static std::array w93; - alignas(16) static std::array s93; - alignas(16) static std::array w94; - alignas(16) static std::array w95; - alignas(16) static std::array s95; - alignas(16) static std::array w96; - alignas(16) static std::array w97; - alignas(16) static std::array s97; - alignas(16) static std::array w98; - alignas(16) static std::array w99; - alignas(16) static std::array s99; - alignas(16) static std::array w100; - alignas(16) static std::array w101; - alignas(16) static std::array s101; - alignas(16) static std::array w102; - alignas(16) static std::array w103; - alignas(16) static std::array s103; - alignas(16) static std::array w104; - alignas(16) static std::array w105; - alignas(16) static std::array s105; - alignas(16) static std::array w106; - alignas(16) static std::array w107; - alignas(16) static std::array s107; - alignas(16) static std::array w108; - alignas(16) static std::array w109; - alignas(16) static std::array s109; - alignas(16) static std::array w110; - alignas(16) static std::array w111; - alignas(16) static std::array s111; - alignas(16) static std::array w112; - alignas(16) static std::array w113; - alignas(16) static std::array s113; - alignas(16) static std::array w114; - alignas(16) static std::array w115; - alignas(16) static std::array s115; - alignas(16) static std::array w116; - alignas(16) static std::array w117; - alignas(16) static std::array s117; - alignas(16) static std::array w118; - alignas(16) static std::array w119; - alignas(16) static std::array s119; - alignas(16) static std::array w120; - alignas(16) static std::array w121; - alignas(16) static std::array s121; - alignas(16) static std::array w122; - alignas(16) static std::array w123; - alignas(16) static std::array s123; - alignas(16) 
static std::array w124; - alignas(16) static std::array w125; - alignas(16) static std::array s125; - alignas(16) static std::array w126; - alignas(16) static std::array w127; - alignas(16) static std::array s127; - alignas(16) static std::array w128; - alignas(16) static std::array w129; - alignas(16) static std::array s129; - alignas(16) static std::array w130; - alignas(16) static std::array w131; - alignas(16) static std::array s131; - alignas(16) static std::array w132; - alignas(16) static std::array w133; - alignas(16) static std::array s133; - alignas(16) static std::array w134; - alignas(16) static std::array w135; - alignas(16) static std::array s135; - alignas(16) static std::array w136; - alignas(16) static std::array w137; - alignas(16) static std::array s137; - alignas(16) static std::array w138; - alignas(16) static std::array w139; - alignas(16) static std::array s139; - alignas(16) static std::array w140; - alignas(16) static std::array w141; - alignas(16) static std::array s141; - alignas(16) static std::array w142; - alignas(16) static std::array w143; - alignas(16) static std::array s143; - alignas(16) static std::array w144; - alignas(16) static std::array w145; - alignas(16) static std::array s145; - alignas(16) static std::array w146; - alignas(16) static std::array w147; - alignas(16) static std::array s147; - alignas(16) static std::array w148; - alignas(16) static std::array w149; - alignas(16) static std::array s149; - alignas(16) static std::array w150; - alignas(16) static std::array w151; - alignas(16) static std::array s151; - alignas(16) static std::array w152; - alignas(16) static std::array w153; - alignas(16) static std::array s153; - alignas(16) static std::array w154; - alignas(16) static std::array w155; - alignas(16) static std::array s155; - alignas(16) static std::array w156; - alignas(16) static std::array w157; - alignas(16) static std::array s157; - alignas(16) static std::array w158; - alignas(16) static std::array w159; 
- alignas(16) static std::array s159; - alignas(16) static std::array w160; - alignas(16) static std::array w161; - alignas(16) static std::array s161; - alignas(16) static std::array w162; - alignas(16) static std::array w163; - alignas(16) static std::array s163; - alignas(16) static std::array w164; - alignas(16) static std::array w165; - alignas(16) static std::array s165; - alignas(16) static std::array w166; - alignas(16) static std::array w167; - alignas(16) static std::array s167; - alignas(16) static std::array w168; - alignas(16) static std::array w169; - alignas(16) static std::array s169; - alignas(16) static std::array w170; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto i8rng = std::bind(std::uniform_int_distribution(-127, 127), std::ref(rng)); - auto i32rng = std::bind(std::uniform_int_distribution(-10000, 10000), std::ref(rng)); - auto srng = std::bind(std::uniform_real_distribution(0.5f, 0.75f), std::ref(rng)); - std::generate(v0.begin(), v0.end(), std::ref(i8rng)); - std::generate(v1.begin(), v1.end(), std::ref(i8rng)); - std::generate(v2.begin(), v2.end(), std::ref(i8rng)); - std::generate(v3.begin(), v3.end(), std::ref(i8rng)); - std::generate(v4.begin(), v4.end(), std::ref(i8rng)); - std::generate(v5.begin(), v5.end(), std::ref(i8rng)); - std::generate(v6.begin(), v6.end(), std::ref(i8rng)); - std::generate(v7.begin(), v7.end(), std::ref(i8rng)); - std::generate(v8.begin(), v8.end(), std::ref(i8rng)); - std::generate(v9.begin(), v9.end(), std::ref(i8rng)); - std::generate(v10.begin(), v10.end(), std::ref(i8rng)); - std::generate(v11.begin(), v11.end(), std::ref(i8rng)); - std::generate(v12.begin(), v12.end(), std::ref(i8rng)); - std::generate(v13.begin(), v13.end(), std::ref(i8rng)); - std::generate(v14.begin(), v14.end(), std::ref(i8rng)); - std::generate(v15.begin(), v15.end(), std::ref(i8rng)); - std::generate(v16.begin(), v16.end(), std::ref(i8rng)); - std::generate(v17.begin(), v17.end(), 
std::ref(i8rng)); - std::generate(v18.begin(), v18.end(), std::ref(i8rng)); - std::generate(v19.begin(), v19.end(), std::ref(i8rng)); - std::generate(v20.begin(), v20.end(), std::ref(i8rng)); - std::generate(v21.begin(), v21.end(), std::ref(i8rng)); - std::generate(v22.begin(), v22.end(), std::ref(i8rng)); - std::generate(v23.begin(), v23.end(), std::ref(i8rng)); - std::generate(v24.begin(), v24.end(), std::ref(i8rng)); - std::generate(v25.begin(), v25.end(), std::ref(i8rng)); - std::generate(v26.begin(), v26.end(), std::ref(i8rng)); - std::generate(v27.begin(), v27.end(), std::ref(i8rng)); - std::generate(v28.begin(), v28.end(), std::ref(i8rng)); - std::generate(v29.begin(), v29.end(), std::ref(i8rng)); - std::generate(v30.begin(), v30.end(), std::ref(i8rng)); - std::generate(v31.begin(), v31.end(), std::ref(i8rng)); - std::generate(v32.begin(), v32.end(), std::ref(i8rng)); - std::generate(v33.begin(), v33.end(), std::ref(i8rng)); - std::generate(v34.begin(), v34.end(), std::ref(i8rng)); - std::generate(v35.begin(), v35.end(), std::ref(i8rng)); - std::generate(v36.begin(), v36.end(), std::ref(i8rng)); - std::generate(v37.begin(), v37.end(), std::ref(i8rng)); - std::generate(v38.begin(), v38.end(), std::ref(i8rng)); - std::generate(v39.begin(), v39.end(), std::ref(i8rng)); - std::generate(v40.begin(), v40.end(), std::ref(i8rng)); - std::generate(v41.begin(), v41.end(), std::ref(i8rng)); - std::generate(v42.begin(), v42.end(), std::ref(i8rng)); - std::generate(v43.begin(), v43.end(), std::ref(i8rng)); - std::generate(v44.begin(), v44.end(), std::ref(i8rng)); - std::generate(v45.begin(), v45.end(), std::ref(i8rng)); - std::generate(v46.begin(), v46.end(), std::ref(i8rng)); - std::generate(v47.begin(), v47.end(), std::ref(i8rng)); - std::generate(v48.begin(), v48.end(), std::ref(i8rng)); - std::generate(v49.begin(), v49.end(), std::ref(i8rng)); - std::generate(v50.begin(), v50.end(), std::ref(i8rng)); - std::generate(v51.begin(), v51.end(), std::ref(i8rng)); - 
std::generate(v52.begin(), v52.end(), std::ref(i8rng)); - std::generate(v53.begin(), v53.end(), std::ref(i8rng)); - std::generate(v54.begin(), v54.end(), std::ref(i8rng)); - std::generate(v55.begin(), v55.end(), std::ref(i8rng)); - std::generate(v56.begin(), v56.end(), std::ref(i8rng)); - std::generate(v57.begin(), v57.end(), std::ref(i8rng)); - std::generate(v58.begin(), v58.end(), std::ref(i8rng)); - std::generate(v59.begin(), v59.end(), std::ref(i8rng)); - std::generate(v60.begin(), v60.end(), std::ref(i8rng)); - std::generate(v61.begin(), v61.end(), std::ref(i8rng)); - std::generate(v62.begin(), v62.end(), std::ref(i8rng)); - std::generate(v63.begin(), v63.end(), std::ref(i8rng)); - std::generate(v64.begin(), v64.end(), std::ref(i8rng)); - std::generate(w65.begin(), w65.end(), std::ref(i8rng)); - std::generate(s65.begin(), s65.end(), std::ref(srng)); - std::generate(w66.begin(), w66.end(), std::ref(i32rng)); - std::generate(w67.begin(), w67.end(), std::ref(i8rng)); - std::generate(s67.begin(), s67.end(), std::ref(srng)); - std::generate(w68.begin(), w68.end(), std::ref(i32rng)); - std::generate(w69.begin(), w69.end(), std::ref(i8rng)); - std::generate(s69.begin(), s69.end(), std::ref(srng)); - std::generate(w70.begin(), w70.end(), std::ref(i32rng)); - std::generate(w71.begin(), w71.end(), std::ref(i8rng)); - std::generate(s71.begin(), s71.end(), std::ref(srng)); - std::generate(w72.begin(), w72.end(), std::ref(i32rng)); - std::generate(w73.begin(), w73.end(), std::ref(i8rng)); - std::generate(s73.begin(), s73.end(), std::ref(srng)); - std::generate(w74.begin(), w74.end(), std::ref(i32rng)); - std::generate(w75.begin(), w75.end(), std::ref(i8rng)); - std::generate(s75.begin(), s75.end(), std::ref(srng)); - std::generate(w76.begin(), w76.end(), std::ref(i32rng)); - std::generate(w77.begin(), w77.end(), std::ref(i8rng)); - std::generate(s77.begin(), s77.end(), std::ref(srng)); - std::generate(w78.begin(), w78.end(), std::ref(i32rng)); - std::generate(w79.begin(), 
w79.end(), std::ref(i8rng)); - std::generate(s79.begin(), s79.end(), std::ref(srng)); - std::generate(w80.begin(), w80.end(), std::ref(i32rng)); - std::generate(w81.begin(), w81.end(), std::ref(i8rng)); - std::generate(s81.begin(), s81.end(), std::ref(srng)); - std::generate(w82.begin(), w82.end(), std::ref(i32rng)); - std::generate(w83.begin(), w83.end(), std::ref(i8rng)); - std::generate(s83.begin(), s83.end(), std::ref(srng)); - std::generate(w84.begin(), w84.end(), std::ref(i32rng)); - std::generate(w85.begin(), w85.end(), std::ref(i8rng)); - std::generate(s85.begin(), s85.end(), std::ref(srng)); - std::generate(w86.begin(), w86.end(), std::ref(i32rng)); - std::generate(w87.begin(), w87.end(), std::ref(i8rng)); - std::generate(s87.begin(), s87.end(), std::ref(srng)); - std::generate(w88.begin(), w88.end(), std::ref(i32rng)); - std::generate(w89.begin(), w89.end(), std::ref(i8rng)); - std::generate(s89.begin(), s89.end(), std::ref(srng)); - std::generate(w90.begin(), w90.end(), std::ref(i32rng)); - std::generate(w91.begin(), w91.end(), std::ref(i8rng)); - std::generate(s91.begin(), s91.end(), std::ref(srng)); - std::generate(w92.begin(), w92.end(), std::ref(i32rng)); - std::generate(w93.begin(), w93.end(), std::ref(i8rng)); - std::generate(s93.begin(), s93.end(), std::ref(srng)); - std::generate(w94.begin(), w94.end(), std::ref(i32rng)); - std::generate(w95.begin(), w95.end(), std::ref(i8rng)); - std::generate(s95.begin(), s95.end(), std::ref(srng)); - std::generate(w96.begin(), w96.end(), std::ref(i32rng)); - std::generate(w97.begin(), w97.end(), std::ref(i8rng)); - std::generate(s97.begin(), s97.end(), std::ref(srng)); - std::generate(w98.begin(), w98.end(), std::ref(i32rng)); - std::generate(w99.begin(), w99.end(), std::ref(i8rng)); - std::generate(s99.begin(), s99.end(), std::ref(srng)); - std::generate(w100.begin(), w100.end(), std::ref(i32rng)); - std::generate(w101.begin(), w101.end(), std::ref(i8rng)); - std::generate(s101.begin(), s101.end(), 
std::ref(srng)); - std::generate(w102.begin(), w102.end(), std::ref(i32rng)); - std::generate(w103.begin(), w103.end(), std::ref(i8rng)); - std::generate(s103.begin(), s103.end(), std::ref(srng)); - std::generate(w104.begin(), w104.end(), std::ref(i32rng)); - std::generate(w105.begin(), w105.end(), std::ref(i8rng)); - std::generate(s105.begin(), s105.end(), std::ref(srng)); - std::generate(w106.begin(), w106.end(), std::ref(i32rng)); - std::generate(w107.begin(), w107.end(), std::ref(i8rng)); - std::generate(s107.begin(), s107.end(), std::ref(srng)); - std::generate(w108.begin(), w108.end(), std::ref(i32rng)); - std::generate(w109.begin(), w109.end(), std::ref(i8rng)); - std::generate(s109.begin(), s109.end(), std::ref(srng)); - std::generate(w110.begin(), w110.end(), std::ref(i32rng)); - std::generate(w111.begin(), w111.end(), std::ref(i8rng)); - std::generate(s111.begin(), s111.end(), std::ref(srng)); - std::generate(w112.begin(), w112.end(), std::ref(i32rng)); - std::generate(w113.begin(), w113.end(), std::ref(i8rng)); - std::generate(s113.begin(), s113.end(), std::ref(srng)); - std::generate(w114.begin(), w114.end(), std::ref(i32rng)); - std::generate(w115.begin(), w115.end(), std::ref(i8rng)); - std::generate(s115.begin(), s115.end(), std::ref(srng)); - std::generate(w116.begin(), w116.end(), std::ref(i32rng)); - std::generate(w117.begin(), w117.end(), std::ref(i8rng)); - std::generate(s117.begin(), s117.end(), std::ref(srng)); - std::generate(w118.begin(), w118.end(), std::ref(i32rng)); - std::generate(w119.begin(), w119.end(), std::ref(i8rng)); - std::generate(s119.begin(), s119.end(), std::ref(srng)); - std::generate(w120.begin(), w120.end(), std::ref(i32rng)); - std::generate(w121.begin(), w121.end(), std::ref(i8rng)); - std::generate(s121.begin(), s121.end(), std::ref(srng)); - std::generate(w122.begin(), w122.end(), std::ref(i32rng)); - std::generate(w123.begin(), w123.end(), std::ref(i8rng)); - std::generate(s123.begin(), s123.end(), std::ref(srng)); - 
std::generate(w124.begin(), w124.end(), std::ref(i32rng)); - std::generate(w125.begin(), w125.end(), std::ref(i8rng)); - std::generate(s125.begin(), s125.end(), std::ref(srng)); - std::generate(w126.begin(), w126.end(), std::ref(i32rng)); - std::generate(w127.begin(), w127.end(), std::ref(i8rng)); - std::generate(s127.begin(), s127.end(), std::ref(srng)); - std::generate(w128.begin(), w128.end(), std::ref(i32rng)); - std::generate(w129.begin(), w129.end(), std::ref(i8rng)); - std::generate(s129.begin(), s129.end(), std::ref(srng)); - std::generate(w130.begin(), w130.end(), std::ref(i32rng)); - std::generate(w131.begin(), w131.end(), std::ref(i8rng)); - std::generate(s131.begin(), s131.end(), std::ref(srng)); - std::generate(w132.begin(), w132.end(), std::ref(i32rng)); - std::generate(w133.begin(), w133.end(), std::ref(i8rng)); - std::generate(s133.begin(), s133.end(), std::ref(srng)); - std::generate(w134.begin(), w134.end(), std::ref(i32rng)); - std::generate(w135.begin(), w135.end(), std::ref(i8rng)); - std::generate(s135.begin(), s135.end(), std::ref(srng)); - std::generate(w136.begin(), w136.end(), std::ref(i32rng)); - std::generate(w137.begin(), w137.end(), std::ref(i8rng)); - std::generate(s137.begin(), s137.end(), std::ref(srng)); - std::generate(w138.begin(), w138.end(), std::ref(i32rng)); - std::generate(w139.begin(), w139.end(), std::ref(i8rng)); - std::generate(s139.begin(), s139.end(), std::ref(srng)); - std::generate(w140.begin(), w140.end(), std::ref(i32rng)); - std::generate(w141.begin(), w141.end(), std::ref(i8rng)); - std::generate(s141.begin(), s141.end(), std::ref(srng)); - std::generate(w142.begin(), w142.end(), std::ref(i32rng)); - std::generate(w143.begin(), w143.end(), std::ref(i8rng)); - std::generate(s143.begin(), s143.end(), std::ref(srng)); - std::generate(w144.begin(), w144.end(), std::ref(i32rng)); - std::generate(w145.begin(), w145.end(), std::ref(i8rng)); - std::generate(s145.begin(), s145.end(), std::ref(srng)); - 
std::generate(w146.begin(), w146.end(), std::ref(i32rng)); - std::generate(w147.begin(), w147.end(), std::ref(i8rng)); - std::generate(s147.begin(), s147.end(), std::ref(srng)); - std::generate(w148.begin(), w148.end(), std::ref(i32rng)); - std::generate(w149.begin(), w149.end(), std::ref(i8rng)); - std::generate(s149.begin(), s149.end(), std::ref(srng)); - std::generate(w150.begin(), w150.end(), std::ref(i32rng)); - std::generate(w151.begin(), w151.end(), std::ref(i8rng)); - std::generate(s151.begin(), s151.end(), std::ref(srng)); - std::generate(w152.begin(), w152.end(), std::ref(i32rng)); - std::generate(w151.begin(), w151.end(), std::ref(i8rng)); - std::generate(s153.begin(), s153.end(), std::ref(srng)); - std::generate(w154.begin(), w154.end(), std::ref(i32rng)); - std::generate(w155.begin(), w155.end(), std::ref(i8rng)); - std::generate(s155.begin(), s155.end(), std::ref(srng)); - std::generate(w156.begin(), w156.end(), std::ref(i32rng)); - std::generate(w157.begin(), w157.end(), std::ref(i8rng)); - std::generate(s157.begin(), s157.end(), std::ref(srng)); - std::generate(w158.begin(), w158.end(), std::ref(i32rng)); - std::generate(w159.begin(), w159.end(), std::ref(i8rng)); - std::generate(s159.begin(), s159.end(), std::ref(srng)); - std::generate(w160.begin(), w160.end(), std::ref(i32rng)); - std::generate(w161.begin(), w161.end(), std::ref(i8rng)); - std::generate(s161.begin(), s161.end(), std::ref(srng)); - std::generate(w162.begin(), w162.end(), std::ref(i32rng)); - std::generate(w163.begin(), w163.end(), std::ref(i8rng)); - std::generate(s163.begin(), s163.end(), std::ref(srng)); - std::generate(w164.begin(), w164.end(), std::ref(i32rng)); - std::generate(w165.begin(), w165.end(), std::ref(i8rng)); - std::generate(s165.begin(), s165.end(), std::ref(srng)); - std::generate(w166.begin(), w166.end(), std::ref(i32rng)); - std::generate(w167.begin(), w167.end(), std::ref(i8rng)); - std::generate(s167.begin(), s167.end(), std::ref(srng)); - 
std::generate(w168.begin(), w168.end(), std::ref(i32rng)); - std::generate(w169.begin(), w169.end(), std::ref(i8rng)); - std::generate(s169.begin(), s169.end(), std::ref(srng)); - std::generate(w170.begin(), w170.end(), std::ref(i32rng)); - - Operators operators; - xnn_status status; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 3 /* input channels per group */, - 32 /* output_channels_per_group */, - 3 /* input pixel stride */, - 32 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s65.data(), w65.data(), w66.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 32 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 32 /* input pixel stride */, - 32 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s67.data(), w67.data(), w68.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op1); - if (status != xnn_status_success) { - 
std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 16 /* output_channels_per_group */, - 32 /* input pixel stride */, - 16 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s69.data(), w69.data(), w70.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 16 /* input channels per group */, - 96 /* output_channels_per_group */, - 16 /* input pixel stride */, - 96 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s71.data(), w71.data(), w72.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - 
status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 96 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 96 /* input pixel stride */, - 96 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s73.data(), w73.data(), w74.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 24 /* output_channels_per_group */, - 96 /* input pixel stride */, - 24 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s75.data(), w75.data(), w76.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel 
width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 144 /* output_channels_per_group */, - 24 /* input pixel stride */, - 144 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s77.data(), w77.data(), w78.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 144 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 144 /* input pixel stride */, - 144 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s79.data(), w79.data(), w80.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 
24 /* output_channels_per_group */, - 144 /* input pixel stride */, - 24 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s81.data(), w81.data(), w82.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_add_nd_qs8( - -1 /* input1 zero point */, 0.5f /* input1 scale */, - -1 /* input2 zero point */, 0.5f /* input2 scale */, - -1 /* output zero point */, 1.0f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 24 /* input channels per group */, - 144 /* output_channels_per_group */, - 24 /* input pixel stride */, - 144 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s83.data(), w83.data(), w84.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = 
xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 144 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 144 /* input pixel stride */, - 144 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s85.data(), w85.data(), w86.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 144 /* input channels per group */, - 32 /* output_channels_per_group */, - 144 /* input pixel stride */, - 32 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s87.data(), w87.data(), w88.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* 
kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 192 /* output_channels_per_group */, - 32 /* input pixel stride */, - 192 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s89.data(), w89.data(), w90.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 192 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 192 /* input pixel stride */, - 192 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s91.data(), w91.data(), w92.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 192 /* input channels 
per group */, - 32 /* output_channels_per_group */, - 192 /* input pixel stride */, - 32 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s93.data(), w93.data(), w94.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_add_nd_qs8( - -1 /* input1 zero point */, 0.5f /* input1 scale */, - -1 /* input2 zero point */, 0.5f /* input2 scale */, - -1 /* output zero point */, 1.0f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 192 /* output_channels_per_group */, - 32 /* input pixel stride */, - 192 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s95.data(), w95.data(), w96.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = 
xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 192 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 192 /* input pixel stride */, - 192 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s97.data(), w97.data(), w98.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 192 /* input channels per group */, - 32 /* output_channels_per_group */, - 192 /* input pixel stride */, - 32 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s99.data(), w99.data(), w100.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_add_nd_qs8( - -1 /* input1 zero point */, 0.5f /* input1 scale */, - -1 /* input2 zero point */, 0.5f /* input2 scale */, - -1 /* output zero point */, 
1.0f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 32 /* input channels per group */, - 192 /* output_channels_per_group */, - 32 /* input pixel stride */, - 192 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s101.data(), w101.data(), w102.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 192 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 192 /* input pixel stride */, - 192 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s103.data(), w103.data(), w104.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to 
create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 192 /* input channels per group */, - 64 /* output_channels_per_group */, - 192 /* input pixel stride */, - 64 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s105.data(), w105.data(), w106.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 384 /* output_channels_per_group */, - 64 /* input pixel stride */, - 384 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s107.data(), w107.data(), w108.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - 
status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 384 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 384 /* input pixel stride */, - 384 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s109.data(), w109.data(), w110.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 384 /* input channels per group */, - 64 /* output_channels_per_group */, - 384 /* input pixel stride */, - 64 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s111.data(), w111.data(), w112.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_add_nd_qs8( - -1 /* input1 zero point */, 0.5f /* input1 scale */, - -1 /* input2 zero point */, 0.5f /* input2 scale */, - -1 /* output zero 
point */, 1.0f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 384 /* output_channels_per_group */, - 64 /* input pixel stride */, - 384 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s113.data(), w113.data(), w114.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - xnn_operator_t op29 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 384 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 384 /* input pixel stride */, - 384 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s115.data(), w115.data(), w116.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op29); - if (status != xnn_status_success) { - std::cerr << 
"failed to create operation #29" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op29, xnn_delete_operator); - - xnn_operator_t op30 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 384 /* input channels per group */, - 64 /* output_channels_per_group */, - 384 /* input pixel stride */, - 64 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s117.data(), w117.data(), w118.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op30); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #30" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op30, xnn_delete_operator); - - xnn_operator_t op31 = nullptr; - status = xnn_create_add_nd_qs8( - -1 /* input1 zero point */, 0.5f /* input1 scale */, - -1 /* input2 zero point */, 0.5f /* input2 scale */, - -1 /* output zero point */, 1.0f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op31); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #31" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op31, xnn_delete_operator); - - xnn_operator_t op32 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 384 /* output_channels_per_group */, - 64 /* input pixel stride */, - 
384 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s119.data(), w119.data(), w120.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op32); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #32" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op32, xnn_delete_operator); - - xnn_operator_t op33 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 384 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 384 /* input pixel stride */, - 384 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s121.data(), w121.data(), w122.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op33); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #33" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op33, xnn_delete_operator); - - xnn_operator_t op34 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 384 /* input channels per group */, - 64 /* output_channels_per_group */, - 384 /* input pixel stride */, - 64 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s123.data(), w123.data(), w124.data(), - -1 /* output zero point */, 0.5f 
/* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op34); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #34" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op34, xnn_delete_operator); - - xnn_operator_t op35 = nullptr; - status = xnn_create_add_nd_qs8( - -1 /* input1 zero point */, 0.5f /* input1 scale */, - -1 /* input2 zero point */, 0.5f /* input2 scale */, - -1 /* output zero point */, 1.0f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op35); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #35" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op35, xnn_delete_operator); - - xnn_operator_t op36 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 64 /* input channels per group */, - 384 /* output_channels_per_group */, - 64 /* input pixel stride */, - 384 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s125.data(), w125.data(), w126.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op36); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #36" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op36, xnn_delete_operator); - - xnn_operator_t op37 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* 
dilation_height */, 1 /* dilation_width */, - 384 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 384 /* input pixel stride */, - 384 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s127.data(), w127.data(), w128.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op37); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #37" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op37, xnn_delete_operator); - - xnn_operator_t op38 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 384 /* input channels per group */, - 96 /* output_channels_per_group */, - 384 /* input pixel stride */, - 96 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s129.data(), w129.data(), w130.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op38); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #38" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op38, xnn_delete_operator); - - xnn_operator_t op39 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel 
stride */, - 576 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s131.data(), w131.data(), w132.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op39); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #39" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op39, xnn_delete_operator); - - xnn_operator_t op40 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s133.data(), w133.data(), w134.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op40); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #40" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op40, xnn_delete_operator); - - xnn_operator_t op41 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 96 /* output_channels_per_group */, - 576 /* input pixel stride */, - 96 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s135.data(), w135.data(), w136.data(), - -1 /* output zero 
point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op41); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #41" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op41, xnn_delete_operator); - - xnn_operator_t op42 = nullptr; - status = xnn_create_add_nd_qs8( - -1 /* input1 zero point */, 0.5f /* input1 scale */, - -1 /* input2 zero point */, 0.5f /* input2 scale */, - -1 /* output zero point */, 1.0f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op42); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #42" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op42, xnn_delete_operator); - - xnn_operator_t op43 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s137.data(), w137.data(), w138.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op43); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #43" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op43, xnn_delete_operator); - - xnn_operator_t op44 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width 
*/, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s139.data(), w139.data(), w140.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op44); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #44" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op44, xnn_delete_operator); - - xnn_operator_t op45 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 96 /* output_channels_per_group */, - 576 /* input pixel stride */, - 96 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s141.data(), w141.data(), w142.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op45); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #45" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op45, xnn_delete_operator); - - xnn_operator_t op46 = nullptr; - status = xnn_create_add_nd_qs8( - -1 /* input1 zero point */, 0.5f /* input1 scale */, - -1 /* input2 zero point */, 0.5f /* input2 scale */, - -1 /* output zero point */, 1.0f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op46); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #46" << std::endl; - return ExecutionPlan(); - } - 
operators.emplace_back(op46, xnn_delete_operator); - - xnn_operator_t op47 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 96 /* input channels per group */, - 576 /* output_channels_per_group */, - 96 /* input pixel stride */, - 576 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s143.data(), w143.data(), w144.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op47); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #47" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op47, xnn_delete_operator); - - xnn_operator_t op48 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 0 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 2 /* subsampling height */, 2 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 576 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 576 /* input pixel stride */, - 576 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s145.data(), w145.data(), w146.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op48); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #48" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op48, xnn_delete_operator); - - xnn_operator_t op49 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 
/* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 576 /* input channels per group */, - 160 /* output_channels_per_group */, - 576 /* input pixel stride */, - 160 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s147.data(), w147.data(), w148.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op49); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #49" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op49, xnn_delete_operator); - - xnn_operator_t op50 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s149.data(), w149.data(), w150.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op50); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #50" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op50, xnn_delete_operator); - - xnn_operator_t op51 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* 
subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, - 960 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s151.data(), w151.data(), w152.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op51); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #51" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op51, xnn_delete_operator); - - xnn_operator_t op52 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 160 /* output_channels_per_group */, - 960 /* input pixel stride */, - 160 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s153.data(), w153.data(), w154.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op52); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #52" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op52, xnn_delete_operator); - - xnn_operator_t op53 = nullptr; - status = xnn_create_add_nd_qs8( - -1 /* input1 zero point */, 0.5f /* input1 scale */, - -1 /* input2 zero point */, 0.5f /* input2 scale */, - -1 /* output zero point */, 1.0f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op53); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #53" << std::endl; - return 
ExecutionPlan(); - } - operators.emplace_back(op53, xnn_delete_operator); - - xnn_operator_t op54 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s155.data(), w155.data(), w156.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op54); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #54" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op54, xnn_delete_operator); - - xnn_operator_t op55 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, - 960 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s157.data(), w157.data(), w158.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op55); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #55" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op55, xnn_delete_operator); - - xnn_operator_t op56 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( 
- 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 160 /* output_channels_per_group */, - 960 /* input pixel stride */, - 160 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s159.data(), w159.data(), w160.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op56); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #56" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op56, xnn_delete_operator); - - xnn_operator_t op57 = nullptr; - status = xnn_create_add_nd_qs8( - -1 /* input1 zero point */, 0.5f /* input1 scale */, - -1 /* input2 zero point */, 0.5f /* input2 scale */, - -1 /* output zero point */, 1.0f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op57); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #57" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op57, xnn_delete_operator); - - xnn_operator_t op58 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 160 /* input channels per group */, - 960 /* output_channels_per_group */, - 160 /* input pixel stride */, - 960 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s161.data(), w161.data(), w162.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* 
output max */, - 0 /* flags */, - nullptr, - nullptr, - &op58); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #58" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op58, xnn_delete_operator); - - xnn_operator_t op59 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 1 /* top padding */, 1 /* right padding */, - 1 /* bottom padding */, 1 /* left padding */, - 3 /* kernel height */, 3 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 960 /* groups */, - 1 /* input channels per group */, - 1 /* output_channels_per_group */, - 960 /* input pixel stride */, - 960 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s163.data(), w163.data(), w164.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op59); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #59" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op59, xnn_delete_operator); - - xnn_operator_t op60 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 960 /* input channels per group */, - 320 /* output_channels_per_group */, - 960 /* input pixel stride */, - 320 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s165.data(), w165.data(), w166.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op60); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #60" << 
std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op60, xnn_delete_operator); - - xnn_operator_t op61 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 320 /* input channels per group */, - 1280 /* output_channels_per_group */, - 320 /* input pixel stride */, - 1280 /* output pixel stride */, - -1 /* input zero point */, 0.5f /* input scale */, - s167.data(), w167.data(), w168.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op61); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #61" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op61, xnn_delete_operator); - - xnn_operator_t op62 = nullptr; - status = xnn_create_global_average_pooling_nwc_qs8( - -1 /* input zero point */, 0.5f /* input scale */, - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - &op62); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #62" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op62, xnn_delete_operator); - - xnn_operator_t op63 = nullptr; - status = xnn_create_convolution2d_nhwc_qs8_qc8w( - 0 /* top padding */, 0 /* right padding */, - 0 /* bottom padding */, 0 /* left padding */, - 1 /* kernel height */, 1 /* kernel width */, - 1 /* subsampling height */, 1 /* subsampling width */, - 1 /* dilation_height */, 1 /* dilation_width */, - 1 /* groups */, - 1280 /* input channels per group */, - 1001 /* output_channels_per_group */, - 1280 /* input pixel stride */, - 1001 /* output pixel stride */, - -1 /* input zero point */, 
0.5f /* input scale */, - s169.data(), w169.data(), w170.data(), - -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */, - 0 /* flags */, - nullptr, - nullptr, - &op63); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #63" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op63, xnn_delete_operator); - - size_t op0_workspace_size = 0; - size_t op0_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - &op0_workspace_size, &op0_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op0_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - size_t op1_workspace_size = 0; - size_t op1_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op1, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op1_workspace_size, &op1_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op1_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - size_t op2_workspace_size = 0; - size_t op2_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op2_workspace_size, &op2_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op2_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #2" << std::endl; - return 
ExecutionPlan(); - } - - size_t op3_workspace_size = 0; - size_t op3_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op3, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op3_workspace_size, &op3_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op3_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - size_t op4_workspace_size = 0; - size_t op4_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op4, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op4_workspace_size, &op4_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op4_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - size_t op5_workspace_size = 0; - size_t op5_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op5, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op5_workspace_size, &op5_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op5_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - size_t op6_workspace_size = 0; - size_t op6_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op6, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op6_workspace_size, &op6_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = 
std::max(max_workspace_size, op6_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - size_t op7_workspace_size = 0; - size_t op7_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op7_workspace_size, &op7_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op7_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - size_t op8_workspace_size = 0; - size_t op8_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op8, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op8_workspace_size, &op8_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op8_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 56, 56, 24 }; - const size_t b_shape[] = { 1, 56, 56, 24 }; - status = xnn_reshape_add_nd_qs8( - op9, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - size_t op10_workspace_size = 0; - size_t op10_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op10, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op10_workspace_size, &op10_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op10_workspace_size); 
- if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - size_t op11_workspace_size = 0; - size_t op11_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op11, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op11_workspace_size, &op11_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op11_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - size_t op12_workspace_size = 0; - size_t op12_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op12, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op12_workspace_size, &op12_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op12_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - size_t op13_workspace_size = 0; - size_t op13_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op13, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op13_workspace_size, &op13_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op13_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - size_t op14_workspace_size = 0; - size_t op14_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op14, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op14_workspace_size, 
&op14_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op14_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - size_t op15_workspace_size = 0; - size_t op15_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op15, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op15_workspace_size, &op15_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op15_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 32 }; - const size_t b_shape[] = { 1, 28, 28, 32 }; - status = xnn_reshape_add_nd_qs8( - op16, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - size_t op17_workspace_size = 0; - size_t op17_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op17, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op17_workspace_size, &op17_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op17_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - size_t op18_workspace_size = 0; - size_t op18_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op18, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op18_workspace_size, &op18_workspace_alignment, - 
/*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op18_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - size_t op19_workspace_size = 0; - size_t op19_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op19, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op19_workspace_size, &op19_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op19_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 32 }; - const size_t b_shape[] = { 1, 28, 28, 32 }; - status = xnn_reshape_add_nd_qs8( - op20, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - size_t op21_workspace_size = 0; - size_t op21_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op21, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op21_workspace_size, &op21_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op21_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - size_t op22_workspace_size = 0; - size_t op22_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op22, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op22_workspace_size, &op22_workspace_alignment, - /*output_height_out=*/nullptr, 
/*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op22_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - size_t op23_workspace_size = 0; - size_t op23_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op23, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op23_workspace_size, &op23_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op23_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - size_t op24_workspace_size = 0; - size_t op24_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op24, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op24_workspace_size, &op24_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op24_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - size_t op25_workspace_size = 0; - size_t op25_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op25, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op25_workspace_size, &op25_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op25_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - size_t op26_workspace_size = 0; - size_t op26_workspace_alignment = 0; - status = 
xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op26, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op26_workspace_size, &op26_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op26_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 64 }; - const size_t b_shape[] = { 1, 14, 14, 64 }; - status = xnn_reshape_add_nd_qs8( - op27, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - size_t op28_workspace_size = 0; - size_t op28_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op28, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op28_workspace_size, &op28_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op28_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - size_t op29_workspace_size = 0; - size_t op29_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op29, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op29_workspace_size, &op29_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op29_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #29" << std::endl; - return ExecutionPlan(); - } - - size_t op30_workspace_size = 0; - size_t op30_workspace_alignment = 0; - status = 
xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op30, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op30_workspace_size, &op30_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op30_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #30" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 64 }; - const size_t b_shape[] = { 1, 14, 14, 64 }; - status = xnn_reshape_add_nd_qs8( - op31, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #31" << std::endl; - return ExecutionPlan(); - } - - size_t op32_workspace_size = 0; - size_t op32_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op32, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op32_workspace_size, &op32_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op32_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #32" << std::endl; - return ExecutionPlan(); - } - - size_t op33_workspace_size = 0; - size_t op33_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op33, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op33_workspace_size, &op33_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op33_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #33" << std::endl; - return ExecutionPlan(); - } - - size_t op34_workspace_size = 0; - size_t op34_workspace_alignment = 0; - status = 
xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op34, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op34_workspace_size, &op34_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op34_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #34" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 64 }; - const size_t b_shape[] = { 1, 14, 14, 64 }; - status = xnn_reshape_add_nd_qs8( - op35, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #35" << std::endl; - return ExecutionPlan(); - } - - size_t op36_workspace_size = 0; - size_t op36_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op36, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op36_workspace_size, &op36_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op36_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #36" << std::endl; - return ExecutionPlan(); - } - - size_t op37_workspace_size = 0; - size_t op37_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op37, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op37_workspace_size, &op37_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op37_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #37" << std::endl; - return ExecutionPlan(); - } - - size_t op38_workspace_size = 0; - size_t op38_workspace_alignment = 0; - status = 
xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op38, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op38_workspace_size, &op38_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op38_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #38" << std::endl; - return ExecutionPlan(); - } - - size_t op39_workspace_size = 0; - size_t op39_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op39, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op39_workspace_size, &op39_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op39_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #39" << std::endl; - return ExecutionPlan(); - } - - size_t op40_workspace_size = 0; - size_t op40_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op40, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op40_workspace_size, &op40_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op40_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #40" << std::endl; - return ExecutionPlan(); - } - - size_t op41_workspace_size = 0; - size_t op41_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op41, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op41_workspace_size, &op41_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op41_workspace_size); - if (status != xnn_status_success) 
{ - std::cerr << "failed to reshape operation #41" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 96 }; - const size_t b_shape[] = { 1, 14, 14, 96 }; - status = xnn_reshape_add_nd_qs8( - op42, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #42" << std::endl; - return ExecutionPlan(); - } - - size_t op43_workspace_size = 0; - size_t op43_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op43, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op43_workspace_size, &op43_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op43_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #43" << std::endl; - return ExecutionPlan(); - } - - size_t op44_workspace_size = 0; - size_t op44_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op44, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op44_workspace_size, &op44_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op44_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #44" << std::endl; - return ExecutionPlan(); - } - - size_t op45_workspace_size = 0; - size_t op45_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op45, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op45_workspace_size, &op45_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op45_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to 
reshape operation #45" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 96 }; - const size_t b_shape[] = { 1, 14, 14, 96 }; - status = xnn_reshape_add_nd_qs8( - op46, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #46" << std::endl; - return ExecutionPlan(); - } - - size_t op47_workspace_size = 0; - size_t op47_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op47, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op47_workspace_size, &op47_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op47_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #47" << std::endl; - return ExecutionPlan(); - } - - size_t op48_workspace_size = 0; - size_t op48_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op48, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op48_workspace_size, &op48_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op48_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #48" << std::endl; - return ExecutionPlan(); - } - - size_t op49_workspace_size = 0; - size_t op49_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op49, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op49_workspace_size, &op49_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op49_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #49" << 
std::endl; - return ExecutionPlan(); - } - - size_t op50_workspace_size = 0; - size_t op50_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op50, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op50_workspace_size, &op50_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op50_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #50" << std::endl; - return ExecutionPlan(); - } - - size_t op51_workspace_size = 0; - size_t op51_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op51, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op51_workspace_size, &op51_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op51_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #51" << std::endl; - return ExecutionPlan(); - } - - size_t op52_workspace_size = 0; - size_t op52_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op52, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op52_workspace_size, &op52_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op52_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #52" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 160 }; - const size_t b_shape[] = { 1, 7, 7, 160 }; - status = xnn_reshape_add_nd_qs8( - op53, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #53" << std::endl; - return ExecutionPlan(); - } 
- - size_t op54_workspace_size = 0; - size_t op54_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op54, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op54_workspace_size, &op54_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op54_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #54" << std::endl; - return ExecutionPlan(); - } - - size_t op55_workspace_size = 0; - size_t op55_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op55, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op55_workspace_size, &op55_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op55_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #55" << std::endl; - return ExecutionPlan(); - } - - size_t op56_workspace_size = 0; - size_t op56_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op56, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op56_workspace_size, &op56_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op56_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #56" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 160 }; - const size_t b_shape[] = { 1, 7, 7, 160 }; - status = xnn_reshape_add_nd_qs8( - op57, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #57" << std::endl; - return ExecutionPlan(); - } - - size_t op58_workspace_size = 0; - 
size_t op58_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op58, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op58_workspace_size, &op58_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op58_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #58" << std::endl; - return ExecutionPlan(); - } - - size_t op59_workspace_size = 0; - size_t op59_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op59, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op59_workspace_size, &op59_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op59_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #59" << std::endl; - return ExecutionPlan(); - } - - size_t op60_workspace_size = 0; - size_t op60_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op60, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op60_workspace_size, &op60_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op60_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #60" << std::endl; - return ExecutionPlan(); - } - - size_t op61_workspace_size = 0; - size_t op61_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op61, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op61_workspace_size, &op61_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, 
op61_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #61" << std::endl; - return ExecutionPlan(); - } - - size_t op62_workspace_size = 0; - size_t op62_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qs8( - op62, - /*batch_size=*/1, 49 /* width */, - 1280 /* channels */, 1280 /* input stride */, 1280 /* output stride */, - &op62_workspace_size, &op62_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op62_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #62" << std::endl; - return ExecutionPlan(); - } - - size_t op63_workspace_size = 0; - size_t op63_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8_qc8w( - op63, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op63_workspace_size, &op63_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op63_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #63" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op0, - workspace.data(), /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op1, - workspace.data(), /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op2, - workspace.data(), /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - 
return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op3, - workspace.data(), /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op4, - workspace.data(), /*input=*/v4.data(), /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op5, - workspace.data(), /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op6, - workspace.data(), /*input=*/v6.data(), /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op7, - workspace.data(), /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op8, - workspace.data(), /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qs8( - op9, - v9.data() /* a */, v6.data() /* b */, /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op10, - workspace.data(), /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - 
return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op11, - workspace.data(), /*input=*/v11.data(), /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op12, - workspace.data(), /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op13, - workspace.data(), /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op14, - workspace.data(), /*input=*/v14.data(), /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op15, - workspace.data(), /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qs8( - op16, - v16.data() /* a */, v13.data() /* b */, /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op17, - workspace.data(), /*input=*/v17.data(), /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op18, - workspace.data(), /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup 
operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op19, - workspace.data(), /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qs8( - op20, - v20.data() /* a */, v17.data() /* b */, /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op21, - workspace.data(), /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op22, - workspace.data(), /*input=*/v22.data(), /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op23, - workspace.data(), /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op24, - workspace.data(), /*input=*/v24.data(), /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op25, - workspace.data(), /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op26, - workspace.data(), /*input=*/v26.data(), /*output=*/v27.data()); - if (status != xnn_status_success) { - 
std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qs8( - op27, - v27.data() /* a */, v24.data() /* b */, /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op28, - workspace.data(), /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op29, - workspace.data(), /*input=*/v29.data(), /*output=*/v30.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op30, - workspace.data(), /*input=*/v30.data(), /*output=*/v31.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #30" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qs8( - op31, - v31.data() /* a */, v28.data() /* b */, /*output=*/v32.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op32, - workspace.data(), /*input=*/v32.data(), /*output=*/v33.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #32" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op33, - workspace.data(), /*input=*/v33.data(), /*output=*/v34.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #33" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op34, - workspace.data(), /*input=*/v34.data(), /*output=*/v35.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qs8( - op35, - v35.data() /* a */, v32.data() /* b */, /*output=*/v36.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op36, - workspace.data(), /*input=*/v36.data(), /*output=*/v37.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op37, - workspace.data(), /*input=*/v37.data(), /*output=*/v38.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op38, - workspace.data(), /*input=*/v38.data(), /*output=*/v39.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op39, - workspace.data(), /*input=*/v39.data(), /*output=*/v40.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op40, - workspace.data(), /*input=*/v40.data(), /*output=*/v41.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op41, - workspace.data(), /*input=*/v41.data(), /*output=*/v42.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #41" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qs8( - op42, - v42.data() /* a */, v39.data() /* b */, /*output=*/v43.data()); - if 
(status != xnn_status_success) { - std::cerr << "failed to setup operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op43, - workspace.data(), /*input=*/v43.data(), /*output=*/v44.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op44, - workspace.data(), /*input=*/v44.data(), /*output=*/v45.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op45, - workspace.data(), /*input=*/v45.data(), /*output=*/v46.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #45" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qs8( - op46, - v46.data() /* a */, v43.data() /* b */, /*output=*/v47.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op47, - workspace.data(), /*input=*/v47.data(), /*output=*/v48.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op48, - workspace.data(), /*input=*/v48.data(), /*output=*/v49.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op49, - workspace.data(), /*input=*/v49.data(), /*output=*/v50.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #49" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op50, - workspace.data(), /*input=*/v50.data(), 
/*output=*/v51.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #50" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op51, - workspace.data(), /*input=*/v51.data(), /*output=*/v52.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op52, - workspace.data(), /*input=*/v52.data(), /*output=*/v53.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qs8( - op53, - v53.data() /* a */, v50.data() /* b */, /*output=*/v54.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op54, - workspace.data(), /*input=*/v54.data(), /*output=*/v55.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op55, - workspace.data(), /*input=*/v55.data(), /*output=*/v56.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op56, - workspace.data(), /*input=*/v56.data(), /*output=*/v57.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #56" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qs8( - op57, - v57.data() /* a */, v54.data() /* b */, /*output=*/v58.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op58, - workspace.data(), 
/*input=*/v58.data(), /*output=*/v59.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op59, - workspace.data(), /*input=*/v59.data(), /*output=*/v60.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #59" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op60, - workspace.data(), /*input=*/v60.data(), /*output=*/v61.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op61, - workspace.data(), /*input=*/v61.data(), /*output=*/v62.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #61" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qs8( - op62, - workspace.data(), - /*input=*/v62.data(), /*output=*/v63.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #62" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qs8_qc8w( - op63, - workspace.data(), /*input=*/v63.data(), /*output=*/v64.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #63" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -} // namespace models diff --git a/models/qu8-mobilenet-v1.cc b/models/qu8-mobilenet-v1.cc deleted file mode 100644 index f382313588a..00000000000 --- a/models/qu8-mobilenet-v1.cc +++ /dev/null @@ -1,1774 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE 
file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! - -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan QU8MobileNetV1(pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static std::array v30; - alignas(16) static std::array v31; - alignas(16) static std::array w32; - alignas(16) static std::array w33; - alignas(16) static std::array w34; - alignas(16) static std::array w35; - alignas(16) static std::array w36; - alignas(16) static std::array w37; - alignas(16) static std::array w38; - alignas(16) static std::array w39; - alignas(16) static std::array w40; - alignas(16) static std::array w41; - alignas(16) static std::array w42; - alignas(16) static std::array w43; - alignas(16) static std::array 
w44; - alignas(16) static std::array w45; - alignas(16) static std::array w46; - alignas(16) static std::array w47; - alignas(16) static std::array w48; - alignas(16) static std::array w49; - alignas(16) static std::array w50; - alignas(16) static std::array w51; - alignas(16) static std::array w52; - alignas(16) static std::array w53; - alignas(16) static std::array w54; - alignas(16) static std::array w55; - alignas(16) static std::array w56; - alignas(16) static std::array w57; - alignas(16) static std::array w58; - alignas(16) static std::array w59; - alignas(16) static std::array w60; - alignas(16) static std::array w61; - alignas(16) static std::array w62; - alignas(16) static std::array w63; - alignas(16) static std::array w64; - alignas(16) static std::array w65; - alignas(16) static std::array w66; - alignas(16) static std::array w67; - alignas(16) static std::array w68; - alignas(16) static std::array w69; - alignas(16) static std::array w70; - alignas(16) static std::array w71; - alignas(16) static std::array w72; - alignas(16) static std::array w73; - alignas(16) static std::array w74; - alignas(16) static std::array w75; - alignas(16) static std::array w76; - alignas(16) static std::array w77; - alignas(16) static std::array w78; - alignas(16) static std::array w79; - alignas(16) static std::array w80; - alignas(16) static std::array w81; - alignas(16) static std::array w82; - alignas(16) static std::array w83; - alignas(16) static std::array w84; - alignas(16) static std::array w85; - alignas(16) static std::array w86; - alignas(16) static std::array w87; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto qu8rng = std::bind(std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()), std::ref(rng)); - auto qs32rng = std::bind(std::uniform_int_distribution(-10000, 10000), std::ref(rng)); - std::generate(v0.begin(), v0.end(), std::ref(qu8rng)); - std::generate(v1.begin(), v1.end(), 
std::ref(qu8rng)); - std::generate(v2.begin(), v2.end(), std::ref(qu8rng)); - std::generate(v3.begin(), v3.end(), std::ref(qu8rng)); - std::generate(v4.begin(), v4.end(), std::ref(qu8rng)); - std::generate(v5.begin(), v5.end(), std::ref(qu8rng)); - std::generate(v6.begin(), v6.end(), std::ref(qu8rng)); - std::generate(v7.begin(), v7.end(), std::ref(qu8rng)); - std::generate(v8.begin(), v8.end(), std::ref(qu8rng)); - std::generate(v9.begin(), v9.end(), std::ref(qu8rng)); - std::generate(v10.begin(), v10.end(), std::ref(qu8rng)); - std::generate(v11.begin(), v11.end(), std::ref(qu8rng)); - std::generate(v12.begin(), v12.end(), std::ref(qu8rng)); - std::generate(v13.begin(), v13.end(), std::ref(qu8rng)); - std::generate(v14.begin(), v14.end(), std::ref(qu8rng)); - std::generate(v15.begin(), v15.end(), std::ref(qu8rng)); - std::generate(v16.begin(), v16.end(), std::ref(qu8rng)); - std::generate(v17.begin(), v17.end(), std::ref(qu8rng)); - std::generate(v18.begin(), v18.end(), std::ref(qu8rng)); - std::generate(v19.begin(), v19.end(), std::ref(qu8rng)); - std::generate(v20.begin(), v20.end(), std::ref(qu8rng)); - std::generate(v21.begin(), v21.end(), std::ref(qu8rng)); - std::generate(v22.begin(), v22.end(), std::ref(qu8rng)); - std::generate(v23.begin(), v23.end(), std::ref(qu8rng)); - std::generate(v24.begin(), v24.end(), std::ref(qu8rng)); - std::generate(v25.begin(), v25.end(), std::ref(qu8rng)); - std::generate(v26.begin(), v26.end(), std::ref(qu8rng)); - std::generate(v27.begin(), v27.end(), std::ref(qu8rng)); - std::generate(v28.begin(), v28.end(), std::ref(qu8rng)); - std::generate(v29.begin(), v29.end(), std::ref(qu8rng)); - std::generate(v30.begin(), v30.end(), std::ref(qu8rng)); - std::generate(v31.begin(), v31.end(), std::ref(qu8rng)); - std::generate(w32.begin(), w32.end(), std::ref(qu8rng)); - std::generate(w33.begin(), w33.end(), std::ref(qs32rng)); - std::generate(w34.begin(), w34.end(), std::ref(qu8rng)); - std::generate(w35.begin(), w35.end(), 
std::ref(qs32rng)); - std::generate(w36.begin(), w36.end(), std::ref(qu8rng)); - std::generate(w37.begin(), w37.end(), std::ref(qs32rng)); - std::generate(w38.begin(), w38.end(), std::ref(qu8rng)); - std::generate(w39.begin(), w39.end(), std::ref(qs32rng)); - std::generate(w40.begin(), w40.end(), std::ref(qu8rng)); - std::generate(w41.begin(), w41.end(), std::ref(qs32rng)); - std::generate(w42.begin(), w42.end(), std::ref(qu8rng)); - std::generate(w43.begin(), w43.end(), std::ref(qs32rng)); - std::generate(w44.begin(), w44.end(), std::ref(qu8rng)); - std::generate(w45.begin(), w45.end(), std::ref(qs32rng)); - std::generate(w46.begin(), w46.end(), std::ref(qu8rng)); - std::generate(w47.begin(), w47.end(), std::ref(qs32rng)); - std::generate(w48.begin(), w48.end(), std::ref(qu8rng)); - std::generate(w49.begin(), w49.end(), std::ref(qs32rng)); - std::generate(w50.begin(), w50.end(), std::ref(qu8rng)); - std::generate(w51.begin(), w51.end(), std::ref(qs32rng)); - std::generate(w52.begin(), w52.end(), std::ref(qu8rng)); - std::generate(w53.begin(), w53.end(), std::ref(qs32rng)); - std::generate(w54.begin(), w54.end(), std::ref(qu8rng)); - std::generate(w55.begin(), w55.end(), std::ref(qs32rng)); - std::generate(w56.begin(), w56.end(), std::ref(qu8rng)); - std::generate(w57.begin(), w57.end(), std::ref(qs32rng)); - std::generate(w58.begin(), w58.end(), std::ref(qu8rng)); - std::generate(w59.begin(), w59.end(), std::ref(qs32rng)); - std::generate(w60.begin(), w60.end(), std::ref(qu8rng)); - std::generate(w61.begin(), w61.end(), std::ref(qs32rng)); - std::generate(w62.begin(), w62.end(), std::ref(qu8rng)); - std::generate(w63.begin(), w63.end(), std::ref(qs32rng)); - std::generate(w64.begin(), w64.end(), std::ref(qu8rng)); - std::generate(w65.begin(), w65.end(), std::ref(qs32rng)); - std::generate(w66.begin(), w66.end(), std::ref(qu8rng)); - std::generate(w67.begin(), w67.end(), std::ref(qs32rng)); - std::generate(w68.begin(), w68.end(), std::ref(qu8rng)); - 
std::generate(w69.begin(), w69.end(), std::ref(qs32rng)); - std::generate(w70.begin(), w70.end(), std::ref(qu8rng)); - std::generate(w71.begin(), w71.end(), std::ref(qs32rng)); - std::generate(w72.begin(), w72.end(), std::ref(qu8rng)); - std::generate(w73.begin(), w73.end(), std::ref(qs32rng)); - std::generate(w74.begin(), w74.end(), std::ref(qu8rng)); - std::generate(w75.begin(), w75.end(), std::ref(qs32rng)); - std::generate(w76.begin(), w76.end(), std::ref(qu8rng)); - std::generate(w77.begin(), w77.end(), std::ref(qs32rng)); - std::generate(w78.begin(), w78.end(), std::ref(qu8rng)); - std::generate(w79.begin(), w79.end(), std::ref(qs32rng)); - std::generate(w80.begin(), w80.end(), std::ref(qu8rng)); - std::generate(w81.begin(), w81.end(), std::ref(qs32rng)); - std::generate(w82.begin(), w82.end(), std::ref(qu8rng)); - std::generate(w83.begin(), w83.end(), std::ref(qs32rng)); - std::generate(w84.begin(), w84.end(), std::ref(qu8rng)); - std::generate(w85.begin(), w85.end(), std::ref(qs32rng)); - std::generate(w86.begin(), w86.end(), std::ref(qu8rng)); - std::generate(w87.begin(), w87.end(), std::ref(qs32rng)); - - Operators operators; - xnn_status status; - xnn_code_cache* code_cache_ptr = nullptr; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/3, - /*group_output_channels=*/32, - /*input_channel_stride=*/3, - /*output_channel_stride=*/32, - /*input_zero_point=*/(uint8_t) 128, - /*input_scale=*/0.0078125, - /*kernel_zero_point=*/(uint8_t) 151, - /*kernel_scale=*/0.02182667888700962, - /*kernel=*/w32.data(), /*bias=*/w33.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - 
/*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/32, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/32, - /*output_channel_stride=*/32, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 110, - /*kernel_scale=*/0.29219913482666016, - /*kernel=*/w34.data(), /*bias=*/w35.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/32, - /*group_output_channels=*/64, - /*input_channel_stride=*/32, - /*output_channel_stride=*/64, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 121, - 
/*kernel_scale=*/0.030420949682593346, - /*kernel=*/w36.data(), /*bias=*/w37.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/64, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/64, - /*output_channel_stride=*/64, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 130, - /*kernel_scale=*/0.40277284383773804, - /*kernel=*/w38.data(), /*bias=*/w39.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/64, - /*group_output_channels=*/128, - 
/*input_channel_stride=*/64, - /*output_channel_stride=*/128, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 104, - /*kernel_scale=*/0.015148180536925793, - /*kernel=*/w40.data(), /*bias=*/w41.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/128, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/128, - /*output_channel_stride=*/128, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 160, - /*kernel_scale=*/0.06053730100393295, - /*kernel=*/w42.data(), /*bias=*/w43.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - 
/*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/128, - /*group_output_channels=*/128, - /*input_channel_stride=*/128, - /*output_channel_stride=*/128, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 94, - /*kernel_scale=*/0.013755458407104015, - /*kernel=*/w44.data(), /*bias=*/w45.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/128, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/128, - /*output_channel_stride=*/128, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 123, - /*kernel_scale=*/0.01675807684659958, - /*kernel=*/w46.data(), /*bias=*/w47.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = 
xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/128, - /*group_output_channels=*/256, - /*input_channel_stride=*/128, - /*output_channel_stride=*/256, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 151, - /*kernel_scale=*/0.007601846940815449, - /*kernel=*/w48.data(), /*bias=*/w49.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/256, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/256, - /*output_channel_stride=*/256, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 129, - /*kernel_scale=*/0.04105526953935623, - /*kernel=*/w50.data(), /*bias=*/w51.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op9); - if (status != xnn_status_success) { - std::cerr << 
"failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/256, - /*group_output_channels=*/256, - /*input_channel_stride=*/256, - /*output_channel_stride=*/256, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 122, - /*kernel_scale=*/0.006431614048779011, - /*kernel=*/w52.data(), /*bias=*/w53.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/256, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/256, - /*output_channel_stride=*/256, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 122, - /*kernel_scale=*/0.013460792601108551, - /*kernel=*/w54.data(), /*bias=*/w55.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - 
/*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/256, - /*group_output_channels=*/512, - /*input_channel_stride=*/256, - /*output_channel_stride=*/512, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 109, - /*kernel_scale=*/0.00917122047394514, - /*kernel=*/w56.data(), /*bias=*/w57.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/512, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - 
/*kernel_zero_point=*/(uint8_t) 132, - /*kernel_scale=*/0.036934755742549896, - /*kernel=*/w58.data(), /*bias=*/w59.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/512, - /*group_output_channels=*/512, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 140, - /*kernel_scale=*/0.005300046876072884, - /*kernel=*/w60.data(), /*bias=*/w61.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/512, - 
/*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 94, - /*kernel_scale=*/0.042609862983226776, - /*kernel=*/w62.data(), /*bias=*/w63.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/512, - /*group_output_channels=*/512, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 127, - /*kernel_scale=*/0.0049632852897048, - /*kernel=*/w64.data(), /*bias=*/w65.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, 
/*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/512, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 127, - /*kernel_scale=*/0.028358859941363335, - /*kernel=*/w66.data(), /*bias=*/w67.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/512, - /*group_output_channels=*/512, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 89, - /*kernel_scale=*/0.007770895957946777, - /*kernel=*/w68.data(), /*bias=*/w69.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, 
xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/512, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 134, - /*kernel_scale=*/0.024329448118805885, - /*kernel=*/w70.data(), /*bias=*/w71.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/512, - /*group_output_channels=*/512, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 99, - /*kernel_scale=*/0.009658650495111942, - /*kernel=*/w72.data(), /*bias=*/w73.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - 
/*weights_cache=*/nullptr, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/512, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 106, - /*kernel_scale=*/0.019366811960935593, - /*kernel=*/w74.data(), /*bias=*/w75.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/512, - /*group_output_channels=*/512, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 153, - /*kernel_scale=*/0.005446993745863438, - /*kernel=*/w76.data(), /*bias=*/w77.data(), - 
/*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/512, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/512, - /*output_channel_stride=*/512, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 126, - /*kernel_scale=*/0.007835594937205315, - /*kernel=*/w78.data(), /*bias=*/w79.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/512, - /*group_output_channels=*/1024, - /*input_channel_stride=*/512, - /*output_channel_stride=*/1024, - 
/*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 130, - /*kernel_scale=*/0.00817922968417406, - /*kernel=*/w80.data(), /*bias=*/w81.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1024, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/1024, - /*output_channel_stride=*/1024, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 211, - /*kernel_scale=*/0.12616927921772003, - /*kernel=*/w82.data(), /*bias=*/w83.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - 
/*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/1024, - /*group_output_channels=*/1024, - /*input_channel_stride=*/1024, - /*output_channel_stride=*/1024, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 95, - /*kernel_scale=*/0.018048152327537537, - /*kernel=*/w84.data(), /*bias=*/w85.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 0 /* input zero point */, 0.023528477177023888 /* input scale */, - 0 /* output zero point */, 0.023528477177023888 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/1024, - /*group_output_channels=*/1001, - /*input_channel_stride=*/1024, - /*output_channel_stride=*/1001, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 74, - /*kernel_scale=*/0.004986600950360298, - /*kernel=*/w86.data(), /*bias=*/w87.data(), - 
/*output_zero_point=*/(uint8_t) 66, - /*output_scale=*/0.16609922051429749, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - xnn_operator_t op29 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op29); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #29" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op29, xnn_delete_operator); - - xnn_operator_t op30 = nullptr; - status = xnn_create_softmax_nc_qu8( - /*input_scale=*/0.16609922051429749, - /*output_zero_point=*/0, - /*output_scale=*/0.00390625, - /*flags=*/0, - &op30); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #30" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op30, xnn_delete_operator); - - size_t op0_workspace_size = 0; - size_t op0_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - &op0_workspace_size, &op0_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op0_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - size_t op1_workspace_size = 0; - size_t op1_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op1, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op1_workspace_size, &op1_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, 
op1_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - size_t op2_workspace_size = 0; - size_t op2_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op2_workspace_size, &op2_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op2_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #2" << std::endl; - return ExecutionPlan(); - } - - size_t op3_workspace_size = 0; - size_t op3_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op3, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op3_workspace_size, &op3_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op3_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - size_t op4_workspace_size = 0; - size_t op4_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op4, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op4_workspace_size, &op4_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op4_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - size_t op5_workspace_size = 0; - size_t op5_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op5, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op5_workspace_size, &op5_workspace_alignment, - 
/*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op5_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - size_t op6_workspace_size = 0; - size_t op6_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op6, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op6_workspace_size, &op6_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op6_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - size_t op7_workspace_size = 0; - size_t op7_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op7_workspace_size, &op7_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op7_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - size_t op8_workspace_size = 0; - size_t op8_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op8, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op8_workspace_size, &op8_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op8_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - size_t op9_workspace_size = 0; - size_t op9_workspace_alignment = 0; - status = 
xnn_reshape_convolution2d_nhwc_qu8( - op9, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op9_workspace_size, &op9_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op9_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - size_t op10_workspace_size = 0; - size_t op10_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op10, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op10_workspace_size, &op10_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op10_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - size_t op11_workspace_size = 0; - size_t op11_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op11, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op11_workspace_size, &op11_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op11_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - size_t op12_workspace_size = 0; - size_t op12_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op12, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op12_workspace_size, &op12_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op12_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed 
to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - size_t op13_workspace_size = 0; - size_t op13_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op13, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op13_workspace_size, &op13_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op13_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - size_t op14_workspace_size = 0; - size_t op14_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op14, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op14_workspace_size, &op14_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op14_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - size_t op15_workspace_size = 0; - size_t op15_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op15, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op15_workspace_size, &op15_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op15_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - size_t op16_workspace_size = 0; - size_t op16_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op16, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op16_workspace_size, &op16_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - 
/*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op16_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - size_t op17_workspace_size = 0; - size_t op17_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op17, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op17_workspace_size, &op17_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op17_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - size_t op18_workspace_size = 0; - size_t op18_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op18, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op18_workspace_size, &op18_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op18_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - size_t op19_workspace_size = 0; - size_t op19_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op19, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op19_workspace_size, &op19_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op19_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - size_t op20_workspace_size = 0; - size_t op20_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op20, - 
/*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op20_workspace_size, &op20_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op20_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - size_t op21_workspace_size = 0; - size_t op21_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op21, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op21_workspace_size, &op21_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op21_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - size_t op22_workspace_size = 0; - size_t op22_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op22, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op22_workspace_size, &op22_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op22_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - size_t op23_workspace_size = 0; - size_t op23_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op23, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op23_workspace_size, &op23_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op23_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - 
return ExecutionPlan(); - } - - size_t op24_workspace_size = 0; - size_t op24_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op24, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op24_workspace_size, &op24_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op24_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - size_t op25_workspace_size = 0; - size_t op25_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op25, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op25_workspace_size, &op25_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op25_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - size_t op26_workspace_size = 0; - size_t op26_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op26, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op26_workspace_size, &op26_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op26_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - size_t op27_workspace_size = 0; - size_t op27_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op27, - /*batch_size=*/1, 49 /* width */, - 1024 /* channels */, 1024 /* input stride */, 1024 /* output stride */, - &op27_workspace_size, &op27_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = 
std::max(max_workspace_size, op27_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - size_t op28_workspace_size = 0; - size_t op28_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op28, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op28_workspace_size, &op28_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op28_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op29, - /*batch_size=*/1001, - 1 /* channels */, - 1 /* input stride */, - 1 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_softmax_nc_qu8( - op30, - /*channels=*/1001, - /*input_stride=*/1001, - /*output_stride=*/1001, - /*batch_size=*/1, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #30" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nhwc_qu8( - op0, - workspace.data(), /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op1, - workspace.data(), /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op2, - workspace.data(), /*input=*/v2.data(), /*output=*/v3.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op3, - workspace.data(), /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op4, - workspace.data(), /*input=*/v4.data(), /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op5, - workspace.data(), /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op6, - workspace.data(), /*input=*/v6.data(), /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op7, - workspace.data(), /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op8, - workspace.data(), /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op9, - workspace.data(), /*input=*/v9.data(), /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op10, - workspace.data(), /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - 
std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op11, - workspace.data(), /*input=*/v11.data(), /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op12, - workspace.data(), /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op13, - workspace.data(), /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op14, - workspace.data(), /*input=*/v14.data(), /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op15, - workspace.data(), /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op16, - workspace.data(), /*input=*/v16.data(), /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op17, - workspace.data(), /*input=*/v17.data(), /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op18, - workspace.data(), /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - 
std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op19, - workspace.data(), /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op20, - workspace.data(), /*input=*/v20.data(), /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op21, - workspace.data(), /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op22, - workspace.data(), /*input=*/v22.data(), /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op23, - workspace.data(), /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op24, - workspace.data(), /*input=*/v24.data(), /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op25, - workspace.data(), /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op26, - workspace.data(), /*input=*/v26.data(), /*output=*/v27.data()); - if (status != xnn_status_success) { - 
std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op27, - workspace.data(), - /*input=*/v27.data(), /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op28, - workspace.data(), /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op29, - /*input=*/v29.data(), /*output=*/v30.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_softmax_nc_qu8( - op30, - /*input=*/v30.data(), /*output=*/v31.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #30" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -} // namespace models diff --git a/models/qu8-mobilenet-v2.cc b/models/qu8-mobilenet-v2.cc deleted file mode 100644 index 37cef520685..00000000000 --- a/models/qu8-mobilenet-v2.cc +++ /dev/null @@ -1,3559 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
- -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan QU8MobileNetV2(pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static std::array v30; - alignas(16) static std::array v31; - alignas(16) static std::array v32; - alignas(16) static std::array v33; - alignas(16) static std::array v34; - alignas(16) static std::array v35; - alignas(16) static std::array v36; - alignas(16) static std::array v37; - alignas(16) static std::array v38; - alignas(16) static std::array v39; - alignas(16) static std::array v40; - alignas(16) static std::array v41; - alignas(16) static std::array v42; - alignas(16) static std::array v43; - alignas(16) static std::array v44; - alignas(16) static std::array v45; - alignas(16) static std::array v46; - alignas(16) 
static std::array v47; - alignas(16) static std::array v48; - alignas(16) static std::array v49; - alignas(16) static std::array v50; - alignas(16) static std::array v51; - alignas(16) static std::array v52; - alignas(16) static std::array v53; - alignas(16) static std::array v54; - alignas(16) static std::array v55; - alignas(16) static std::array v56; - alignas(16) static std::array v57; - alignas(16) static std::array v58; - alignas(16) static std::array v59; - alignas(16) static std::array v60; - alignas(16) static std::array v61; - alignas(16) static std::array v62; - alignas(16) static std::array v63; - alignas(16) static std::array v64; - alignas(16) static std::array v65; - alignas(16) static std::array w66; - alignas(16) static std::array w67; - alignas(16) static std::array w68; - alignas(16) static std::array w69; - alignas(16) static std::array w70; - alignas(16) static std::array w71; - alignas(16) static std::array w72; - alignas(16) static std::array w73; - alignas(16) static std::array w74; - alignas(16) static std::array w75; - alignas(16) static std::array w76; - alignas(16) static std::array w77; - alignas(16) static std::array w78; - alignas(16) static std::array w79; - alignas(16) static std::array w80; - alignas(16) static std::array w81; - alignas(16) static std::array w82; - alignas(16) static std::array w83; - alignas(16) static std::array w84; - alignas(16) static std::array w85; - alignas(16) static std::array w86; - alignas(16) static std::array w87; - alignas(16) static std::array w88; - alignas(16) static std::array w89; - alignas(16) static std::array w90; - alignas(16) static std::array w91; - alignas(16) static std::array w92; - alignas(16) static std::array w93; - alignas(16) static std::array w94; - alignas(16) static std::array w95; - alignas(16) static std::array w96; - alignas(16) static std::array w97; - alignas(16) static std::array w98; - alignas(16) static std::array w99; - alignas(16) static std::array w100; - alignas(16) 
static std::array w101; - alignas(16) static std::array w102; - alignas(16) static std::array w103; - alignas(16) static std::array w104; - alignas(16) static std::array w105; - alignas(16) static std::array w106; - alignas(16) static std::array w107; - alignas(16) static std::array w108; - alignas(16) static std::array w109; - alignas(16) static std::array w110; - alignas(16) static std::array w111; - alignas(16) static std::array w112; - alignas(16) static std::array w113; - alignas(16) static std::array w114; - alignas(16) static std::array w115; - alignas(16) static std::array w116; - alignas(16) static std::array w117; - alignas(16) static std::array w118; - alignas(16) static std::array w119; - alignas(16) static std::array w120; - alignas(16) static std::array w121; - alignas(16) static std::array w122; - alignas(16) static std::array w123; - alignas(16) static std::array w124; - alignas(16) static std::array w125; - alignas(16) static std::array w126; - alignas(16) static std::array w127; - alignas(16) static std::array w128; - alignas(16) static std::array w129; - alignas(16) static std::array w130; - alignas(16) static std::array w131; - alignas(16) static std::array w132; - alignas(16) static std::array w133; - alignas(16) static std::array w134; - alignas(16) static std::array w135; - alignas(16) static std::array w136; - alignas(16) static std::array w137; - alignas(16) static std::array w138; - alignas(16) static std::array w139; - alignas(16) static std::array w140; - alignas(16) static std::array w141; - alignas(16) static std::array w142; - alignas(16) static std::array w143; - alignas(16) static std::array w144; - alignas(16) static std::array w145; - alignas(16) static std::array w146; - alignas(16) static std::array w147; - alignas(16) static std::array w148; - alignas(16) static std::array w149; - alignas(16) static std::array w150; - alignas(16) static std::array w151; - alignas(16) static std::array w152; - alignas(16) static std::array w153; 
- alignas(16) static std::array w154; - alignas(16) static std::array w155; - alignas(16) static std::array w156; - alignas(16) static std::array w157; - alignas(16) static std::array w158; - alignas(16) static std::array w159; - alignas(16) static std::array w160; - alignas(16) static std::array w161; - alignas(16) static std::array w162; - alignas(16) static std::array w163; - alignas(16) static std::array w164; - alignas(16) static std::array w165; - alignas(16) static std::array w166; - alignas(16) static std::array w167; - alignas(16) static std::array w168; - alignas(16) static std::array w169; - alignas(16) static std::array w170; - alignas(16) static std::array w171; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto qu8rng = std::bind(std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()), std::ref(rng)); - auto qs32rng = std::bind(std::uniform_int_distribution(-10000, 10000), std::ref(rng)); - std::generate(v0.begin(), v0.end(), std::ref(qu8rng)); - std::generate(v1.begin(), v1.end(), std::ref(qu8rng)); - std::generate(v2.begin(), v2.end(), std::ref(qu8rng)); - std::generate(v3.begin(), v3.end(), std::ref(qu8rng)); - std::generate(v4.begin(), v4.end(), std::ref(qu8rng)); - std::generate(v5.begin(), v5.end(), std::ref(qu8rng)); - std::generate(v6.begin(), v6.end(), std::ref(qu8rng)); - std::generate(v7.begin(), v7.end(), std::ref(qu8rng)); - std::generate(v8.begin(), v8.end(), std::ref(qu8rng)); - std::generate(v9.begin(), v9.end(), std::ref(qu8rng)); - std::generate(v10.begin(), v10.end(), std::ref(qu8rng)); - std::generate(v11.begin(), v11.end(), std::ref(qu8rng)); - std::generate(v12.begin(), v12.end(), std::ref(qu8rng)); - std::generate(v13.begin(), v13.end(), std::ref(qu8rng)); - std::generate(v14.begin(), v14.end(), std::ref(qu8rng)); - std::generate(v15.begin(), v15.end(), std::ref(qu8rng)); - std::generate(v16.begin(), v16.end(), std::ref(qu8rng)); - std::generate(v17.begin(), 
v17.end(), std::ref(qu8rng)); - std::generate(v18.begin(), v18.end(), std::ref(qu8rng)); - std::generate(v19.begin(), v19.end(), std::ref(qu8rng)); - std::generate(v20.begin(), v20.end(), std::ref(qu8rng)); - std::generate(v21.begin(), v21.end(), std::ref(qu8rng)); - std::generate(v22.begin(), v22.end(), std::ref(qu8rng)); - std::generate(v23.begin(), v23.end(), std::ref(qu8rng)); - std::generate(v24.begin(), v24.end(), std::ref(qu8rng)); - std::generate(v25.begin(), v25.end(), std::ref(qu8rng)); - std::generate(v26.begin(), v26.end(), std::ref(qu8rng)); - std::generate(v27.begin(), v27.end(), std::ref(qu8rng)); - std::generate(v28.begin(), v28.end(), std::ref(qu8rng)); - std::generate(v29.begin(), v29.end(), std::ref(qu8rng)); - std::generate(v30.begin(), v30.end(), std::ref(qu8rng)); - std::generate(v31.begin(), v31.end(), std::ref(qu8rng)); - std::generate(v32.begin(), v32.end(), std::ref(qu8rng)); - std::generate(v33.begin(), v33.end(), std::ref(qu8rng)); - std::generate(v34.begin(), v34.end(), std::ref(qu8rng)); - std::generate(v35.begin(), v35.end(), std::ref(qu8rng)); - std::generate(v36.begin(), v36.end(), std::ref(qu8rng)); - std::generate(v37.begin(), v37.end(), std::ref(qu8rng)); - std::generate(v38.begin(), v38.end(), std::ref(qu8rng)); - std::generate(v39.begin(), v39.end(), std::ref(qu8rng)); - std::generate(v40.begin(), v40.end(), std::ref(qu8rng)); - std::generate(v41.begin(), v41.end(), std::ref(qu8rng)); - std::generate(v42.begin(), v42.end(), std::ref(qu8rng)); - std::generate(v43.begin(), v43.end(), std::ref(qu8rng)); - std::generate(v44.begin(), v44.end(), std::ref(qu8rng)); - std::generate(v45.begin(), v45.end(), std::ref(qu8rng)); - std::generate(v46.begin(), v46.end(), std::ref(qu8rng)); - std::generate(v47.begin(), v47.end(), std::ref(qu8rng)); - std::generate(v48.begin(), v48.end(), std::ref(qu8rng)); - std::generate(v49.begin(), v49.end(), std::ref(qu8rng)); - std::generate(v50.begin(), v50.end(), std::ref(qu8rng)); - 
std::generate(v51.begin(), v51.end(), std::ref(qu8rng)); - std::generate(v52.begin(), v52.end(), std::ref(qu8rng)); - std::generate(v53.begin(), v53.end(), std::ref(qu8rng)); - std::generate(v54.begin(), v54.end(), std::ref(qu8rng)); - std::generate(v55.begin(), v55.end(), std::ref(qu8rng)); - std::generate(v56.begin(), v56.end(), std::ref(qu8rng)); - std::generate(v57.begin(), v57.end(), std::ref(qu8rng)); - std::generate(v58.begin(), v58.end(), std::ref(qu8rng)); - std::generate(v59.begin(), v59.end(), std::ref(qu8rng)); - std::generate(v60.begin(), v60.end(), std::ref(qu8rng)); - std::generate(v61.begin(), v61.end(), std::ref(qu8rng)); - std::generate(v62.begin(), v62.end(), std::ref(qu8rng)); - std::generate(v63.begin(), v63.end(), std::ref(qu8rng)); - std::generate(v64.begin(), v64.end(), std::ref(qu8rng)); - std::generate(v65.begin(), v65.end(), std::ref(qu8rng)); - std::generate(w66.begin(), w66.end(), std::ref(qu8rng)); - std::generate(w67.begin(), w67.end(), std::ref(qs32rng)); - std::generate(w68.begin(), w68.end(), std::ref(qu8rng)); - std::generate(w69.begin(), w69.end(), std::ref(qs32rng)); - std::generate(w70.begin(), w70.end(), std::ref(qu8rng)); - std::generate(w71.begin(), w71.end(), std::ref(qs32rng)); - std::generate(w72.begin(), w72.end(), std::ref(qu8rng)); - std::generate(w73.begin(), w73.end(), std::ref(qs32rng)); - std::generate(w74.begin(), w74.end(), std::ref(qu8rng)); - std::generate(w75.begin(), w75.end(), std::ref(qs32rng)); - std::generate(w76.begin(), w76.end(), std::ref(qu8rng)); - std::generate(w77.begin(), w77.end(), std::ref(qs32rng)); - std::generate(w78.begin(), w78.end(), std::ref(qu8rng)); - std::generate(w79.begin(), w79.end(), std::ref(qs32rng)); - std::generate(w80.begin(), w80.end(), std::ref(qu8rng)); - std::generate(w81.begin(), w81.end(), std::ref(qs32rng)); - std::generate(w82.begin(), w82.end(), std::ref(qu8rng)); - std::generate(w83.begin(), w83.end(), std::ref(qs32rng)); - std::generate(w84.begin(), w84.end(), 
std::ref(qu8rng)); - std::generate(w85.begin(), w85.end(), std::ref(qs32rng)); - std::generate(w86.begin(), w86.end(), std::ref(qu8rng)); - std::generate(w87.begin(), w87.end(), std::ref(qs32rng)); - std::generate(w88.begin(), w88.end(), std::ref(qu8rng)); - std::generate(w89.begin(), w89.end(), std::ref(qs32rng)); - std::generate(w90.begin(), w90.end(), std::ref(qu8rng)); - std::generate(w91.begin(), w91.end(), std::ref(qs32rng)); - std::generate(w92.begin(), w92.end(), std::ref(qu8rng)); - std::generate(w93.begin(), w93.end(), std::ref(qs32rng)); - std::generate(w94.begin(), w94.end(), std::ref(qu8rng)); - std::generate(w95.begin(), w95.end(), std::ref(qs32rng)); - std::generate(w96.begin(), w96.end(), std::ref(qu8rng)); - std::generate(w97.begin(), w97.end(), std::ref(qs32rng)); - std::generate(w98.begin(), w98.end(), std::ref(qu8rng)); - std::generate(w99.begin(), w99.end(), std::ref(qs32rng)); - std::generate(w100.begin(), w100.end(), std::ref(qu8rng)); - std::generate(w101.begin(), w101.end(), std::ref(qs32rng)); - std::generate(w102.begin(), w102.end(), std::ref(qu8rng)); - std::generate(w103.begin(), w103.end(), std::ref(qs32rng)); - std::generate(w104.begin(), w104.end(), std::ref(qu8rng)); - std::generate(w105.begin(), w105.end(), std::ref(qs32rng)); - std::generate(w106.begin(), w106.end(), std::ref(qu8rng)); - std::generate(w107.begin(), w107.end(), std::ref(qs32rng)); - std::generate(w108.begin(), w108.end(), std::ref(qu8rng)); - std::generate(w109.begin(), w109.end(), std::ref(qs32rng)); - std::generate(w110.begin(), w110.end(), std::ref(qu8rng)); - std::generate(w111.begin(), w111.end(), std::ref(qs32rng)); - std::generate(w112.begin(), w112.end(), std::ref(qu8rng)); - std::generate(w113.begin(), w113.end(), std::ref(qs32rng)); - std::generate(w114.begin(), w114.end(), std::ref(qu8rng)); - std::generate(w115.begin(), w115.end(), std::ref(qs32rng)); - std::generate(w116.begin(), w116.end(), std::ref(qu8rng)); - std::generate(w117.begin(), w117.end(), 
std::ref(qs32rng)); - std::generate(w118.begin(), w118.end(), std::ref(qu8rng)); - std::generate(w119.begin(), w119.end(), std::ref(qs32rng)); - std::generate(w120.begin(), w120.end(), std::ref(qu8rng)); - std::generate(w121.begin(), w121.end(), std::ref(qs32rng)); - std::generate(w122.begin(), w122.end(), std::ref(qu8rng)); - std::generate(w123.begin(), w123.end(), std::ref(qs32rng)); - std::generate(w124.begin(), w124.end(), std::ref(qu8rng)); - std::generate(w125.begin(), w125.end(), std::ref(qs32rng)); - std::generate(w126.begin(), w126.end(), std::ref(qu8rng)); - std::generate(w127.begin(), w127.end(), std::ref(qs32rng)); - std::generate(w128.begin(), w128.end(), std::ref(qu8rng)); - std::generate(w129.begin(), w129.end(), std::ref(qs32rng)); - std::generate(w130.begin(), w130.end(), std::ref(qu8rng)); - std::generate(w131.begin(), w131.end(), std::ref(qs32rng)); - std::generate(w132.begin(), w132.end(), std::ref(qu8rng)); - std::generate(w133.begin(), w133.end(), std::ref(qs32rng)); - std::generate(w134.begin(), w134.end(), std::ref(qu8rng)); - std::generate(w135.begin(), w135.end(), std::ref(qs32rng)); - std::generate(w136.begin(), w136.end(), std::ref(qu8rng)); - std::generate(w137.begin(), w137.end(), std::ref(qs32rng)); - std::generate(w138.begin(), w138.end(), std::ref(qu8rng)); - std::generate(w139.begin(), w139.end(), std::ref(qs32rng)); - std::generate(w140.begin(), w140.end(), std::ref(qu8rng)); - std::generate(w141.begin(), w141.end(), std::ref(qs32rng)); - std::generate(w142.begin(), w142.end(), std::ref(qu8rng)); - std::generate(w143.begin(), w143.end(), std::ref(qs32rng)); - std::generate(w144.begin(), w144.end(), std::ref(qu8rng)); - std::generate(w145.begin(), w145.end(), std::ref(qs32rng)); - std::generate(w146.begin(), w146.end(), std::ref(qu8rng)); - std::generate(w147.begin(), w147.end(), std::ref(qs32rng)); - std::generate(w148.begin(), w148.end(), std::ref(qu8rng)); - std::generate(w149.begin(), w149.end(), std::ref(qs32rng)); - 
std::generate(w150.begin(), w150.end(), std::ref(qu8rng)); - std::generate(w151.begin(), w151.end(), std::ref(qs32rng)); - std::generate(w152.begin(), w152.end(), std::ref(qu8rng)); - std::generate(w153.begin(), w153.end(), std::ref(qs32rng)); - std::generate(w154.begin(), w154.end(), std::ref(qu8rng)); - std::generate(w155.begin(), w155.end(), std::ref(qs32rng)); - std::generate(w156.begin(), w156.end(), std::ref(qu8rng)); - std::generate(w157.begin(), w157.end(), std::ref(qs32rng)); - std::generate(w158.begin(), w158.end(), std::ref(qu8rng)); - std::generate(w159.begin(), w159.end(), std::ref(qs32rng)); - std::generate(w160.begin(), w160.end(), std::ref(qu8rng)); - std::generate(w161.begin(), w161.end(), std::ref(qs32rng)); - std::generate(w162.begin(), w162.end(), std::ref(qu8rng)); - std::generate(w163.begin(), w163.end(), std::ref(qs32rng)); - std::generate(w164.begin(), w164.end(), std::ref(qu8rng)); - std::generate(w165.begin(), w165.end(), std::ref(qs32rng)); - std::generate(w166.begin(), w166.end(), std::ref(qu8rng)); - std::generate(w167.begin(), w167.end(), std::ref(qs32rng)); - std::generate(w168.begin(), w168.end(), std::ref(qu8rng)); - std::generate(w169.begin(), w169.end(), std::ref(qs32rng)); - std::generate(w170.begin(), w170.end(), std::ref(qu8rng)); - std::generate(w171.begin(), w171.end(), std::ref(qs32rng)); - - Operators operators; - xnn_status status; - xnn_code_cache* code_cache_ptr = nullptr; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/3, - /*group_output_channels=*/32, - /*input_channel_stride=*/3, - /*output_channel_stride=*/32, - /*input_zero_point=*/(uint8_t) 128, - 
/*input_scale=*/0.0078125, - /*kernel_zero_point=*/(uint8_t) 122, - /*kernel_scale=*/0.03396892547607422, - /*kernel=*/w66.data(), /*bias=*/w67.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/32, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/32, - /*output_channel_stride=*/32, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 165, - /*kernel_scale=*/0.3436955213546753, - /*kernel=*/w68.data(), /*bias=*/w69.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - 
/*group_input_channels=*/32, - /*group_output_channels=*/16, - /*input_channel_stride=*/32, - /*output_channel_stride=*/16, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 140, - /*kernel_scale=*/0.03737175464630127, - /*kernel=*/w70.data(), /*bias=*/w71.data(), - /*output_zero_point=*/(uint8_t) 129, - /*output_scale=*/0.35441333055496216, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/16, - /*group_output_channels=*/96, - /*input_channel_stride=*/16, - /*output_channel_stride=*/96, - /*input_zero_point=*/(uint8_t) 129, - /*input_scale=*/0.35441333055496216, - /*kernel_zero_point=*/(uint8_t) 127, - /*kernel_scale=*/0.009758507832884789, - /*kernel=*/w72.data(), /*bias=*/w73.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, 
/*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/96, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/96, - /*output_channel_stride=*/96, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 109, - /*kernel_scale=*/0.020969120785593987, - /*kernel=*/w74.data(), /*bias=*/w75.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/96, - /*group_output_channels=*/24, - /*input_channel_stride=*/96, - /*output_channel_stride=*/24, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 156, - /*kernel_scale=*/0.0225360207259655, - /*kernel=*/w76.data(), /*bias=*/w77.data(), - /*output_zero_point=*/(uint8_t) 119, - /*output_scale=*/0.2758343517780304, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - 
- xnn_operator_t op6 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/24, - /*group_output_channels=*/144, - /*input_channel_stride=*/24, - /*output_channel_stride=*/144, - /*input_zero_point=*/(uint8_t) 119, - /*input_scale=*/0.2758343517780304, - /*kernel_zero_point=*/(uint8_t) 144, - /*kernel_scale=*/0.0036556976847350597, - /*kernel=*/w78.data(), /*bias=*/w79.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/144, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/144, - /*output_channel_stride=*/144, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 52, - /*kernel_scale=*/0.16981913149356842, - /*kernel=*/w80.data(), /*bias=*/w81.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op7); - if (status != 
xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/144, - /*group_output_channels=*/24, - /*input_channel_stride=*/144, - /*output_channel_stride=*/24, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 122, - /*kernel_scale=*/0.02740888111293316, - /*kernel=*/w82.data(), /*bias=*/w83.data(), - /*output_zero_point=*/(uint8_t) 136, - /*output_scale=*/0.4014929533004761, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_add_nd_qu8( - 136 /* input1 zero point */, 0.4014929533004761 /* input1 scale */, - 119 /* input2 zero point */, 0.2758343517780304 /* input2 scale */, - 133 /* output zero point */, 0.43216896057128906 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, 
/*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/24, - /*group_output_channels=*/144, - /*input_channel_stride=*/24, - /*output_channel_stride=*/144, - /*input_zero_point=*/(uint8_t) 133, - /*input_scale=*/0.43216896057128906, - /*kernel_zero_point=*/(uint8_t) 104, - /*kernel_scale=*/0.0029988749884068966, - /*kernel=*/w84.data(), /*bias=*/w85.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/144, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/144, - /*output_channel_stride=*/144, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 143, - /*kernel_scale=*/0.017202870920300484, - /*kernel=*/w86.data(), /*bias=*/w87.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; 
- status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/144, - /*group_output_channels=*/32, - /*input_channel_stride=*/144, - /*output_channel_stride=*/32, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 111, - /*kernel_scale=*/0.016844693571329117, - /*kernel=*/w88.data(), /*bias=*/w89.data(), - /*output_zero_point=*/(uint8_t) 127, - /*output_scale=*/0.21836242079734802, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/32, - /*group_output_channels=*/192, - /*input_channel_stride=*/32, - /*output_channel_stride=*/192, - /*input_zero_point=*/(uint8_t) 127, - /*input_scale=*/0.21836242079734802, - /*kernel_zero_point=*/(uint8_t) 128, - /*kernel_scale=*/0.0019244228024035692, - /*kernel=*/w90.data(), /*bias=*/w91.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op13); - if (status != xnn_status_success) { 
- std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/192, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/192, - /*output_channel_stride=*/192, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 118, - /*kernel_scale=*/0.06525065749883652, - /*kernel=*/w92.data(), /*bias=*/w93.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/192, - /*group_output_channels=*/32, - /*input_channel_stride=*/192, - /*output_channel_stride=*/32, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 146, - /*kernel_scale=*/0.019062912091612816, - /*kernel=*/w94.data(), /*bias=*/w95.data(), - /*output_zero_point=*/(uint8_t) 121, - /*output_scale=*/0.2279418408870697, - 
/*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_add_nd_qu8( - 121 /* input1 zero point */, 0.2279418408870697 /* input1 scale */, - 127 /* input2 zero point */, 0.21836242079734802 /* input2 scale */, - 130 /* output zero point */, 0.25968998670578003 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/32, - /*group_output_channels=*/192, - /*input_channel_stride=*/32, - /*output_channel_stride=*/192, - /*input_zero_point=*/(uint8_t) 130, - /*input_scale=*/0.25968998670578003, - /*kernel_zero_point=*/(uint8_t) 135, - /*kernel_scale=*/0.0013649158645421267, - /*kernel=*/w96.data(), /*bias=*/w97.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = 
xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/192, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/192, - /*output_channel_stride=*/192, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 95, - /*kernel_scale=*/0.07909784466028214, - /*kernel=*/w98.data(), /*bias=*/w99.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/192, - /*group_output_channels=*/32, - /*input_channel_stride=*/192, - /*output_channel_stride=*/32, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 128, - /*kernel_scale=*/0.018293123692274094, - /*kernel=*/w100.data(), /*bias=*/w101.data(), - /*output_zero_point=*/(uint8_t) 124, - /*output_scale=*/0.25774890184402466, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op19); - if (status != xnn_status_success) { - std::cerr 
<< "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_add_nd_qu8( - 124 /* input1 zero point */, 0.25774890184402466 /* input1 scale */, - 130 /* input2 zero point */, 0.25968998670578003 /* input2 scale */, - 124 /* output zero point */, 0.331714928150177 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/32, - /*group_output_channels=*/192, - /*input_channel_stride=*/32, - /*output_channel_stride=*/192, - /*input_zero_point=*/(uint8_t) 124, - /*input_scale=*/0.331714928150177, - /*kernel_zero_point=*/(uint8_t) 127, - /*kernel_scale=*/0.0019170437008142471, - /*kernel=*/w102.data(), /*bias=*/w103.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - 
/*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/192, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/192, - /*output_channel_stride=*/192, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 127, - /*kernel_scale=*/0.010087885893881321, - /*kernel=*/w104.data(), /*bias=*/w105.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/192, - /*group_output_channels=*/64, - /*input_channel_stride=*/192, - /*output_channel_stride=*/64, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 147, - /*kernel_scale=*/0.014601286500692368, - /*kernel=*/w106.data(), /*bias=*/w107.data(), - /*output_zero_point=*/(uint8_t) 126, - /*output_scale=*/0.18540528416633606, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = 
xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/64, - /*group_output_channels=*/384, - /*input_channel_stride=*/64, - /*output_channel_stride=*/384, - /*input_zero_point=*/(uint8_t) 126, - /*input_scale=*/0.18540528416633606, - /*kernel_zero_point=*/(uint8_t) 125, - /*kernel_scale=*/0.0015538912266492844, - /*kernel=*/w108.data(), /*bias=*/w109.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/384, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/384, - /*output_channel_stride=*/384, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 110, - /*kernel_scale=*/0.06092711538076401, - /*kernel=*/w110.data(), /*bias=*/w111.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op25); - if (status != xnn_status_success) { - 
std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/384, - /*group_output_channels=*/64, - /*input_channel_stride=*/384, - /*output_channel_stride=*/64, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 124, - /*kernel_scale=*/0.016782939434051514, - /*kernel=*/w112.data(), /*bias=*/w113.data(), - /*output_zero_point=*/(uint8_t) 109, - /*output_scale=*/0.17263489961624146, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_add_nd_qu8( - 109 /* input1 zero point */, 0.17263489961624146 /* input1 scale */, - 126 /* input2 zero point */, 0.18540528416633606 /* input2 scale */, - 122 /* output zero point */, 0.18911026418209076 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, 
/*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/64, - /*group_output_channels=*/384, - /*input_channel_stride=*/64, - /*output_channel_stride=*/384, - /*input_zero_point=*/(uint8_t) 122, - /*input_scale=*/0.18911026418209076, - /*kernel_zero_point=*/(uint8_t) 134, - /*kernel_scale=*/0.0014702979242429137, - /*kernel=*/w114.data(), /*bias=*/w115.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - xnn_operator_t op29 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/384, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/384, - /*output_channel_stride=*/384, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 133, - /*kernel_scale=*/0.052407849580049515, - /*kernel=*/w116.data(), /*bias=*/w117.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op29); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #29" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op29, xnn_delete_operator); - - xnn_operator_t op30 = 
nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/384, - /*group_output_channels=*/64, - /*input_channel_stride=*/384, - /*output_channel_stride=*/64, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 125, - /*kernel_scale=*/0.012898261658847332, - /*kernel=*/w118.data(), /*bias=*/w119.data(), - /*output_zero_point=*/(uint8_t) 123, - /*output_scale=*/0.14715521037578583, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op30); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #30" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op30, xnn_delete_operator); - - xnn_operator_t op31 = nullptr; - status = xnn_create_add_nd_qu8( - 123 /* input1 zero point */, 0.14715521037578583 /* input1 scale */, - 122 /* input2 zero point */, 0.18911026418209076 /* input2 scale */, - 124 /* output zero point */, 0.1996811032295227 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op31); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #31" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op31, xnn_delete_operator); - - xnn_operator_t op32 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/64, - 
/*group_output_channels=*/384, - /*input_channel_stride=*/64, - /*output_channel_stride=*/384, - /*input_zero_point=*/(uint8_t) 124, - /*input_scale=*/0.1996811032295227, - /*kernel_zero_point=*/(uint8_t) 127, - /*kernel_scale=*/0.0013733493397012353, - /*kernel=*/w120.data(), /*bias=*/w121.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op32); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #32" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op32, xnn_delete_operator); - - xnn_operator_t op33 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/384, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/384, - /*output_channel_stride=*/384, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 155, - /*kernel_scale=*/0.04077887907624245, - /*kernel=*/w122.data(), /*bias=*/w123.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op33); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #33" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op33, xnn_delete_operator); - - xnn_operator_t op34 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - 
/*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/384, - /*group_output_channels=*/64, - /*input_channel_stride=*/384, - /*output_channel_stride=*/64, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 144, - /*kernel_scale=*/0.019561484456062317, - /*kernel=*/w124.data(), /*bias=*/w125.data(), - /*output_zero_point=*/(uint8_t) 122, - /*output_scale=*/0.15627601742744446, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op34); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #34" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op34, xnn_delete_operator); - - xnn_operator_t op35 = nullptr; - status = xnn_create_add_nd_qu8( - 122 /* input1 zero point */, 0.15627601742744446 /* input1 scale */, - 124 /* input2 zero point */, 0.1996811032295227 /* input2 scale */, - 120 /* output zero point */, 0.22027325630187988 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op35); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #35" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op35, xnn_delete_operator); - - xnn_operator_t op36 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/64, - /*group_output_channels=*/384, - /*input_channel_stride=*/64, - /*output_channel_stride=*/384, - /*input_zero_point=*/(uint8_t) 120, - /*input_scale=*/0.22027325630187988, - 
/*kernel_zero_point=*/(uint8_t) 131, - /*kernel_scale=*/0.0016282502328976989, - /*kernel=*/w126.data(), /*bias=*/w127.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op36); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #36" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op36, xnn_delete_operator); - - xnn_operator_t op37 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/384, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/384, - /*output_channel_stride=*/384, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 143, - /*kernel_scale=*/0.031107846647500992, - /*kernel=*/w128.data(), /*bias=*/w129.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op37); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #37" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op37, xnn_delete_operator); - - xnn_operator_t op38 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - 
/*group_input_channels=*/384, - /*group_output_channels=*/96, - /*input_channel_stride=*/384, - /*output_channel_stride=*/96, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 129, - /*kernel_scale=*/0.007436311338096857, - /*kernel=*/w130.data(), /*bias=*/w131.data(), - /*output_zero_point=*/(uint8_t) 129, - /*output_scale=*/0.17061053216457367, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op38); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #38" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op38, xnn_delete_operator); - - xnn_operator_t op39 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/96, - /*group_output_channels=*/576, - /*input_channel_stride=*/96, - /*output_channel_stride=*/576, - /*input_zero_point=*/(uint8_t) 129, - /*input_scale=*/0.17061053216457367, - /*kernel_zero_point=*/(uint8_t) 134, - /*kernel_scale=*/0.0016309921629726887, - /*kernel=*/w132.data(), /*bias=*/w133.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op39); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #39" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op39, xnn_delete_operator); - - xnn_operator_t op40 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, 
/*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/576, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/576, - /*output_channel_stride=*/576, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 66, - /*kernel_scale=*/0.07080810517072678, - /*kernel=*/w134.data(), /*bias=*/w135.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op40); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #40" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op40, xnn_delete_operator); - - xnn_operator_t op41 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/576, - /*group_output_channels=*/96, - /*input_channel_stride=*/576, - /*output_channel_stride=*/96, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 136, - /*kernel_scale=*/0.00838223285973072, - /*kernel=*/w136.data(), /*bias=*/w137.data(), - /*output_zero_point=*/(uint8_t) 127, - /*output_scale=*/0.12332822382450104, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op41); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #41" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op41, 
xnn_delete_operator); - - xnn_operator_t op42 = nullptr; - status = xnn_create_add_nd_qu8( - 127 /* input1 zero point */, 0.12332822382450104 /* input1 scale */, - 129 /* input2 zero point */, 0.17061053216457367 /* input2 scale */, - 127 /* output zero point */, 0.17615799605846405 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op42); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #42" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op42, xnn_delete_operator); - - xnn_operator_t op43 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/96, - /*group_output_channels=*/576, - /*input_channel_stride=*/96, - /*output_channel_stride=*/576, - /*input_zero_point=*/(uint8_t) 127, - /*input_scale=*/0.17615799605846405, - /*kernel_zero_point=*/(uint8_t) 138, - /*kernel_scale=*/0.0018258779309689999, - /*kernel=*/w138.data(), /*bias=*/w139.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op43); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #43" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op43, xnn_delete_operator); - - xnn_operator_t op44 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - 
/*groups=*/576, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/576, - /*output_channel_stride=*/576, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 159, - /*kernel_scale=*/0.07448793947696686, - /*kernel=*/w140.data(), /*bias=*/w141.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op44); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #44" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op44, xnn_delete_operator); - - xnn_operator_t op45 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/576, - /*group_output_channels=*/96, - /*input_channel_stride=*/576, - /*output_channel_stride=*/96, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 154, - /*kernel_scale=*/0.023982593789696693, - /*kernel=*/w142.data(), /*bias=*/w143.data(), - /*output_zero_point=*/(uint8_t) 127, - /*output_scale=*/0.18619607388973236, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op45); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #45" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op45, xnn_delete_operator); - - xnn_operator_t op46 = nullptr; - status = xnn_create_add_nd_qu8( - 127 /* input1 zero point */, 0.18619607388973236 /* input1 scale */, - 127 /* input2 
zero point */, 0.17615799605846405 /* input2 scale */, - 126 /* output zero point */, 0.23340091109275818 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op46); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #46" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op46, xnn_delete_operator); - - xnn_operator_t op47 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/96, - /*group_output_channels=*/576, - /*input_channel_stride=*/96, - /*output_channel_stride=*/576, - /*input_zero_point=*/(uint8_t) 126, - /*input_scale=*/0.23340091109275818, - /*kernel_zero_point=*/(uint8_t) 123, - /*kernel_scale=*/0.0013828007504343987, - /*kernel=*/w144.data(), /*bias=*/w145.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op47); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #47" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op47, xnn_delete_operator); - - xnn_operator_t op48 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/576, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/576, - /*output_channel_stride=*/576, - /*input_zero_point=*/(uint8_t) 0, - 
/*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 92, - /*kernel_scale=*/0.01525793131440878, - /*kernel=*/w146.data(), /*bias=*/w147.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op48); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #48" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op48, xnn_delete_operator); - - xnn_operator_t op49 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/576, - /*group_output_channels=*/160, - /*input_channel_stride=*/576, - /*output_channel_stride=*/160, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 140, - /*kernel_scale=*/0.009447949007153511, - /*kernel=*/w148.data(), /*bias=*/w149.data(), - /*output_zero_point=*/(uint8_t) 132, - /*output_scale=*/0.13237787783145905, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op49); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #49" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op49, xnn_delete_operator); - - xnn_operator_t op50 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, 
- /*groups=*/1, - /*group_input_channels=*/160, - /*group_output_channels=*/960, - /*input_channel_stride=*/160, - /*output_channel_stride=*/960, - /*input_zero_point=*/(uint8_t) 132, - /*input_scale=*/0.13237787783145905, - /*kernel_zero_point=*/(uint8_t) 135, - /*kernel_scale=*/0.0020222084131091833, - /*kernel=*/w150.data(), /*bias=*/w151.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op50); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #50" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op50, xnn_delete_operator); - - xnn_operator_t op51 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/960, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/960, - /*output_channel_stride=*/960, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 147, - /*kernel_scale=*/0.04166752099990845, - /*kernel=*/w152.data(), /*bias=*/w153.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op51); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #51" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op51, xnn_delete_operator); - - xnn_operator_t op52 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - 
/*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/960, - /*group_output_channels=*/160, - /*input_channel_stride=*/960, - /*output_channel_stride=*/160, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 139, - /*kernel_scale=*/0.00789870135486126, - /*kernel=*/w154.data(), /*bias=*/w155.data(), - /*output_zero_point=*/(uint8_t) 129, - /*output_scale=*/0.10045691579580307, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op52); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #52" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op52, xnn_delete_operator); - - xnn_operator_t op53 = nullptr; - status = xnn_create_add_nd_qu8( - 129 /* input1 zero point */, 0.10045691579580307 /* input1 scale */, - 132 /* input2 zero point */, 0.13237787783145905 /* input2 scale */, - 134 /* output zero point */, 0.15070965886116028 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op53); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #53" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op53, xnn_delete_operator); - - xnn_operator_t op54 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/160, - /*group_output_channels=*/960, - /*input_channel_stride=*/160, - /*output_channel_stride=*/960, - 
/*input_zero_point=*/(uint8_t) 134, - /*input_scale=*/0.15070965886116028, - /*kernel_zero_point=*/(uint8_t) 127, - /*kernel_scale=*/0.0015944414772093296, - /*kernel=*/w156.data(), /*bias=*/w157.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op54); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #54" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op54, xnn_delete_operator); - - xnn_operator_t op55 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/960, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/960, - /*output_channel_stride=*/960, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 102, - /*kernel_scale=*/0.04281935095787048, - /*kernel=*/w158.data(), /*bias=*/w159.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op55); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #55" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op55, xnn_delete_operator); - - xnn_operator_t op56 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - 
/*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/960, - /*group_output_channels=*/160, - /*input_channel_stride=*/960, - /*output_channel_stride=*/160, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 131, - /*kernel_scale=*/0.03697410225868225, - /*kernel=*/w160.data(), /*bias=*/w161.data(), - /*output_zero_point=*/(uint8_t) 133, - /*output_scale=*/0.1696060746908188, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op56); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #56" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op56, xnn_delete_operator); - - xnn_operator_t op57 = nullptr; - status = xnn_create_add_nd_qu8( - 133 /* input1 zero point */, 0.1696060746908188 /* input1 scale */, - 134 /* input2 zero point */, 0.15070965886116028 /* input2 scale */, - 131 /* output zero point */, 0.21005140244960785 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op57); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #57" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op57, xnn_delete_operator); - - xnn_operator_t op58 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/160, - /*group_output_channels=*/960, - /*input_channel_stride=*/160, - /*output_channel_stride=*/960, - /*input_zero_point=*/(uint8_t) 131, - /*input_scale=*/0.21005140244960785, - /*kernel_zero_point=*/(uint8_t) 135, - /*kernel_scale=*/0.002046825597062707, - /*kernel=*/w162.data(), 
/*bias=*/w163.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op58); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #58" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op58, xnn_delete_operator); - - xnn_operator_t op59 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/960, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/960, - /*output_channel_stride=*/960, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 201, - /*kernel_scale=*/0.16456253826618195, - /*kernel=*/w164.data(), /*bias=*/w165.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op59); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #59" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op59, xnn_delete_operator); - - xnn_operator_t op60 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/960, - /*group_output_channels=*/320, - /*input_channel_stride=*/960, - /*output_channel_stride=*/320, 
- /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 111, - /*kernel_scale=*/0.008009289391338825, - /*kernel=*/w166.data(), /*bias=*/w167.data(), - /*output_zero_point=*/(uint8_t) 130, - /*output_scale=*/0.11694499105215073, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op60); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #60" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op60, xnn_delete_operator); - - xnn_operator_t op61 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/320, - /*group_output_channels=*/1280, - /*input_channel_stride=*/320, - /*output_channel_stride=*/1280, - /*input_zero_point=*/(uint8_t) 130, - /*input_scale=*/0.11694499105215073, - /*kernel_zero_point=*/(uint8_t) 125, - /*kernel_scale=*/0.005167067516595125, - /*kernel=*/w168.data(), /*bias=*/w169.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023528477177023888, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op61); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #61" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op61, xnn_delete_operator); - - xnn_operator_t op62 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 0 /* input zero point */, 0.023528477177023888 /* input scale */, - 0 /* output zero point */, 0.023528477177023888 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - 
&op62); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #62" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op62, xnn_delete_operator); - - xnn_operator_t op63 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/1280, - /*group_output_channels=*/1001, - /*input_channel_stride=*/1280, - /*output_channel_stride=*/1001, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023528477177023888, - /*kernel_zero_point=*/(uint8_t) 113, - /*kernel_scale=*/0.0016910821432247758, - /*kernel=*/w170.data(), /*bias=*/w171.data(), - /*output_zero_point=*/(uint8_t) 58, - /*output_scale=*/0.09889253973960876, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op63); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #63" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op63, xnn_delete_operator); - - xnn_operator_t op64 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op64); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #64" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op64, xnn_delete_operator); - - size_t op0_workspace_size = 0; - size_t op0_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - &op0_workspace_size, &op0_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op0_workspace_size); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - size_t op1_workspace_size = 0; - size_t op1_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op1, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op1_workspace_size, &op1_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op1_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - size_t op2_workspace_size = 0; - size_t op2_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op2_workspace_size, &op2_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op2_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #2" << std::endl; - return ExecutionPlan(); - } - - size_t op3_workspace_size = 0; - size_t op3_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op3, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op3_workspace_size, &op3_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op3_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - size_t op4_workspace_size = 0; - size_t op4_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op4, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op4_workspace_size, &op4_workspace_alignment, - /*output_height_out=*/nullptr, 
/*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op4_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - size_t op5_workspace_size = 0; - size_t op5_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op5, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op5_workspace_size, &op5_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op5_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - size_t op6_workspace_size = 0; - size_t op6_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op6, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op6_workspace_size, &op6_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op6_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - size_t op7_workspace_size = 0; - size_t op7_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op7_workspace_size, &op7_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op7_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - size_t op8_workspace_size = 0; - size_t op8_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op8, - 
/*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op8_workspace_size, &op8_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op8_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 56, 56, 24 }; - const size_t b_shape[] = { 1, 56, 56, 24 }; - status = xnn_reshape_add_nd_qu8( - op9, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - size_t op10_workspace_size = 0; - size_t op10_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op10, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op10_workspace_size, &op10_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op10_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - size_t op11_workspace_size = 0; - size_t op11_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op11, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op11_workspace_size, &op11_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op11_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - size_t op12_workspace_size = 0; - size_t op12_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op12, - /*batch_size=*/1, /*input_height=*/28, 
/*input_width=*/28, - &op12_workspace_size, &op12_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op12_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - size_t op13_workspace_size = 0; - size_t op13_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op13, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op13_workspace_size, &op13_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op13_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - size_t op14_workspace_size = 0; - size_t op14_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op14, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op14_workspace_size, &op14_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op14_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - size_t op15_workspace_size = 0; - size_t op15_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op15, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op15_workspace_size, &op15_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op15_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - { - 
const size_t a_shape[] = { 1, 28, 28, 32 }; - const size_t b_shape[] = { 1, 28, 28, 32 }; - status = xnn_reshape_add_nd_qu8( - op16, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - size_t op17_workspace_size = 0; - size_t op17_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op17, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op17_workspace_size, &op17_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op17_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - size_t op18_workspace_size = 0; - size_t op18_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op18, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op18_workspace_size, &op18_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op18_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - size_t op19_workspace_size = 0; - size_t op19_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op19, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op19_workspace_size, &op19_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op19_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 32 }; - 
const size_t b_shape[] = { 1, 28, 28, 32 }; - status = xnn_reshape_add_nd_qu8( - op20, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - size_t op21_workspace_size = 0; - size_t op21_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op21, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op21_workspace_size, &op21_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op21_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - size_t op22_workspace_size = 0; - size_t op22_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op22, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op22_workspace_size, &op22_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op22_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - size_t op23_workspace_size = 0; - size_t op23_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op23, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op23_workspace_size, &op23_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op23_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - size_t op24_workspace_size = 0; - size_t op24_workspace_alignment = 0; - status = 
xnn_reshape_convolution2d_nhwc_qu8( - op24, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op24_workspace_size, &op24_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op24_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - size_t op25_workspace_size = 0; - size_t op25_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op25, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op25_workspace_size, &op25_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op25_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - size_t op26_workspace_size = 0; - size_t op26_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op26, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op26_workspace_size, &op26_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op26_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 64 }; - const size_t b_shape[] = { 1, 14, 14, 64 }; - status = xnn_reshape_add_nd_qu8( - op27, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - size_t op28_workspace_size = 0; - size_t op28_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op28, - 
/*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op28_workspace_size, &op28_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op28_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - size_t op29_workspace_size = 0; - size_t op29_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op29, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op29_workspace_size, &op29_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op29_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #29" << std::endl; - return ExecutionPlan(); - } - - size_t op30_workspace_size = 0; - size_t op30_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op30, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op30_workspace_size, &op30_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op30_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #30" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 64 }; - const size_t b_shape[] = { 1, 14, 14, 64 }; - status = xnn_reshape_add_nd_qu8( - op31, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #31" << std::endl; - return ExecutionPlan(); - } - - size_t op32_workspace_size = 0; - size_t op32_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op32, - /*batch_size=*/1, /*input_height=*/14, 
/*input_width=*/14, - &op32_workspace_size, &op32_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op32_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #32" << std::endl; - return ExecutionPlan(); - } - - size_t op33_workspace_size = 0; - size_t op33_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op33, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op33_workspace_size, &op33_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op33_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #33" << std::endl; - return ExecutionPlan(); - } - - size_t op34_workspace_size = 0; - size_t op34_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op34, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op34_workspace_size, &op34_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op34_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #34" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 64 }; - const size_t b_shape[] = { 1, 14, 14, 64 }; - status = xnn_reshape_add_nd_qu8( - op35, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #35" << std::endl; - return ExecutionPlan(); - } - - size_t op36_workspace_size = 0; - size_t op36_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op36, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op36_workspace_size, 
&op36_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op36_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #36" << std::endl; - return ExecutionPlan(); - } - - size_t op37_workspace_size = 0; - size_t op37_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op37, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op37_workspace_size, &op37_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op37_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #37" << std::endl; - return ExecutionPlan(); - } - - size_t op38_workspace_size = 0; - size_t op38_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op38, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op38_workspace_size, &op38_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op38_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #38" << std::endl; - return ExecutionPlan(); - } - - size_t op39_workspace_size = 0; - size_t op39_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op39, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op39_workspace_size, &op39_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op39_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #39" << std::endl; - return ExecutionPlan(); - } - - size_t op40_workspace_size = 0; - size_t 
op40_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op40, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op40_workspace_size, &op40_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op40_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #40" << std::endl; - return ExecutionPlan(); - } - - size_t op41_workspace_size = 0; - size_t op41_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op41, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op41_workspace_size, &op41_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op41_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #41" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 96 }; - const size_t b_shape[] = { 1, 14, 14, 96 }; - status = xnn_reshape_add_nd_qu8( - op42, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #42" << std::endl; - return ExecutionPlan(); - } - - size_t op43_workspace_size = 0; - size_t op43_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op43, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op43_workspace_size, &op43_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op43_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #43" << std::endl; - return ExecutionPlan(); - } - - size_t op44_workspace_size = 0; - size_t op44_workspace_alignment = 0; - status = 
xnn_reshape_convolution2d_nhwc_qu8( - op44, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op44_workspace_size, &op44_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op44_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #44" << std::endl; - return ExecutionPlan(); - } - - size_t op45_workspace_size = 0; - size_t op45_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op45, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op45_workspace_size, &op45_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op45_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #45" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 96 }; - const size_t b_shape[] = { 1, 14, 14, 96 }; - status = xnn_reshape_add_nd_qu8( - op46, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #46" << std::endl; - return ExecutionPlan(); - } - - size_t op47_workspace_size = 0; - size_t op47_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op47, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op47_workspace_size, &op47_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op47_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #47" << std::endl; - return ExecutionPlan(); - } - - size_t op48_workspace_size = 0; - size_t op48_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op48, - 
/*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op48_workspace_size, &op48_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op48_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #48" << std::endl; - return ExecutionPlan(); - } - - size_t op49_workspace_size = 0; - size_t op49_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op49, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op49_workspace_size, &op49_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op49_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #49" << std::endl; - return ExecutionPlan(); - } - - size_t op50_workspace_size = 0; - size_t op50_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op50, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op50_workspace_size, &op50_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op50_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #50" << std::endl; - return ExecutionPlan(); - } - - size_t op51_workspace_size = 0; - size_t op51_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op51, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op51_workspace_size, &op51_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op51_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #51" << std::endl; - 
return ExecutionPlan(); - } - - size_t op52_workspace_size = 0; - size_t op52_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op52, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op52_workspace_size, &op52_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op52_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #52" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 160 }; - const size_t b_shape[] = { 1, 7, 7, 160 }; - status = xnn_reshape_add_nd_qu8( - op53, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #53" << std::endl; - return ExecutionPlan(); - } - - size_t op54_workspace_size = 0; - size_t op54_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op54, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op54_workspace_size, &op54_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op54_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #54" << std::endl; - return ExecutionPlan(); - } - - size_t op55_workspace_size = 0; - size_t op55_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op55, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op55_workspace_size, &op55_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op55_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #55" << std::endl; - return ExecutionPlan(); - } - - size_t 
op56_workspace_size = 0; - size_t op56_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op56, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op56_workspace_size, &op56_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op56_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #56" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 160 }; - const size_t b_shape[] = { 1, 7, 7, 160 }; - status = xnn_reshape_add_nd_qu8( - op57, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #57" << std::endl; - return ExecutionPlan(); - } - - size_t op58_workspace_size = 0; - size_t op58_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op58, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op58_workspace_size, &op58_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op58_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #58" << std::endl; - return ExecutionPlan(); - } - - size_t op59_workspace_size = 0; - size_t op59_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op59, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op59_workspace_size, &op59_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op59_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #59" << std::endl; - return ExecutionPlan(); - } - - size_t op60_workspace_size = 0; - size_t 
op60_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op60, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op60_workspace_size, &op60_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op60_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #60" << std::endl; - return ExecutionPlan(); - } - - size_t op61_workspace_size = 0; - size_t op61_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op61, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op61_workspace_size, &op61_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op61_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #61" << std::endl; - return ExecutionPlan(); - } - - size_t op62_workspace_size = 0; - size_t op62_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op62, - /*batch_size=*/1, 49 /* width */, - 1280 /* channels */, 1280 /* input stride */, 1280 /* output stride */, - &op62_workspace_size, &op62_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op62_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #62" << std::endl; - return ExecutionPlan(); - } - - size_t op63_workspace_size = 0; - size_t op63_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op63, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op63_workspace_size, &op63_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op63_workspace_size); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #63" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op64, - /*batch_size=*/1001, - 1 /* channels */, - 1 /* input stride */, - 1 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #64" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nhwc_qu8( - op0, - workspace.data(), /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op1, - workspace.data(), /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op2, - workspace.data(), /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op3, - workspace.data(), /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op4, - workspace.data(), /*input=*/v4.data(), /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op5, - workspace.data(), /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op6, - 
workspace.data(), /*input=*/v6.data(), /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op7, - workspace.data(), /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op8, - workspace.data(), /*input=*/v8.data(), /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op9, - v9.data() /* a */, v6.data() /* b */, /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op10, - workspace.data(), /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op11, - workspace.data(), /*input=*/v11.data(), /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op12, - workspace.data(), /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op13, - workspace.data(), /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op14, - workspace.data(), 
/*input=*/v14.data(), /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op15, - workspace.data(), /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op16, - v16.data() /* a */, v13.data() /* b */, /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op17, - workspace.data(), /*input=*/v17.data(), /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op18, - workspace.data(), /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op19, - workspace.data(), /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op20, - v20.data() /* a */, v17.data() /* b */, /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op21, - workspace.data(), /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op22, - workspace.data(), /*input=*/v22.data(), 
/*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op23, - workspace.data(), /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op24, - workspace.data(), /*input=*/v24.data(), /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op25, - workspace.data(), /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op26, - workspace.data(), /*input=*/v26.data(), /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op27, - v27.data() /* a */, v24.data() /* b */, /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op28, - workspace.data(), /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op29, - workspace.data(), /*input=*/v29.data(), /*output=*/v30.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op30, - workspace.data(), /*input=*/v30.data(), 
/*output=*/v31.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #30" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op31, - v31.data() /* a */, v28.data() /* b */, /*output=*/v32.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op32, - workspace.data(), /*input=*/v32.data(), /*output=*/v33.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #32" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op33, - workspace.data(), /*input=*/v33.data(), /*output=*/v34.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #33" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op34, - workspace.data(), /*input=*/v34.data(), /*output=*/v35.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op35, - v35.data() /* a */, v32.data() /* b */, /*output=*/v36.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op36, - workspace.data(), /*input=*/v36.data(), /*output=*/v37.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op37, - workspace.data(), /*input=*/v37.data(), /*output=*/v38.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op38, - workspace.data(), /*input=*/v38.data(), /*output=*/v39.data()); 
- if (status != xnn_status_success) { - std::cerr << "failed to setup operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op39, - workspace.data(), /*input=*/v39.data(), /*output=*/v40.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op40, - workspace.data(), /*input=*/v40.data(), /*output=*/v41.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op41, - workspace.data(), /*input=*/v41.data(), /*output=*/v42.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #41" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op42, - v42.data() /* a */, v39.data() /* b */, /*output=*/v43.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op43, - workspace.data(), /*input=*/v43.data(), /*output=*/v44.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op44, - workspace.data(), /*input=*/v44.data(), /*output=*/v45.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op45, - workspace.data(), /*input=*/v45.data(), /*output=*/v46.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #45" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op46, - v46.data() /* a */, v43.data() /* b */, /*output=*/v47.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op47, - workspace.data(), /*input=*/v47.data(), /*output=*/v48.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op48, - workspace.data(), /*input=*/v48.data(), /*output=*/v49.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op49, - workspace.data(), /*input=*/v49.data(), /*output=*/v50.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #49" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op50, - workspace.data(), /*input=*/v50.data(), /*output=*/v51.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #50" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op51, - workspace.data(), /*input=*/v51.data(), /*output=*/v52.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op52, - workspace.data(), /*input=*/v52.data(), /*output=*/v53.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op53, - v53.data() /* a */, v50.data() /* b */, /*output=*/v54.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op54, - workspace.data(), /*input=*/v54.data(), /*output=*/v55.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op55, - workspace.data(), /*input=*/v55.data(), /*output=*/v56.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op56, - workspace.data(), /*input=*/v56.data(), /*output=*/v57.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #56" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op57, - v57.data() /* a */, v54.data() /* b */, /*output=*/v58.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op58, - workspace.data(), /*input=*/v58.data(), /*output=*/v59.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op59, - workspace.data(), /*input=*/v59.data(), /*output=*/v60.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #59" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op60, - workspace.data(), /*input=*/v60.data(), /*output=*/v61.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op61, - workspace.data(), /*input=*/v61.data(), /*output=*/v62.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #61" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op62, - workspace.data(), - /*input=*/v62.data(), /*output=*/v63.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #62" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op63, - workspace.data(), /*input=*/v63.data(), /*output=*/v64.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #63" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op64, - /*input=*/v64.data(), /*output=*/v65.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #64" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -} // namespace models diff --git a/models/qu8-mobilenet-v3-large.cc b/models/qu8-mobilenet-v3-large.cc deleted file mode 100644 index 466a6e9e0dd..00000000000 --- a/models/qu8-mobilenet-v3-large.cc +++ /dev/null @@ -1,6166 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
- -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan QU8MobileNetV3Large(pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static std::array v30; - alignas(16) static std::array v31; - alignas(16) static std::array v32; - alignas(16) static std::array v33; - alignas(16) static std::array v34; - alignas(16) static std::array v35; - alignas(16) static std::array v36; - alignas(16) static std::array v37; - alignas(16) static std::array v38; - alignas(16) static std::array v39; - alignas(16) static std::array v40; - alignas(16) static std::array v41; - alignas(16) static std::array v42; - alignas(16) static std::array v43; - alignas(16) static std::array v44; - alignas(16) static std::array v45; - alignas(16) static std::array v46; - 
alignas(16) static std::array v47; - alignas(16) static std::array v48; - alignas(16) static std::array v49; - alignas(16) static std::array v50; - alignas(16) static std::array v51; - alignas(16) static std::array v52; - alignas(16) static std::array v53; - alignas(16) static std::array v54; - alignas(16) static std::array v55; - alignas(16) static std::array v56; - alignas(16) static std::array v57; - alignas(16) static std::array v58; - alignas(16) static std::array v59; - alignas(16) static std::array v60; - alignas(16) static std::array v61; - alignas(16) static std::array v62; - alignas(16) static std::array v63; - alignas(16) static std::array v64; - alignas(16) static std::array v65; - alignas(16) static std::array v66; - alignas(16) static std::array v67; - alignas(16) static std::array v68; - alignas(16) static std::array v69; - alignas(16) static std::array v70; - alignas(16) static std::array v71; - alignas(16) static std::array v72; - alignas(16) static std::array v73; - alignas(16) static std::array v74; - alignas(16) static std::array v75; - alignas(16) static std::array v76; - alignas(16) static std::array v77; - alignas(16) static std::array v78; - alignas(16) static std::array v79; - alignas(16) static std::array v80; - alignas(16) static std::array v81; - alignas(16) static std::array v82; - alignas(16) static std::array v83; - alignas(16) static std::array v84; - alignas(16) static std::array v85; - alignas(16) static std::array v86; - alignas(16) static std::array v87; - alignas(16) static std::array v88; - alignas(16) static std::array v89; - alignas(16) static std::array v90; - alignas(16) static std::array v91; - alignas(16) static std::array v92; - alignas(16) static std::array v93; - alignas(16) static std::array v94; - alignas(16) static std::array v95; - alignas(16) static std::array v96; - alignas(16) static std::array v97; - alignas(16) static std::array v98; - alignas(16) static std::array v99; - alignas(16) static std::array v100; - 
alignas(16) static std::array v101; - alignas(16) static std::array v102; - alignas(16) static std::array v103; - alignas(16) static std::array v104; - alignas(16) static std::array v105; - alignas(16) static std::array v106; - alignas(16) static std::array v107; - alignas(16) static std::array v108; - alignas(16) static std::array v109; - alignas(16) static std::array v110; - alignas(16) static std::array v111; - alignas(16) static std::array v112; - alignas(16) static std::array v113; - alignas(16) static std::array v114; - alignas(16) static std::array v115; - alignas(16) static std::array v116; - alignas(16) static std::array v117; - alignas(16) static std::array v118; - alignas(16) static std::array v119; - alignas(16) static std::array v120; - alignas(16) static std::array v121; - alignas(16) static std::array v122; - alignas(16) static std::array v123; - alignas(16) static std::array v124; - alignas(16) static std::array v125; - alignas(16) static std::array v126; - alignas(16) static std::array v127; - alignas(16) static std::array v128; - alignas(16) static std::array v129; - alignas(16) static std::array v130; - alignas(16) static std::array v131; - alignas(16) static std::array w132; - alignas(16) static std::array w133; - alignas(16) static std::array w134; - alignas(16) static std::array w135; - alignas(16) static std::array w136; - alignas(16) static std::array w137; - alignas(16) static std::array w138; - alignas(16) static std::array w139; - alignas(16) static std::array w140; - alignas(16) static std::array w141; - alignas(16) static std::array w142; - alignas(16) static std::array w143; - alignas(16) static std::array w144; - alignas(16) static std::array w145; - alignas(16) static std::array w146; - alignas(16) static std::array w147; - alignas(16) static std::array w148; - alignas(16) static std::array w149; - alignas(16) static std::array w150; - alignas(16) static std::array w151; - alignas(16) static std::array w152; - alignas(16) static 
std::array w153; - alignas(16) static std::array w154; - alignas(16) static std::array w155; - alignas(16) static std::array w156; - alignas(16) static std::array w157; - alignas(16) static std::array w158; - alignas(16) static std::array w159; - alignas(16) static std::array w160; - alignas(16) static std::array w161; - alignas(16) static std::array w162; - alignas(16) static std::array w163; - alignas(16) static std::array w164; - alignas(16) static std::array w165; - alignas(16) static std::array w166; - alignas(16) static std::array w167; - alignas(16) static std::array w168; - alignas(16) static std::array w169; - alignas(16) static std::array w170; - alignas(16) static std::array w171; - alignas(16) static std::array w172; - alignas(16) static std::array w173; - alignas(16) static std::array w174; - alignas(16) static std::array w175; - alignas(16) static std::array w176; - alignas(16) static std::array w177; - alignas(16) static std::array w178; - alignas(16) static std::array w179; - alignas(16) static std::array w180; - alignas(16) static std::array w181; - alignas(16) static std::array w182; - alignas(16) static std::array w183; - alignas(16) static std::array w184; - alignas(16) static std::array w185; - alignas(16) static std::array w186; - alignas(16) static std::array w187; - alignas(16) static std::array w188; - alignas(16) static std::array w189; - alignas(16) static std::array w190; - alignas(16) static std::array w191; - alignas(16) static std::array w192; - alignas(16) static std::array w193; - alignas(16) static std::array w194; - alignas(16) static std::array w195; - alignas(16) static std::array w196; - alignas(16) static std::array w197; - alignas(16) static std::array w198; - alignas(16) static std::array w199; - alignas(16) static std::array w200; - alignas(16) static std::array w201; - alignas(16) static std::array w202; - alignas(16) static std::array w203; - alignas(16) static std::array w204; - alignas(16) static std::array w205; - 
alignas(16) static std::array w206; - alignas(16) static std::array w207; - alignas(16) static std::array w208; - alignas(16) static std::array w209; - alignas(16) static std::array w210; - alignas(16) static std::array w211; - alignas(16) static std::array w212; - alignas(16) static std::array w213; - alignas(16) static std::array w214; - alignas(16) static std::array w215; - alignas(16) static std::array w216; - alignas(16) static std::array w217; - alignas(16) static std::array w218; - alignas(16) static std::array w219; - alignas(16) static std::array w220; - alignas(16) static std::array w221; - alignas(16) static std::array w222; - alignas(16) static std::array w223; - alignas(16) static std::array w224; - alignas(16) static std::array w225; - alignas(16) static std::array w226; - alignas(16) static std::array w227; - alignas(16) static std::array w228; - alignas(16) static std::array w229; - alignas(16) static std::array w230; - alignas(16) static std::array w231; - alignas(16) static std::array w232; - alignas(16) static std::array w233; - alignas(16) static std::array w234; - alignas(16) static std::array w235; - alignas(16) static std::array w236; - alignas(16) static std::array w237; - alignas(16) static std::array w238; - alignas(16) static std::array w239; - alignas(16) static std::array w240; - alignas(16) static std::array w241; - alignas(16) static std::array w242; - alignas(16) static std::array w243; - alignas(16) static std::array w244; - alignas(16) static std::array w245; - alignas(16) static std::array w246; - alignas(16) static std::array w247; - alignas(16) static std::array w248; - alignas(16) static std::array w249; - alignas(16) static std::array w250; - alignas(16) static std::array w251; - alignas(16) static std::array w252; - alignas(16) static std::array w253; - alignas(16) static std::array w254; - alignas(16) static std::array w255; - alignas(16) static std::array w256; - alignas(16) static std::array w257; - alignas(16) static 
std::array w258; - alignas(16) static std::array w259; - alignas(16) static std::array w260; - alignas(16) static std::array w261; - alignas(16) static std::array w262; - alignas(16) static std::array w263; - alignas(16) static std::array w264; - alignas(16) static std::array w265; - alignas(16) static std::array w266; - alignas(16) static std::array w267; - alignas(16) static std::array w268; - alignas(16) static std::array w269; - alignas(16) static std::array w270; - alignas(16) static std::array w271; - alignas(16) static std::array w272; - alignas(16) static std::array w273; - alignas(16) static std::array w274; - alignas(16) static std::array w275; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto qu8rng = std::bind(std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()), std::ref(rng)); - auto qs32rng = std::bind(std::uniform_int_distribution(-10000, 10000), std::ref(rng)); - std::generate(v0.begin(), v0.end(), std::ref(qu8rng)); - std::generate(v1.begin(), v1.end(), std::ref(qu8rng)); - std::generate(v2.begin(), v2.end(), std::ref(qu8rng)); - std::generate(v3.begin(), v3.end(), std::ref(qu8rng)); - std::generate(v4.begin(), v4.end(), std::ref(qu8rng)); - std::generate(v5.begin(), v5.end(), std::ref(qu8rng)); - std::generate(v6.begin(), v6.end(), std::ref(qu8rng)); - std::generate(v7.begin(), v7.end(), std::ref(qu8rng)); - std::generate(v8.begin(), v8.end(), std::ref(qu8rng)); - std::generate(v9.begin(), v9.end(), std::ref(qu8rng)); - std::generate(v10.begin(), v10.end(), std::ref(qu8rng)); - std::generate(v11.begin(), v11.end(), std::ref(qu8rng)); - std::generate(v12.begin(), v12.end(), std::ref(qu8rng)); - std::generate(v13.begin(), v13.end(), std::ref(qu8rng)); - std::generate(v14.begin(), v14.end(), std::ref(qu8rng)); - std::generate(v15.begin(), v15.end(), std::ref(qu8rng)); - std::generate(v16.begin(), v16.end(), std::ref(qu8rng)); - std::generate(v17.begin(), v17.end(), 
std::ref(qu8rng)); - std::generate(v18.begin(), v18.end(), std::ref(qu8rng)); - std::generate(v19.begin(), v19.end(), std::ref(qu8rng)); - std::generate(v20.begin(), v20.end(), std::ref(qu8rng)); - std::generate(v21.begin(), v21.end(), std::ref(qu8rng)); - std::generate(v22.begin(), v22.end(), std::ref(qu8rng)); - std::generate(v23.begin(), v23.end(), std::ref(qu8rng)); - std::generate(v24.begin(), v24.end(), std::ref(qu8rng)); - std::generate(v25.begin(), v25.end(), std::ref(qu8rng)); - std::generate(v26.begin(), v26.end(), std::ref(qu8rng)); - std::generate(v27.begin(), v27.end(), std::ref(qu8rng)); - std::generate(v28.begin(), v28.end(), std::ref(qu8rng)); - std::generate(v29.begin(), v29.end(), std::ref(qu8rng)); - std::generate(v30.begin(), v30.end(), std::ref(qu8rng)); - std::generate(v31.begin(), v31.end(), std::ref(qu8rng)); - std::generate(v32.begin(), v32.end(), std::ref(qu8rng)); - std::generate(v33.begin(), v33.end(), std::ref(qu8rng)); - std::generate(v34.begin(), v34.end(), std::ref(qu8rng)); - std::generate(v35.begin(), v35.end(), std::ref(qu8rng)); - std::generate(v36.begin(), v36.end(), std::ref(qu8rng)); - std::generate(v37.begin(), v37.end(), std::ref(qu8rng)); - std::generate(v38.begin(), v38.end(), std::ref(qu8rng)); - std::generate(v39.begin(), v39.end(), std::ref(qu8rng)); - std::generate(v40.begin(), v40.end(), std::ref(qu8rng)); - std::generate(v41.begin(), v41.end(), std::ref(qu8rng)); - std::generate(v42.begin(), v42.end(), std::ref(qu8rng)); - std::generate(v43.begin(), v43.end(), std::ref(qu8rng)); - std::generate(v44.begin(), v44.end(), std::ref(qu8rng)); - std::generate(v45.begin(), v45.end(), std::ref(qu8rng)); - std::generate(v46.begin(), v46.end(), std::ref(qu8rng)); - std::generate(v47.begin(), v47.end(), std::ref(qu8rng)); - std::generate(v48.begin(), v48.end(), std::ref(qu8rng)); - std::generate(v49.begin(), v49.end(), std::ref(qu8rng)); - std::generate(v50.begin(), v50.end(), std::ref(qu8rng)); - std::generate(v51.begin(), 
v51.end(), std::ref(qu8rng)); - std::generate(v52.begin(), v52.end(), std::ref(qu8rng)); - std::generate(v53.begin(), v53.end(), std::ref(qu8rng)); - std::generate(v54.begin(), v54.end(), std::ref(qu8rng)); - std::generate(v55.begin(), v55.end(), std::ref(qu8rng)); - std::generate(v56.begin(), v56.end(), std::ref(qu8rng)); - std::generate(v57.begin(), v57.end(), std::ref(qu8rng)); - std::generate(v58.begin(), v58.end(), std::ref(qu8rng)); - std::generate(v59.begin(), v59.end(), std::ref(qu8rng)); - std::generate(v60.begin(), v60.end(), std::ref(qu8rng)); - std::generate(v61.begin(), v61.end(), std::ref(qu8rng)); - std::generate(v62.begin(), v62.end(), std::ref(qu8rng)); - std::generate(v63.begin(), v63.end(), std::ref(qu8rng)); - std::generate(v64.begin(), v64.end(), std::ref(qu8rng)); - std::generate(v65.begin(), v65.end(), std::ref(qu8rng)); - std::generate(v66.begin(), v66.end(), std::ref(qu8rng)); - std::generate(v67.begin(), v67.end(), std::ref(qu8rng)); - std::generate(v68.begin(), v68.end(), std::ref(qu8rng)); - std::generate(v69.begin(), v69.end(), std::ref(qu8rng)); - std::generate(v70.begin(), v70.end(), std::ref(qu8rng)); - std::generate(v71.begin(), v71.end(), std::ref(qu8rng)); - std::generate(v72.begin(), v72.end(), std::ref(qu8rng)); - std::generate(v73.begin(), v73.end(), std::ref(qu8rng)); - std::generate(v74.begin(), v74.end(), std::ref(qu8rng)); - std::generate(v75.begin(), v75.end(), std::ref(qu8rng)); - std::generate(v76.begin(), v76.end(), std::ref(qu8rng)); - std::generate(v77.begin(), v77.end(), std::ref(qu8rng)); - std::generate(v78.begin(), v78.end(), std::ref(qu8rng)); - std::generate(v79.begin(), v79.end(), std::ref(qu8rng)); - std::generate(v80.begin(), v80.end(), std::ref(qu8rng)); - std::generate(v81.begin(), v81.end(), std::ref(qu8rng)); - std::generate(v82.begin(), v82.end(), std::ref(qu8rng)); - std::generate(v83.begin(), v83.end(), std::ref(qu8rng)); - std::generate(v84.begin(), v84.end(), std::ref(qu8rng)); - 
std::generate(v85.begin(), v85.end(), std::ref(qu8rng)); - std::generate(v86.begin(), v86.end(), std::ref(qu8rng)); - std::generate(v87.begin(), v87.end(), std::ref(qu8rng)); - std::generate(v88.begin(), v88.end(), std::ref(qu8rng)); - std::generate(v89.begin(), v89.end(), std::ref(qu8rng)); - std::generate(v90.begin(), v90.end(), std::ref(qu8rng)); - std::generate(v91.begin(), v91.end(), std::ref(qu8rng)); - std::generate(v92.begin(), v92.end(), std::ref(qu8rng)); - std::generate(v93.begin(), v93.end(), std::ref(qu8rng)); - std::generate(v94.begin(), v94.end(), std::ref(qu8rng)); - std::generate(v95.begin(), v95.end(), std::ref(qu8rng)); - std::generate(v96.begin(), v96.end(), std::ref(qu8rng)); - std::generate(v97.begin(), v97.end(), std::ref(qu8rng)); - std::generate(v98.begin(), v98.end(), std::ref(qu8rng)); - std::generate(v99.begin(), v99.end(), std::ref(qu8rng)); - std::generate(v100.begin(), v100.end(), std::ref(qu8rng)); - std::generate(v101.begin(), v101.end(), std::ref(qu8rng)); - std::generate(v102.begin(), v102.end(), std::ref(qu8rng)); - std::generate(v103.begin(), v103.end(), std::ref(qu8rng)); - std::generate(v104.begin(), v104.end(), std::ref(qu8rng)); - std::generate(v105.begin(), v105.end(), std::ref(qu8rng)); - std::generate(v106.begin(), v106.end(), std::ref(qu8rng)); - std::generate(v107.begin(), v107.end(), std::ref(qu8rng)); - std::generate(v108.begin(), v108.end(), std::ref(qu8rng)); - std::generate(v109.begin(), v109.end(), std::ref(qu8rng)); - std::generate(v110.begin(), v110.end(), std::ref(qu8rng)); - std::generate(v111.begin(), v111.end(), std::ref(qu8rng)); - std::generate(v112.begin(), v112.end(), std::ref(qu8rng)); - std::generate(v113.begin(), v113.end(), std::ref(qu8rng)); - std::generate(v114.begin(), v114.end(), std::ref(qu8rng)); - std::generate(v115.begin(), v115.end(), std::ref(qu8rng)); - std::generate(v116.begin(), v116.end(), std::ref(qu8rng)); - std::generate(v117.begin(), v117.end(), std::ref(qu8rng)); - 
std::generate(v118.begin(), v118.end(), std::ref(qu8rng)); - std::generate(v119.begin(), v119.end(), std::ref(qu8rng)); - std::generate(v120.begin(), v120.end(), std::ref(qu8rng)); - std::generate(v121.begin(), v121.end(), std::ref(qu8rng)); - std::generate(v122.begin(), v122.end(), std::ref(qu8rng)); - std::generate(v123.begin(), v123.end(), std::ref(qu8rng)); - std::generate(v124.begin(), v124.end(), std::ref(qu8rng)); - std::generate(v125.begin(), v125.end(), std::ref(qu8rng)); - std::generate(v126.begin(), v126.end(), std::ref(qu8rng)); - std::generate(v127.begin(), v127.end(), std::ref(qu8rng)); - std::generate(v128.begin(), v128.end(), std::ref(qu8rng)); - std::generate(v129.begin(), v129.end(), std::ref(qu8rng)); - std::generate(v130.begin(), v130.end(), std::ref(qu8rng)); - std::generate(v131.begin(), v131.end(), std::ref(qu8rng)); - std::generate(w132.begin(), w132.end(), std::ref(qu8rng)); - std::generate(w133.begin(), w133.end(), std::ref(qs32rng)); - std::generate(w134.begin(), w134.end(), std::ref(qu8rng)); - std::generate(w135.begin(), w135.end(), std::ref(qs32rng)); - std::generate(w136.begin(), w136.end(), std::ref(qu8rng)); - std::generate(w137.begin(), w137.end(), std::ref(qs32rng)); - std::generate(w138.begin(), w138.end(), std::ref(qu8rng)); - std::generate(w139.begin(), w139.end(), std::ref(qs32rng)); - std::generate(w140.begin(), w140.end(), std::ref(qu8rng)); - std::generate(w141.begin(), w141.end(), std::ref(qs32rng)); - std::generate(w142.begin(), w142.end(), std::ref(qu8rng)); - std::generate(w143.begin(), w143.end(), std::ref(qs32rng)); - std::generate(w144.begin(), w144.end(), std::ref(qu8rng)); - std::generate(w145.begin(), w145.end(), std::ref(qs32rng)); - std::generate(w146.begin(), w146.end(), std::ref(qu8rng)); - std::generate(w147.begin(), w147.end(), std::ref(qs32rng)); - std::generate(w148.begin(), w148.end(), std::ref(qu8rng)); - std::generate(w149.begin(), w149.end(), std::ref(qs32rng)); - std::generate(w150.begin(), 
w150.end(), std::ref(qu8rng)); - std::generate(w151.begin(), w151.end(), std::ref(qs32rng)); - std::generate(w152.begin(), w152.end(), std::ref(qu8rng)); - std::generate(w153.begin(), w153.end(), std::ref(qs32rng)); - std::generate(w154.begin(), w154.end(), std::ref(qu8rng)); - std::generate(w155.begin(), w155.end(), std::ref(qs32rng)); - std::generate(w156.begin(), w156.end(), std::ref(qu8rng)); - std::generate(w157.begin(), w157.end(), std::ref(qs32rng)); - std::generate(w158.begin(), w158.end(), std::ref(qu8rng)); - std::generate(w159.begin(), w159.end(), std::ref(qu8rng)); - std::generate(w160.begin(), w160.end(), std::ref(qu8rng)); - std::generate(w161.begin(), w161.end(), std::ref(qs32rng)); - std::generate(w162.begin(), w162.end(), std::ref(qu8rng)); - std::generate(w163.begin(), w163.end(), std::ref(qs32rng)); - std::generate(w164.begin(), w164.end(), std::ref(qu8rng)); - std::generate(w165.begin(), w165.end(), std::ref(qs32rng)); - std::generate(w166.begin(), w166.end(), std::ref(qu8rng)); - std::generate(w167.begin(), w167.end(), std::ref(qs32rng)); - std::generate(w168.begin(), w168.end(), std::ref(qu8rng)); - std::generate(w169.begin(), w169.end(), std::ref(qs32rng)); - std::generate(w170.begin(), w170.end(), std::ref(qu8rng)); - std::generate(w171.begin(), w171.end(), std::ref(qu8rng)); - std::generate(w172.begin(), w172.end(), std::ref(qu8rng)); - std::generate(w173.begin(), w173.end(), std::ref(qs32rng)); - std::generate(w174.begin(), w174.end(), std::ref(qu8rng)); - std::generate(w175.begin(), w175.end(), std::ref(qs32rng)); - std::generate(w176.begin(), w176.end(), std::ref(qu8rng)); - std::generate(w177.begin(), w177.end(), std::ref(qs32rng)); - std::generate(w178.begin(), w178.end(), std::ref(qu8rng)); - std::generate(w179.begin(), w179.end(), std::ref(qs32rng)); - std::generate(w180.begin(), w180.end(), std::ref(qu8rng)); - std::generate(w181.begin(), w181.end(), std::ref(qs32rng)); - std::generate(w182.begin(), w182.end(), std::ref(qu8rng)); - 
std::generate(w183.begin(), w183.end(), std::ref(qu8rng)); - std::generate(w184.begin(), w184.end(), std::ref(qu8rng)); - std::generate(w185.begin(), w185.end(), std::ref(qs32rng)); - std::generate(w186.begin(), w186.end(), std::ref(qu8rng)); - std::generate(w187.begin(), w187.end(), std::ref(qs32rng)); - std::generate(w188.begin(), w188.end(), std::ref(qu8rng)); - std::generate(w189.begin(), w189.end(), std::ref(qs32rng)); - std::generate(w190.begin(), w190.end(), std::ref(qu8rng)); - std::generate(w191.begin(), w191.end(), std::ref(qs32rng)); - std::generate(w192.begin(), w192.end(), std::ref(qu8rng)); - std::generate(w193.begin(), w193.end(), std::ref(qs32rng)); - std::generate(w194.begin(), w194.end(), std::ref(qu8rng)); - std::generate(w195.begin(), w195.end(), std::ref(qs32rng)); - std::generate(w196.begin(), w196.end(), std::ref(qu8rng)); - std::generate(w197.begin(), w197.end(), std::ref(qs32rng)); - std::generate(w198.begin(), w198.end(), std::ref(qu8rng)); - std::generate(w199.begin(), w199.end(), std::ref(qs32rng)); - std::generate(w200.begin(), w200.end(), std::ref(qu8rng)); - std::generate(w201.begin(), w201.end(), std::ref(qs32rng)); - std::generate(w202.begin(), w202.end(), std::ref(qu8rng)); - std::generate(w203.begin(), w203.end(), std::ref(qs32rng)); - std::generate(w204.begin(), w204.end(), std::ref(qu8rng)); - std::generate(w205.begin(), w205.end(), std::ref(qs32rng)); - std::generate(w206.begin(), w206.end(), std::ref(qu8rng)); - std::generate(w207.begin(), w207.end(), std::ref(qs32rng)); - std::generate(w208.begin(), w208.end(), std::ref(qu8rng)); - std::generate(w209.begin(), w209.end(), std::ref(qs32rng)); - std::generate(w210.begin(), w210.end(), std::ref(qu8rng)); - std::generate(w211.begin(), w211.end(), std::ref(qs32rng)); - std::generate(w212.begin(), w212.end(), std::ref(qu8rng)); - std::generate(w213.begin(), w213.end(), std::ref(qs32rng)); - std::generate(w214.begin(), w214.end(), std::ref(qu8rng)); - std::generate(w215.begin(), 
w215.end(), std::ref(qs32rng)); - std::generate(w216.begin(), w216.end(), std::ref(qu8rng)); - std::generate(w217.begin(), w217.end(), std::ref(qs32rng)); - std::generate(w218.begin(), w218.end(), std::ref(qu8rng)); - std::generate(w219.begin(), w219.end(), std::ref(qu8rng)); - std::generate(w220.begin(), w220.end(), std::ref(qu8rng)); - std::generate(w221.begin(), w221.end(), std::ref(qs32rng)); - std::generate(w222.begin(), w222.end(), std::ref(qu8rng)); - std::generate(w223.begin(), w223.end(), std::ref(qs32rng)); - std::generate(w224.begin(), w224.end(), std::ref(qu8rng)); - std::generate(w225.begin(), w225.end(), std::ref(qs32rng)); - std::generate(w226.begin(), w226.end(), std::ref(qu8rng)); - std::generate(w227.begin(), w227.end(), std::ref(qs32rng)); - std::generate(w228.begin(), w228.end(), std::ref(qu8rng)); - std::generate(w229.begin(), w229.end(), std::ref(qs32rng)); - std::generate(w230.begin(), w230.end(), std::ref(qu8rng)); - std::generate(w231.begin(), w231.end(), std::ref(qu8rng)); - std::generate(w232.begin(), w232.end(), std::ref(qu8rng)); - std::generate(w233.begin(), w233.end(), std::ref(qs32rng)); - std::generate(w234.begin(), w234.end(), std::ref(qu8rng)); - std::generate(w235.begin(), w235.end(), std::ref(qs32rng)); - std::generate(w236.begin(), w236.end(), std::ref(qu8rng)); - std::generate(w237.begin(), w237.end(), std::ref(qs32rng)); - std::generate(w238.begin(), w238.end(), std::ref(qu8rng)); - std::generate(w239.begin(), w239.end(), std::ref(qs32rng)); - std::generate(w240.begin(), w240.end(), std::ref(qu8rng)); - std::generate(w241.begin(), w241.end(), std::ref(qs32rng)); - std::generate(w242.begin(), w242.end(), std::ref(qu8rng)); - std::generate(w243.begin(), w243.end(), std::ref(qu8rng)); - std::generate(w244.begin(), w244.end(), std::ref(qu8rng)); - std::generate(w245.begin(), w245.end(), std::ref(qs32rng)); - std::generate(w246.begin(), w246.end(), std::ref(qu8rng)); - std::generate(w247.begin(), w247.end(), std::ref(qs32rng)); - 
std::generate(w248.begin(), w248.end(), std::ref(qu8rng)); - std::generate(w249.begin(), w249.end(), std::ref(qs32rng)); - std::generate(w250.begin(), w250.end(), std::ref(qu8rng)); - std::generate(w251.begin(), w251.end(), std::ref(qs32rng)); - std::generate(w252.begin(), w252.end(), std::ref(qu8rng)); - std::generate(w253.begin(), w253.end(), std::ref(qs32rng)); - std::generate(w254.begin(), w254.end(), std::ref(qu8rng)); - std::generate(w255.begin(), w255.end(), std::ref(qu8rng)); - std::generate(w256.begin(), w256.end(), std::ref(qu8rng)); - std::generate(w257.begin(), w257.end(), std::ref(qs32rng)); - std::generate(w258.begin(), w258.end(), std::ref(qu8rng)); - std::generate(w259.begin(), w259.end(), std::ref(qs32rng)); - std::generate(w260.begin(), w260.end(), std::ref(qu8rng)); - std::generate(w261.begin(), w261.end(), std::ref(qs32rng)); - std::generate(w262.begin(), w262.end(), std::ref(qu8rng)); - std::generate(w263.begin(), w263.end(), std::ref(qs32rng)); - std::generate(w264.begin(), w264.end(), std::ref(qu8rng)); - std::generate(w265.begin(), w265.end(), std::ref(qs32rng)); - std::generate(w266.begin(), w266.end(), std::ref(qu8rng)); - std::generate(w267.begin(), w267.end(), std::ref(qu8rng)); - std::generate(w268.begin(), w268.end(), std::ref(qu8rng)); - std::generate(w269.begin(), w269.end(), std::ref(qs32rng)); - std::generate(w270.begin(), w270.end(), std::ref(qu8rng)); - std::generate(w271.begin(), w271.end(), std::ref(qs32rng)); - std::generate(w272.begin(), w272.end(), std::ref(qu8rng)); - std::generate(w273.begin(), w273.end(), std::ref(qs32rng)); - std::generate(w274.begin(), w274.end(), std::ref(qu8rng)); - std::generate(w275.begin(), w275.end(), std::ref(qs32rng)); - - Operators operators; - xnn_status status; - xnn_code_cache* code_cache_ptr = nullptr; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/1, - 
/*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/3, - /*group_output_channels=*/16, - /*input_channel_stride=*/3, - /*output_channel_stride=*/16, - /*input_zero_point=*/(uint8_t) 128, - /*input_scale=*/0.007874015718698502, - /*kernel_zero_point=*/(uint8_t) 115, - /*kernel_scale=*/0.033770088106393814, - /*kernel=*/w132.data(), /*bias=*/w133.data(), - /*output_zero_point=*/(uint8_t) 118, - /*output_scale=*/0.27370360493659973, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/16, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/16, - /*output_channel_stride=*/16, - /*input_zero_point=*/(uint8_t) 3, - /*input_scale=*/0.14285412430763245, - /*kernel_zero_point=*/(uint8_t) 180, - /*kernel_scale=*/1.2274280786514282, - /*kernel=*/w134.data(), /*bias=*/w135.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.48894554376602173, - 
/*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/16, - /*group_output_channels=*/16, - /*input_channel_stride=*/16, - /*output_channel_stride=*/16, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.48894554376602173, - /*kernel_zero_point=*/(uint8_t) 117, - /*kernel_scale=*/0.003143413458019495, - /*kernel=*/w136.data(), /*bias=*/w137.data(), - /*output_zero_point=*/(uint8_t) 118, - /*output_scale=*/0.31826987862586975, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_add_nd_qu8( - 118 /* input1 zero point */, 0.31826987862586975 /* input1 scale */, - 3 /* input2 zero point */, 0.14285412430763245 /* input2 scale */, - 110 /* output zero point */, 0.32081592082977295 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = 
xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/16, - /*group_output_channels=*/64, - /*input_channel_stride=*/16, - /*output_channel_stride=*/64, - /*input_zero_point=*/(uint8_t) 110, - /*input_scale=*/0.32081592082977295, - /*kernel_zero_point=*/(uint8_t) 132, - /*kernel_scale=*/0.03118671104311943, - /*kernel=*/w138.data(), /*bias=*/w139.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.6120059490203857, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/64, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/64, - /*output_channel_stride=*/64, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.6120059490203857, - /*kernel_zero_point=*/(uint8_t) 121, - /*kernel_scale=*/0.026778604835271835, - /*kernel=*/w140.data(), /*bias=*/w141.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.41203051805496216, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to 
create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/64, - /*group_output_channels=*/24, - /*input_channel_stride=*/64, - /*output_channel_stride=*/24, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.41203051805496216, - /*kernel_zero_point=*/(uint8_t) 133, - /*kernel_scale=*/0.01140664890408516, - /*kernel=*/w142.data(), /*bias=*/w143.data(), - /*output_zero_point=*/(uint8_t) 98, - /*output_scale=*/0.7137808203697205, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/24, - /*group_output_channels=*/72, - /*input_channel_stride=*/24, - /*output_channel_stride=*/72, - /*input_zero_point=*/(uint8_t) 98, - /*input_scale=*/0.7137808203697205, - /*kernel_zero_point=*/(uint8_t) 134, - /*kernel_scale=*/0.0028549986891448498, - /*kernel=*/w144.data(), /*bias=*/w145.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.17176608741283417, - /*output_min=*/(uint8_t) 0, 
/*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/72, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/72, - /*output_channel_stride=*/72, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.17176608741283417, - /*kernel_zero_point=*/(uint8_t) 84, - /*kernel_scale=*/0.05291542783379555, - /*kernel=*/w146.data(), /*bias=*/w147.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.1972651332616806, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/72, - /*group_output_channels=*/24, - /*input_channel_stride=*/72, - /*output_channel_stride=*/24, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.1972651332616806, - /*kernel_zero_point=*/(uint8_t) 134, - 
/*kernel_scale=*/0.015823975205421448, - /*kernel=*/w148.data(), /*bias=*/w149.data(), - /*output_zero_point=*/(uint8_t) 128, - /*output_scale=*/0.7786163091659546, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_add_nd_qu8( - 128 /* input1 zero point */, 0.7786163091659546 /* input1 scale */, - 98 /* input2 zero point */, 0.7137808203697205 /* input2 scale */, - 133 /* output zero point */, 0.8882708549499512 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/24, - /*group_output_channels=*/72, - /*input_channel_stride=*/24, - /*output_channel_stride=*/72, - /*input_zero_point=*/(uint8_t) 133, - /*input_scale=*/0.8882708549499512, - /*kernel_zero_point=*/(uint8_t) 110, - /*kernel_scale=*/0.0035069144796580076, - /*kernel=*/w150.data(), /*bias=*/w151.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.16383449733257294, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to 
create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/1, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/72, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/72, - /*output_channel_stride=*/72, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.16383449733257294, - /*kernel_zero_point=*/(uint8_t) 172, - /*kernel_scale=*/0.0138913718983531, - /*kernel=*/w152.data(), /*bias=*/w153.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.17515037953853607, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 0 /* input zero point */, 0.17515037953853607 /* input scale */, - 0 /* output zero point */, 0.17515037953853607 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, 
/*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/72, - /*group_output_channels=*/24, - /*input_channel_stride=*/72, - /*output_channel_stride=*/24, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.17515037953853607, - /*kernel_zero_point=*/(uint8_t) 1, - /*kernel_scale=*/0.0003568351676221937, - /*kernel=*/w154.data(), /*bias=*/w155.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.06370840221643448, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/24, - /*group_output_channels=*/72, - /*input_channel_stride=*/24, - /*output_channel_stride=*/72, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.06370840221643448, - /*kernel_zero_point=*/(uint8_t) 1, - /*kernel_scale=*/0.0005565338069573045, - /*kernel=*/w156.data(), /*bias=*/w157.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.031534332782030106, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_add_nd_qu8( - 0 /* input1 zero point */, 0.031534332782030106 /* input1 scale */, - 
0 /* input2 zero point */, 0.0117647061124444 /* input2 scale */, - 0 /* output zero point */, 0.023528477177023888 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = xnn_create_multiply_nd_qu8( - 0 /* input1 zero point */, 0.023528477177023888 /* input1 scale */, - 0 /* input2 zero point */, 0.0006536078290082514 /* input2 scale */, - 0 /* output zero point */, 0.003921507392078638 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_multiply_nd_qu8( - 0 /* input1 zero point */, 0.17515037953853607 /* input1 scale */, - 0 /* input2 zero point */, 0.003921507392078638 /* input2 scale */, - 0 /* output zero point */, 0.16822829842567444 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/72, - /*group_output_channels=*/40, - /*input_channel_stride=*/72, - /*output_channel_stride=*/40, - /*input_zero_point=*/(uint8_t) 0, - 
/*input_scale=*/0.16822829842567444, - /*kernel_zero_point=*/(uint8_t) 137, - /*kernel_scale=*/0.012648322619497776, - /*kernel=*/w160.data(), /*bias=*/w161.data(), - /*output_zero_point=*/(uint8_t) 138, - /*output_scale=*/0.46539929509162903, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/40, - /*group_output_channels=*/120, - /*input_channel_stride=*/40, - /*output_channel_stride=*/120, - /*input_zero_point=*/(uint8_t) 138, - /*input_scale=*/0.46539929509162903, - /*kernel_zero_point=*/(uint8_t) 115, - /*kernel_scale=*/0.0016734388191252947, - /*kernel=*/w162.data(), /*bias=*/w163.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.07927421480417252, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/2, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/2, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, 
- /*groups=*/120, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/120, - /*output_channel_stride=*/120, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.07927421480417252, - /*kernel_zero_point=*/(uint8_t) 112, - /*kernel_scale=*/0.04146534577012062, - /*kernel=*/w164.data(), /*bias=*/w165.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.29763659834861755, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 0 /* input zero point */, 0.29763659834861755 /* input scale */, - 0 /* output zero point */, 0.29763659834861755 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/120, - /*group_output_channels=*/32, - /*input_channel_stride=*/120, - /*output_channel_stride=*/32, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.29763659834861755, - /*kernel_zero_point=*/(uint8_t) 154, - /*kernel_scale=*/0.010948714800179005, - /*kernel=*/w166.data(), /*bias=*/w167.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.025719892233610153, - 
/*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/32, - /*group_output_channels=*/120, - /*input_channel_stride=*/32, - /*output_channel_stride=*/120, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.025719892233610153, - /*kernel_zero_point=*/(uint8_t) 112, - /*kernel_scale=*/0.008170774206519127, - /*kernel=*/w168.data(), /*bias=*/w169.data(), - /*output_zero_point=*/(uint8_t) 146, - /*output_scale=*/0.04194885492324829, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_add_nd_qu8( - 146 /* input1 zero point */, 0.04194885492324829 /* input1 scale */, - 0 /* input2 zero point */, 0.0117647061124444 /* input2 scale */, - 0 /* output zero point */, 0.023528477177023888 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = 
xnn_create_multiply_nd_qu8( - 0 /* input1 zero point */, 0.023528477177023888 /* input1 scale */, - 0 /* input2 zero point */, 0.0006536078290082514 /* input2 scale */, - 0 /* output zero point */, 0.003921507392078638 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_multiply_nd_qu8( - 0 /* input1 zero point */, 0.29763659834861755 /* input1 scale */, - 0 /* input2 zero point */, 0.003921507392078638 /* input2 scale */, - 0 /* output zero point */, 0.0508246049284935 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - xnn_operator_t op29 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/120, - /*group_output_channels=*/40, - /*input_channel_stride=*/120, - /*output_channel_stride=*/40, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.0508246049284935, - /*kernel_zero_point=*/(uint8_t) 129, - /*kernel_scale=*/0.0810198038816452, - /*kernel=*/w172.data(), /*bias=*/w173.data(), - /*output_zero_point=*/(uint8_t) 123, - /*output_scale=*/0.38306790590286255, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op29); - if (status != xnn_status_success) { - std::cerr << "failed to 
create operation #29" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op29, xnn_delete_operator); - - xnn_operator_t op30 = nullptr; - status = xnn_create_add_nd_qu8( - 123 /* input1 zero point */, 0.38306790590286255 /* input1 scale */, - 138 /* input2 zero point */, 0.46539929509162903 /* input2 scale */, - 120 /* output zero point */, 0.5553078055381775 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op30); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #30" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op30, xnn_delete_operator); - - xnn_operator_t op31 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/40, - /*group_output_channels=*/120, - /*input_channel_stride=*/40, - /*output_channel_stride=*/120, - /*input_zero_point=*/(uint8_t) 120, - /*input_scale=*/0.5553078055381775, - /*kernel_zero_point=*/(uint8_t) 112, - /*kernel_scale=*/0.0015208977274596691, - /*kernel=*/w174.data(), /*bias=*/w175.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.07210717350244522, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op31); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #31" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op31, xnn_delete_operator); - - xnn_operator_t op32 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/2, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/2, - /*kernel_height=*/5, /*kernel_width=*/5, - 
/*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/120, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/120, - /*output_channel_stride=*/120, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.07210717350244522, - /*kernel_zero_point=*/(uint8_t) 100, - /*kernel_scale=*/0.08610404282808304, - /*kernel=*/w176.data(), /*bias=*/w177.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.47602489590644836, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op32); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #32" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op32, xnn_delete_operator); - - xnn_operator_t op33 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 0 /* input zero point */, 0.47602489590644836 /* input scale */, - 0 /* output zero point */, 0.47602489590644836 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op33); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #33" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op33, xnn_delete_operator); - - xnn_operator_t op34 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/120, - /*group_output_channels=*/32, - /*input_channel_stride=*/120, - /*output_channel_stride=*/32, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.47602489590644836, - /*kernel_zero_point=*/(uint8_t) 148, - /*kernel_scale=*/0.011198465712368488, - /*kernel=*/w178.data(), 
/*bias=*/w179.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.018647782504558563, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op34); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #34" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op34, xnn_delete_operator); - - xnn_operator_t op35 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/32, - /*group_output_channels=*/120, - /*input_channel_stride=*/32, - /*output_channel_stride=*/120, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.018647782504558563, - /*kernel_zero_point=*/(uint8_t) 123, - /*kernel_scale=*/0.007070987951010466, - /*kernel=*/w180.data(), /*bias=*/w181.data(), - /*output_zero_point=*/(uint8_t) 161, - /*output_scale=*/0.037593815475702286, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op35); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #35" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op35, xnn_delete_operator); - - xnn_operator_t op36 = nullptr; - status = xnn_create_add_nd_qu8( - 161 /* input1 zero point */, 0.037593815475702286 /* input1 scale */, - 0 /* input2 zero point */, 0.0117647061124444 /* input2 scale */, - 0 /* output zero point */, 0.023528477177023888 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op36); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #36" << std::endl; - return ExecutionPlan(); - 
} - operators.emplace_back(op36, xnn_delete_operator); - - xnn_operator_t op37 = nullptr; - status = xnn_create_multiply_nd_qu8( - 0 /* input1 zero point */, 0.023528477177023888 /* input1 scale */, - 0 /* input2 zero point */, 0.0006536078290082514 /* input2 scale */, - 0 /* output zero point */, 0.003921507392078638 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op37); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #37" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op37, xnn_delete_operator); - - xnn_operator_t op38 = nullptr; - status = xnn_create_multiply_nd_qu8( - 0 /* input1 zero point */, 0.47602489590644836 /* input1 scale */, - 0 /* input2 zero point */, 0.003921507392078638 /* input2 scale */, - 0 /* output zero point */, 0.04567532241344452 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op38); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #38" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op38, xnn_delete_operator); - - xnn_operator_t op39 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/120, - /*group_output_channels=*/40, - /*input_channel_stride=*/120, - /*output_channel_stride=*/40, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.04567532241344452, - /*kernel_zero_point=*/(uint8_t) 137, - /*kernel_scale=*/0.0395188145339489, - /*kernel=*/w184.data(), /*bias=*/w185.data(), - /*output_zero_point=*/(uint8_t) 113, - /*output_scale=*/0.315862774848938, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - 
/*weights_cache=*/nullptr, - &op39); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #39" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op39, xnn_delete_operator); - - xnn_operator_t op40 = nullptr; - status = xnn_create_add_nd_qu8( - 113 /* input1 zero point */, 0.315862774848938 /* input1 scale */, - 120 /* input2 zero point */, 0.5553078055381775 /* input2 scale */, - 118 /* output zero point */, 0.642050564289093 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op40); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #40" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op40, xnn_delete_operator); - - xnn_operator_t op41 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/40, - /*group_output_channels=*/240, - /*input_channel_stride=*/40, - /*output_channel_stride=*/240, - /*input_zero_point=*/(uint8_t) 118, - /*input_scale=*/0.642050564289093, - /*kernel_zero_point=*/(uint8_t) 145, - /*kernel_scale=*/0.0012737783836200833, - /*kernel=*/w186.data(), /*bias=*/w187.data(), - /*output_zero_point=*/(uint8_t) 105, - /*output_scale=*/0.13308307528495789, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op41); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #41" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op41, xnn_delete_operator); - - xnn_operator_t op42 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op42); - if (status != xnn_status_success) { - std::cerr << "failed 
to create operation #42" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op42, xnn_delete_operator); - - xnn_operator_t op43 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/240, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/240, - /*output_channel_stride=*/240, - /*input_zero_point=*/(uint8_t) 5, - /*input_scale=*/0.07557293772697449, - /*kernel_zero_point=*/(uint8_t) 131, - /*kernel_scale=*/0.05084514245390892, - /*kernel=*/w188.data(), /*bias=*/w189.data(), - /*output_zero_point=*/(uint8_t) 144, - /*output_scale=*/0.17950370907783508, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op43); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #43" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op43, xnn_delete_operator); - - xnn_operator_t op44 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op44); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #44" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op44, xnn_delete_operator); - - xnn_operator_t op45 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/240, - /*group_output_channels=*/80, - /*input_channel_stride=*/240, - /*output_channel_stride=*/80, - 
/*input_zero_point=*/(uint8_t) 5, - /*input_scale=*/0.07313619554042816, - /*kernel_zero_point=*/(uint8_t) 131, - /*kernel_scale=*/0.01282998826354742, - /*kernel=*/w190.data(), /*bias=*/w191.data(), - /*output_zero_point=*/(uint8_t) 130, - /*output_scale=*/0.3045005798339844, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op45); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #45" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op45, xnn_delete_operator); - - xnn_operator_t op46 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/80, - /*group_output_channels=*/200, - /*input_channel_stride=*/80, - /*output_channel_stride=*/200, - /*input_zero_point=*/(uint8_t) 130, - /*input_scale=*/0.3045005798339844, - /*kernel_zero_point=*/(uint8_t) 132, - /*kernel_scale=*/0.001329420367255807, - /*kernel=*/w192.data(), /*bias=*/w193.data(), - /*output_zero_point=*/(uint8_t) 93, - /*output_scale=*/0.09378162026405334, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op46); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #46" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op46, xnn_delete_operator); - - xnn_operator_t op47 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op47); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #47" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op47, xnn_delete_operator); - - xnn_operator_t 
op48 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/200, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/200, - /*output_channel_stride=*/200, - /*input_zero_point=*/(uint8_t) 7, - /*input_scale=*/0.05536928027868271, - /*kernel_zero_point=*/(uint8_t) 146, - /*kernel_scale=*/0.08685924112796783, - /*kernel=*/w194.data(), /*bias=*/w195.data(), - /*output_zero_point=*/(uint8_t) 137, - /*output_scale=*/0.06324371695518494, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op48); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #48" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op48, xnn_delete_operator); - - xnn_operator_t op49 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op49); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #49" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op49, xnn_delete_operator); - - xnn_operator_t op50 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/200, - /*group_output_channels=*/80, - /*input_channel_stride=*/200, - /*output_channel_stride=*/80, - /*input_zero_point=*/(uint8_t) 15, - /*input_scale=*/0.02439218945801258, - /*kernel_zero_point=*/(uint8_t) 110, - /*kernel_scale=*/0.05305293947458267, - 
/*kernel=*/w196.data(), /*bias=*/w197.data(), - /*output_zero_point=*/(uint8_t) 147, - /*output_scale=*/0.3730231821537018, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op50); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #50" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op50, xnn_delete_operator); - - xnn_operator_t op51 = nullptr; - status = xnn_create_add_nd_qu8( - 147 /* input1 zero point */, 0.3730231821537018 /* input1 scale */, - 130 /* input2 zero point */, 0.3045005798339844 /* input2 scale */, - 143 /* output zero point */, 0.3441302180290222 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op51); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #51" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op51, xnn_delete_operator); - - xnn_operator_t op52 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/80, - /*group_output_channels=*/184, - /*input_channel_stride=*/80, - /*output_channel_stride=*/184, - /*input_zero_point=*/(uint8_t) 143, - /*input_scale=*/0.3441302180290222, - /*kernel_zero_point=*/(uint8_t) 114, - /*kernel_scale=*/0.0012470033252611756, - /*kernel=*/w198.data(), /*bias=*/w199.data(), - /*output_zero_point=*/(uint8_t) 112, - /*output_scale=*/0.10203129798173904, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op52); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #52" << std::endl; - 
return ExecutionPlan(); - } - operators.emplace_back(op52, xnn_delete_operator); - - xnn_operator_t op53 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op53); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #53" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op53, xnn_delete_operator); - - xnn_operator_t op54 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/184, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/184, - /*output_channel_stride=*/184, - /*input_zero_point=*/(uint8_t) 7, - /*input_scale=*/0.05502431467175484, - /*kernel_zero_point=*/(uint8_t) 150, - /*kernel_scale=*/0.08035282045602798, - /*kernel=*/w200.data(), /*bias=*/w201.data(), - /*output_zero_point=*/(uint8_t) 143, - /*output_scale=*/0.08216419816017151, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op54); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #54" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op54, xnn_delete_operator); - - xnn_operator_t op55 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op55); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #55" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op55, xnn_delete_operator); - - xnn_operator_t op56 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, 
/*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/184, - /*group_output_channels=*/80, - /*input_channel_stride=*/184, - /*output_channel_stride=*/80, - /*input_zero_point=*/(uint8_t) 11, - /*input_scale=*/0.03298536315560341, - /*kernel_zero_point=*/(uint8_t) 120, - /*kernel_scale=*/0.032908618450164795, - /*kernel=*/w202.data(), /*bias=*/w203.data(), - /*output_zero_point=*/(uint8_t) 118, - /*output_scale=*/0.2424711138010025, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op56); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #56" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op56, xnn_delete_operator); - - xnn_operator_t op57 = nullptr; - status = xnn_create_add_nd_qu8( - 118 /* input1 zero point */, 0.2424711138010025 /* input1 scale */, - 143 /* input2 zero point */, 0.3441302180290222 /* input2 scale */, - 129 /* output zero point */, 0.3845607340335846 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op57); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #57" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op57, xnn_delete_operator); - - xnn_operator_t op58 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/80, - /*group_output_channels=*/184, - /*input_channel_stride=*/80, - /*output_channel_stride=*/184, - /*input_zero_point=*/(uint8_t) 129, - /*input_scale=*/0.3845607340335846, - /*kernel_zero_point=*/(uint8_t) 124, - /*kernel_scale=*/0.0009509262163192034, - 
/*kernel=*/w204.data(), /*bias=*/w205.data(), - /*output_zero_point=*/(uint8_t) 127, - /*output_scale=*/0.08432933688163757, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op58); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #58" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op58, xnn_delete_operator); - - xnn_operator_t op59 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op59); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #59" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op59, xnn_delete_operator); - - xnn_operator_t op60 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/184, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/184, - /*output_channel_stride=*/184, - /*input_zero_point=*/(uint8_t) 9, - /*input_scale=*/0.04154029116034508, - /*kernel_zero_point=*/(uint8_t) 103, - /*kernel_scale=*/0.16053271293640137, - /*kernel=*/w206.data(), /*bias=*/w207.data(), - /*output_zero_point=*/(uint8_t) 143, - /*output_scale=*/0.12306679785251617, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op60); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #60" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op60, xnn_delete_operator); - - xnn_operator_t op61 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op61); - if (status != xnn_status_success) { - std::cerr << "failed to create operation 
#61" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op61, xnn_delete_operator); - - xnn_operator_t op62 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/184, - /*group_output_channels=*/80, - /*input_channel_stride=*/184, - /*output_channel_stride=*/80, - /*input_zero_point=*/(uint8_t) 7, - /*input_scale=*/0.050543028861284256, - /*kernel_zero_point=*/(uint8_t) 115, - /*kernel_scale=*/0.022296851500868797, - /*kernel=*/w208.data(), /*bias=*/w209.data(), - /*output_zero_point=*/(uint8_t) 128, - /*output_scale=*/0.243500754237175, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op62); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #62" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op62, xnn_delete_operator); - - xnn_operator_t op63 = nullptr; - status = xnn_create_add_nd_qu8( - 128 /* input1 zero point */, 0.243500754237175 /* input1 scale */, - 129 /* input2 zero point */, 0.3845607340335846 /* input2 scale */, - 121 /* output zero point */, 0.3455895185470581 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op63); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #63" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op63, xnn_delete_operator); - - xnn_operator_t op64 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, 
/*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/80, - /*group_output_channels=*/480, - /*input_channel_stride=*/80, - /*output_channel_stride=*/480, - /*input_zero_point=*/(uint8_t) 121, - /*input_scale=*/0.3455895185470581, - /*kernel_zero_point=*/(uint8_t) 83, - /*kernel_scale=*/0.002185759600251913, - /*kernel=*/w210.data(), /*bias=*/w211.data(), - /*output_zero_point=*/(uint8_t) 124, - /*output_scale=*/0.14602775871753693, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op64); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #64" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op64, xnn_delete_operator); - - xnn_operator_t op65 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op65); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #65" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op65, xnn_delete_operator); - - xnn_operator_t op66 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/480, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/480, - /*output_channel_stride=*/480, - /*input_zero_point=*/(uint8_t) 5, - /*input_scale=*/0.07129283249378204, - /*kernel_zero_point=*/(uint8_t) 118, - /*kernel_scale=*/0.17997917532920837, - /*kernel=*/w212.data(), /*bias=*/w213.data(), - /*output_zero_point=*/(uint8_t) 107, - /*output_scale=*/0.21639184653759003, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - 
/*weights_cache=*/nullptr, - &op66); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #66" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op66, xnn_delete_operator); - - xnn_operator_t op67 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op67); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #67" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op67, xnn_delete_operator); - - xnn_operator_t op68 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 3 /* input zero point */, 0.11743443459272385 /* input scale */, - 3 /* output zero point */, 0.11743443459272385 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op68); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #68" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op68, xnn_delete_operator); - - xnn_operator_t op69 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/480, - /*group_output_channels=*/120, - /*input_channel_stride=*/480, - /*output_channel_stride=*/120, - /*input_zero_point=*/(uint8_t) 3, - /*input_scale=*/0.11743443459272385, - /*kernel_zero_point=*/(uint8_t) 151, - /*kernel_scale=*/0.00905598234385252, - /*kernel=*/w214.data(), /*bias=*/w215.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.016878068447113037, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op69); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #69" << std::endl; - return 
ExecutionPlan(); - } - operators.emplace_back(op69, xnn_delete_operator); - - xnn_operator_t op70 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/120, - /*group_output_channels=*/480, - /*input_channel_stride=*/120, - /*output_channel_stride=*/480, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.016878068447113037, - /*kernel_zero_point=*/(uint8_t) 118, - /*kernel_scale=*/0.00806900393217802, - /*kernel=*/w216.data(), /*bias=*/w217.data(), - /*output_zero_point=*/(uint8_t) 154, - /*output_scale=*/0.04319746419787407, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op70); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #70" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op70, xnn_delete_operator); - - xnn_operator_t op71 = nullptr; - status = xnn_create_add_nd_qu8( - 154 /* input1 zero point */, 0.04319746419787407 /* input1 scale */, - 0 /* input2 zero point */, 0.0117647061124444 /* input2 scale */, - 0 /* output zero point */, 0.023528477177023888 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op71); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #71" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op71, xnn_delete_operator); - - xnn_operator_t op72 = nullptr; - status = xnn_create_multiply_nd_qu8( - 0 /* input1 zero point */, 0.023528477177023888 /* input1 scale */, - 0 /* input2 zero point */, 0.0006536078290082514 /* input2 scale */, - 0 /* output zero point */, 0.003921507392078638 /* output scale */, - 0 /* output 
min */, 255 /* output max */, - 0 /* flags */, - &op72); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #72" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op72, xnn_delete_operator); - - xnn_operator_t op73 = nullptr; - status = xnn_create_multiply_nd_qu8( - 3 /* input1 zero point */, 0.11743443459272385 /* input1 scale */, - 0 /* input2 zero point */, 0.003921507392078638 /* input2 scale */, - 11 /* output zero point */, 0.032692719250917435 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op73); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #73" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op73, xnn_delete_operator); - - xnn_operator_t op74 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/480, - /*group_output_channels=*/112, - /*input_channel_stride=*/480, - /*output_channel_stride=*/112, - /*input_zero_point=*/(uint8_t) 11, - /*input_scale=*/0.032692719250917435, - /*kernel_zero_point=*/(uint8_t) 120, - /*kernel_scale=*/0.02655486948788166, - /*kernel=*/w220.data(), /*bias=*/w221.data(), - /*output_zero_point=*/(uint8_t) 126, - /*output_scale=*/0.28324681520462036, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op74); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #74" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op74, xnn_delete_operator); - - xnn_operator_t op75 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, 
/*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/112, - /*group_output_channels=*/672, - /*input_channel_stride=*/112, - /*output_channel_stride=*/672, - /*input_zero_point=*/(uint8_t) 126, - /*input_scale=*/0.28324681520462036, - /*kernel_zero_point=*/(uint8_t) 122, - /*kernel_scale=*/0.0020197967533022165, - /*kernel=*/w222.data(), /*bias=*/w223.data(), - /*output_zero_point=*/(uint8_t) 104, - /*output_scale=*/0.1581750512123108, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op75); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #75" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op75, xnn_delete_operator); - - xnn_operator_t op76 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op76); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #76" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op76, xnn_delete_operator); - - xnn_operator_t op77 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/672, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/672, - /*output_channel_stride=*/672, - /*input_zero_point=*/(uint8_t) 4, - /*input_scale=*/0.086741141974926, - /*kernel_zero_point=*/(uint8_t) 131, - /*kernel_scale=*/0.10931526124477386, - /*kernel=*/w224.data(), /*bias=*/w225.data(), - /*output_zero_point=*/(uint8_t) 134, - 
/*output_scale=*/0.32434141635894775, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op77); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #77" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op77, xnn_delete_operator); - - xnn_operator_t op78 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op78); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #78" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op78, xnn_delete_operator); - - xnn_operator_t op79 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 3 /* input zero point */, 0.13992221653461456 /* input scale */, - 3 /* output zero point */, 0.13992221653461456 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op79); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #79" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op79, xnn_delete_operator); - - xnn_operator_t op80 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/672, - /*group_output_channels=*/168, - /*input_channel_stride=*/672, - /*output_channel_stride=*/168, - /*input_zero_point=*/(uint8_t) 3, - /*input_scale=*/0.13992221653461456, - /*kernel_zero_point=*/(uint8_t) 122, - /*kernel_scale=*/0.007790832780301571, - /*kernel=*/w226.data(), /*bias=*/w227.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.011208290234208107, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - 
/*weights_cache=*/nullptr, - &op80); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #80" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op80, xnn_delete_operator); - - xnn_operator_t op81 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/168, - /*group_output_channels=*/672, - /*input_channel_stride=*/168, - /*output_channel_stride=*/672, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.011208290234208107, - /*kernel_zero_point=*/(uint8_t) 102, - /*kernel_scale=*/0.005877777934074402, - /*kernel=*/w228.data(), /*bias=*/w229.data(), - /*output_zero_point=*/(uint8_t) 160, - /*output_scale=*/0.029027709737420082, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op81); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #81" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op81, xnn_delete_operator); - - xnn_operator_t op82 = nullptr; - status = xnn_create_add_nd_qu8( - 160 /* input1 zero point */, 0.029027709737420082 /* input1 scale */, - 0 /* input2 zero point */, 0.0117647061124444 /* input2 scale */, - 0 /* output zero point */, 0.022148869931697845 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op82); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #82" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op82, xnn_delete_operator); - - xnn_operator_t op83 = nullptr; - status = xnn_create_multiply_nd_qu8( - 0 /* input1 zero point */, 0.022148869931697845 /* input1 scale */, - 0 /* 
input2 zero point */, 0.0006536078290082514 /* input2 scale */, - 0 /* output zero point */, 0.0036574413534253836 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op83); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #83" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op83, xnn_delete_operator); - - xnn_operator_t op84 = nullptr; - status = xnn_create_multiply_nd_qu8( - 3 /* input1 zero point */, 0.13992221653461456 /* input1 scale */, - 0 /* input2 zero point */, 0.0036574413534253836 /* input2 scale */, - 14 /* output zero point */, 0.02700776234269142 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op84); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #84" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op84, xnn_delete_operator); - - xnn_operator_t op85 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/672, - /*group_output_channels=*/112, - /*input_channel_stride=*/672, - /*output_channel_stride=*/112, - /*input_zero_point=*/(uint8_t) 14, - /*input_scale=*/0.02700776234269142, - /*kernel_zero_point=*/(uint8_t) 127, - /*kernel_scale=*/0.07714813202619553, - /*kernel=*/w232.data(), /*bias=*/w233.data(), - /*output_zero_point=*/(uint8_t) 139, - /*output_scale=*/0.2981528043746948, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op85); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #85" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op85, 
xnn_delete_operator); - - xnn_operator_t op86 = nullptr; - status = xnn_create_add_nd_qu8( - 139 /* input1 zero point */, 0.2981528043746948 /* input1 scale */, - 126 /* input2 zero point */, 0.28324681520462036 /* input2 scale */, - 133 /* output zero point */, 0.32168200612068176 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op86); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #86" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op86, xnn_delete_operator); - - xnn_operator_t op87 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/112, - /*group_output_channels=*/672, - /*input_channel_stride=*/112, - /*output_channel_stride=*/672, - /*input_zero_point=*/(uint8_t) 133, - /*input_scale=*/0.32168200612068176, - /*kernel_zero_point=*/(uint8_t) 119, - /*kernel_scale=*/0.0033816234208643436, - /*kernel=*/w234.data(), /*bias=*/w235.data(), - /*output_zero_point=*/(uint8_t) 124, - /*output_scale=*/0.1896306872367859, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op87); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #87" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op87, xnn_delete_operator); - - xnn_operator_t op88 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op88); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #88" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op88, xnn_delete_operator); - - xnn_operator_t op89 = nullptr; - status = 
xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/1, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/672, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/672, - /*output_channel_stride=*/672, - /*input_zero_point=*/(uint8_t) 4, - /*input_scale=*/0.0902840867638588, - /*kernel_zero_point=*/(uint8_t) 132, - /*kernel_scale=*/0.036103639751672745, - /*kernel=*/w236.data(), /*bias=*/w237.data(), - /*output_zero_point=*/(uint8_t) 99, - /*output_scale=*/0.22421795129776, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op89); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #89" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op89, xnn_delete_operator); - - xnn_operator_t op90 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op90); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #90" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op90, xnn_delete_operator); - - xnn_operator_t op91 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 3 /* input zero point */, 0.12238067388534546 /* input scale */, - 3 /* output zero point */, 0.12238067388534546 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op91); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #91" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op91, xnn_delete_operator); - - xnn_operator_t op92 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - 
/*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/672, - /*group_output_channels=*/168, - /*input_channel_stride=*/672, - /*output_channel_stride=*/168, - /*input_zero_point=*/(uint8_t) 3, - /*input_scale=*/0.12238067388534546, - /*kernel_zero_point=*/(uint8_t) 124, - /*kernel_scale=*/0.00663334084674716, - /*kernel=*/w238.data(), /*bias=*/w239.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.023197801783680916, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op92); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #92" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op92, xnn_delete_operator); - - xnn_operator_t op93 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/168, - /*group_output_channels=*/672, - /*input_channel_stride=*/168, - /*output_channel_stride=*/672, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.023197801783680916, - /*kernel_zero_point=*/(uint8_t) 88, - /*kernel_scale=*/0.009690321050584316, - /*kernel=*/w240.data(), /*bias=*/w241.data(), - /*output_zero_point=*/(uint8_t) 120, - /*output_scale=*/0.04488217830657959, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op93); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #93" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op93, xnn_delete_operator); - - 
xnn_operator_t op94 = nullptr; - status = xnn_create_add_nd_qu8( - 120 /* input1 zero point */, 0.04488217830657959 /* input1 scale */, - 0 /* input2 zero point */, 0.0117647061124444 /* input2 scale */, - 0 /* output zero point */, 0.023528477177023888 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op94); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #94" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op94, xnn_delete_operator); - - xnn_operator_t op95 = nullptr; - status = xnn_create_multiply_nd_qu8( - 0 /* input1 zero point */, 0.023528477177023888 /* input1 scale */, - 0 /* input2 zero point */, 0.0006536078290082514 /* input2 scale */, - 0 /* output zero point */, 0.003921507392078638 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op95); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #95" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op95, xnn_delete_operator); - - xnn_operator_t op96 = nullptr; - status = xnn_create_multiply_nd_qu8( - 3 /* input1 zero point */, 0.12238067388534546 /* input1 scale */, - 0 /* input2 zero point */, 0.003921507392078638 /* input2 scale */, - 8 /* output zero point */, 0.048699233680963516 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op96); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #96" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op96, xnn_delete_operator); - - xnn_operator_t op97 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/672, - 
/*group_output_channels=*/160, - /*input_channel_stride=*/672, - /*output_channel_stride=*/160, - /*input_zero_point=*/(uint8_t) 8, - /*input_scale=*/0.048699233680963516, - /*kernel_zero_point=*/(uint8_t) 131, - /*kernel_scale=*/0.016730330884456635, - /*kernel=*/w244.data(), /*bias=*/w245.data(), - /*output_zero_point=*/(uint8_t) 125, - /*output_scale=*/0.28946512937545776, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op97); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #97" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op97, xnn_delete_operator); - - xnn_operator_t op98 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/160, - /*group_output_channels=*/960, - /*input_channel_stride=*/160, - /*output_channel_stride=*/960, - /*input_zero_point=*/(uint8_t) 125, - /*input_scale=*/0.28946512937545776, - /*kernel_zero_point=*/(uint8_t) 110, - /*kernel_scale=*/0.001960444264113903, - /*kernel=*/w246.data(), /*bias=*/w247.data(), - /*output_zero_point=*/(uint8_t) 107, - /*output_scale=*/0.14798924326896667, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op98); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #98" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op98, xnn_delete_operator); - - xnn_operator_t op99 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op99); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #99" << std::endl; 
- return ExecutionPlan(); - } - operators.emplace_back(op99, xnn_delete_operator); - - xnn_operator_t op100 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/2, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/2, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/960, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/960, - /*output_channel_stride=*/960, - /*input_zero_point=*/(uint8_t) 5, - /*input_scale=*/0.08019781112670898, - /*kernel_zero_point=*/(uint8_t) 126, - /*kernel_scale=*/0.16313187777996063, - /*kernel=*/w248.data(), /*bias=*/w249.data(), - /*output_zero_point=*/(uint8_t) 113, - /*output_scale=*/0.35713136196136475, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op100); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #100" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op100, xnn_delete_operator); - - xnn_operator_t op101 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op101); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #101" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op101, xnn_delete_operator); - - xnn_operator_t op102 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 2 /* input zero point */, 0.16568607091903687 /* input scale */, - 2 /* output zero point */, 0.16568607091903687 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op102); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #102" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op102, xnn_delete_operator); - - xnn_operator_t op103 = nullptr; - status = 
xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/960, - /*group_output_channels=*/240, - /*input_channel_stride=*/960, - /*output_channel_stride=*/240, - /*input_zero_point=*/(uint8_t) 2, - /*input_scale=*/0.16568607091903687, - /*kernel_zero_point=*/(uint8_t) 119, - /*kernel_scale=*/0.00574096105992794, - /*kernel=*/w250.data(), /*bias=*/w251.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.02531706728041172, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op103); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #103" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op103, xnn_delete_operator); - - xnn_operator_t op104 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/240, - /*group_output_channels=*/960, - /*input_channel_stride=*/240, - /*output_channel_stride=*/960, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.02531706728041172, - /*kernel_zero_point=*/(uint8_t) 76, - /*kernel_scale=*/0.00785693246871233, - /*kernel=*/w252.data(), /*bias=*/w253.data(), - /*output_zero_point=*/(uint8_t) 132, - /*output_scale=*/0.05263541638851166, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op104); - if (status != xnn_status_success) { - 
std::cerr << "failed to create operation #104" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op104, xnn_delete_operator); - - xnn_operator_t op105 = nullptr; - status = xnn_create_add_nd_qu8( - 132 /* input1 zero point */, 0.05263541638851166 /* input1 scale */, - 0 /* input2 zero point */, 0.0117647061124444 /* input2 scale */, - 0 /* output zero point */, 0.023528477177023888 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op105); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #105" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op105, xnn_delete_operator); - - xnn_operator_t op106 = nullptr; - status = xnn_create_multiply_nd_qu8( - 0 /* input1 zero point */, 0.023528477177023888 /* input1 scale */, - 0 /* input2 zero point */, 0.0006536078290082514 /* input2 scale */, - 0 /* output zero point */, 0.003921374212950468 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op106); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #106" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op106, xnn_delete_operator); - - xnn_operator_t op107 = nullptr; - status = xnn_create_multiply_nd_qu8( - 2 /* input1 zero point */, 0.16568607091903687 /* input1 scale */, - 0 /* input2 zero point */, 0.003921374212950468 /* input2 scale */, - 6 /* output zero point */, 0.05613052472472191 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op107); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #107" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op107, xnn_delete_operator); - - xnn_operator_t op108 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, 
- /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/960, - /*group_output_channels=*/160, - /*input_channel_stride=*/960, - /*output_channel_stride=*/160, - /*input_zero_point=*/(uint8_t) 6, - /*input_scale=*/0.05613052472472191, - /*kernel_zero_point=*/(uint8_t) 131, - /*kernel_scale=*/0.030698217451572418, - /*kernel=*/w256.data(), /*bias=*/w257.data(), - /*output_zero_point=*/(uint8_t) 124, - /*output_scale=*/0.25378188490867615, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op108); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #108" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op108, xnn_delete_operator); - - xnn_operator_t op109 = nullptr; - status = xnn_create_add_nd_qu8( - 124 /* input1 zero point */, 0.25378188490867615 /* input1 scale */, - 125 /* input2 zero point */, 0.28946512937545776 /* input2 scale */, - 125 /* output zero point */, 0.3463849723339081 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op109); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #109" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op109, xnn_delete_operator); - - xnn_operator_t op110 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/160, - /*group_output_channels=*/960, - /*input_channel_stride=*/160, - /*output_channel_stride=*/960, - /*input_zero_point=*/(uint8_t) 125, - /*input_scale=*/0.3463849723339081, - /*kernel_zero_point=*/(uint8_t) 110, - 
/*kernel_scale=*/0.0016829369124025106, - /*kernel=*/w258.data(), /*bias=*/w259.data(), - /*output_zero_point=*/(uint8_t) 142, - /*output_scale=*/0.176979660987854, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op110); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #110" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op110, xnn_delete_operator); - - xnn_operator_t op111 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op111); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #111" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op111, xnn_delete_operator); - - xnn_operator_t op112 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/2, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/2, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/960, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/960, - /*output_channel_stride=*/960, - /*input_zero_point=*/(uint8_t) 5, - /*input_scale=*/0.07351326942443848, - /*kernel_zero_point=*/(uint8_t) 131, - /*kernel_scale=*/0.22339750826358795, - /*kernel=*/w260.data(), /*bias=*/w261.data(), - /*output_zero_point=*/(uint8_t) 80, - /*output_scale=*/0.6548007130622864, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op112); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #112" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op112, xnn_delete_operator); - - xnn_operator_t op113 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op113); - if (status != 
xnn_status_success) { - std::cerr << "failed to create operation #113" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op113, xnn_delete_operator); - - xnn_operator_t op114 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 1 /* input zero point */, 0.37250789999961853 /* input scale */, - 1 /* output zero point */, 0.37250789999961853 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op114); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #114" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op114, xnn_delete_operator); - - xnn_operator_t op115 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/960, - /*group_output_channels=*/240, - /*input_channel_stride=*/960, - /*output_channel_stride=*/240, - /*input_zero_point=*/(uint8_t) 1, - /*input_scale=*/0.37250789999961853, - /*kernel_zero_point=*/(uint8_t) 113, - /*kernel_scale=*/0.008814899250864983, - /*kernel=*/w262.data(), /*bias=*/w263.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.05916303023695946, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op115); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #115" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op115, xnn_delete_operator); - - xnn_operator_t op116 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - 
/*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/240, - /*group_output_channels=*/960, - /*input_channel_stride=*/240, - /*output_channel_stride=*/960, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.05916303023695946, - /*kernel_zero_point=*/(uint8_t) 90, - /*kernel_scale=*/0.00719048036262393, - /*kernel=*/w264.data(), /*bias=*/w265.data(), - /*output_zero_point=*/(uint8_t) 126, - /*output_scale=*/0.11174535006284714, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op116); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #116" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op116, xnn_delete_operator); - - xnn_operator_t op117 = nullptr; - status = xnn_create_add_nd_qu8( - 126 /* input1 zero point */, 0.11174535006284714 /* input1 scale */, - 0 /* input2 zero point */, 0.0117647061124444 /* input2 scale */, - 0 /* output zero point */, 0.023528477177023888 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op117); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #117" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op117, xnn_delete_operator); - - xnn_operator_t op118 = nullptr; - status = xnn_create_multiply_nd_qu8( - 0 /* input1 zero point */, 0.023528477177023888 /* input1 scale */, - 0 /* input2 zero point */, 0.0006536078290082514 /* input2 scale */, - 0 /* output zero point */, 0.003921506926417351 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op118); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #118" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op118, xnn_delete_operator); - - xnn_operator_t op119 = nullptr; - status = 
xnn_create_multiply_nd_qu8( - 1 /* input1 zero point */, 0.37250789999961853 /* input1 scale */, - 0 /* input2 zero point */, 0.003921506926417351 /* input2 scale */, - 2 /* output zero point */, 0.21566562354564667 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op119); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #119" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op119, xnn_delete_operator); - - xnn_operator_t op120 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/960, - /*group_output_channels=*/160, - /*input_channel_stride=*/960, - /*output_channel_stride=*/160, - /*input_zero_point=*/(uint8_t) 2, - /*input_scale=*/0.21566562354564667, - /*kernel_zero_point=*/(uint8_t) 141, - /*kernel_scale=*/0.03563878685235977, - /*kernel=*/w268.data(), /*bias=*/w269.data(), - /*output_zero_point=*/(uint8_t) 128, - /*output_scale=*/0.5318803191184998, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op120); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #120" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op120, xnn_delete_operator); - - xnn_operator_t op121 = nullptr; - status = xnn_create_add_nd_qu8( - 128 /* input1 zero point */, 0.5318803191184998 /* input1 scale */, - 125 /* input2 zero point */, 0.3463849723339081 /* input2 scale */, - 127 /* output zero point */, 0.6238188147544861 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op121); - if (status != xnn_status_success) { - std::cerr << "failed 
to create operation #121" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op121, xnn_delete_operator); - - xnn_operator_t op122 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/160, - /*group_output_channels=*/960, - /*input_channel_stride=*/160, - /*output_channel_stride=*/960, - /*input_zero_point=*/(uint8_t) 127, - /*input_scale=*/0.6238188147544861, - /*kernel_zero_point=*/(uint8_t) 126, - /*kernel_scale=*/0.002437639981508255, - /*kernel=*/w270.data(), /*bias=*/w271.data(), - /*output_zero_point=*/(uint8_t) 129, - /*output_scale=*/0.6554628014564514, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op122); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #122" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op122, xnn_delete_operator); - - xnn_operator_t op123 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op123); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #123" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op123, xnn_delete_operator); - - xnn_operator_t op124 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 1 /* input zero point */, 0.2900834381580353 /* input scale */, - 1 /* output zero point */, 0.2900834381580353 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op124); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #124" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op124, xnn_delete_operator); - - 
xnn_operator_t op125 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/960, - /*group_output_channels=*/1280, - /*input_channel_stride=*/960, - /*output_channel_stride=*/1280, - /*input_zero_point=*/(uint8_t) 1, - /*input_scale=*/0.2900834381580353, - /*kernel_zero_point=*/(uint8_t) 128, - /*kernel_scale=*/0.00735006108880043, - /*kernel=*/w272.data(), /*bias=*/w273.data(), - /*output_zero_point=*/(uint8_t) 175, - /*output_scale=*/0.053266387432813644, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op125); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #125" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op125, xnn_delete_operator); - - xnn_operator_t op126 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op126); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #126" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op126, xnn_delete_operator); - - xnn_operator_t op127 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 22 /* input zero point */, 0.017189515754580498 /* input scale */, - 22 /* output zero point */, 0.017189515754580498 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op127); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #127" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op127, xnn_delete_operator); - - xnn_operator_t op128 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, 
- /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/1280, - /*group_output_channels=*/1001, - /*input_channel_stride=*/1280, - /*output_channel_stride=*/1001, - /*input_zero_point=*/(uint8_t) 22, - /*input_scale=*/0.017189515754580498, - /*kernel_zero_point=*/(uint8_t) 94, - /*kernel_scale=*/0.003363430965691805, - /*kernel=*/w274.data(), /*bias=*/w275.data(), - /*output_zero_point=*/(uint8_t) 77, - /*output_scale=*/0.0765276700258255, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op128); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #128" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op128, xnn_delete_operator); - - xnn_operator_t op129 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op129); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #129" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op129, xnn_delete_operator); - - xnn_operator_t op130 = nullptr; - status = xnn_create_softmax_nc_qu8( - /*input_scale=*/0.0765276700258255, - /*output_zero_point=*/0, - /*output_scale=*/0.00390625, - /*flags=*/0, - &op130); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #130" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op130, xnn_delete_operator); - - size_t op0_workspace_size = 0; - size_t op0_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - &op0_workspace_size, &op0_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = 
std::max(max_workspace_size, op0_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op1, - /*batch_size=*/12544, - 16 /* channels */, - 16 /* input stride */, - 16 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - size_t op2_workspace_size = 0; - size_t op2_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op2_workspace_size, &op2_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op2_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #2" << std::endl; - return ExecutionPlan(); - } - - size_t op3_workspace_size = 0; - size_t op3_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op3, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op3_workspace_size, &op3_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op3_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 112, 112, 16 }; - const size_t b_shape[] = { 1, 112, 112, 16 }; - status = xnn_reshape_add_nd_qu8( - op4, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - size_t op5_workspace_size = 0; - size_t op5_workspace_alignment = 0; - status = 
xnn_reshape_convolution2d_nhwc_qu8( - op5, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op5_workspace_size, &op5_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op5_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - size_t op6_workspace_size = 0; - size_t op6_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op6, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op6_workspace_size, &op6_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op6_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - size_t op7_workspace_size = 0; - size_t op7_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op7, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op7_workspace_size, &op7_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op7_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - size_t op8_workspace_size = 0; - size_t op8_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op8, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op8_workspace_size, &op8_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op8_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape 
operation #8" << std::endl; - return ExecutionPlan(); - } - - size_t op9_workspace_size = 0; - size_t op9_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op9, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op9_workspace_size, &op9_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op9_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - size_t op10_workspace_size = 0; - size_t op10_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op10, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op10_workspace_size, &op10_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op10_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 56, 56, 24 }; - const size_t b_shape[] = { 1, 56, 56, 24 }; - status = xnn_reshape_add_nd_qu8( - op11, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - size_t op12_workspace_size = 0; - size_t op12_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op12, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op12_workspace_size, &op12_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op12_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); 
- } - - size_t op13_workspace_size = 0; - size_t op13_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op13, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op13_workspace_size, &op13_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op13_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - size_t op14_workspace_size = 0; - size_t op14_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op14, - /*batch_size=*/1, 784 /* width */, - 72 /* channels */, 72 /* input stride */, 72 /* output stride */, - &op14_workspace_size, &op14_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op14_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - size_t op15_workspace_size = 0; - size_t op15_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op15, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op15_workspace_size, &op15_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op15_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - size_t op16_workspace_size = 0; - size_t op16_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op16, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op16_workspace_size, &op16_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, 
op16_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 72 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_add_nd_qu8( - op17, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 72 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_multiply_nd_qu8( - op18, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 72 }; - const size_t b_shape[] = { 1, 1, 1, 72 }; - status = xnn_reshape_multiply_nd_qu8( - op19, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - size_t op20_workspace_size = 0; - size_t op20_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op20, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op20_workspace_size, &op20_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op20_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - size_t op21_workspace_size = 0; - size_t op21_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op21, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op21_workspace_size, &op21_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - 
/*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op21_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << std::endl; - return ExecutionPlan(); - } - - size_t op22_workspace_size = 0; - size_t op22_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op22, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op22_workspace_size, &op22_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op22_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - size_t op23_workspace_size = 0; - size_t op23_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op23, - /*batch_size=*/1, 784 /* width */, - 120 /* channels */, 120 /* input stride */, 120 /* output stride */, - &op23_workspace_size, &op23_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op23_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - size_t op24_workspace_size = 0; - size_t op24_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op24, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op24_workspace_size, &op24_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op24_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - size_t op25_workspace_size = 0; - size_t op25_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op25, - /*batch_size=*/1, 
/*input_height=*/1, /*input_width=*/1, - &op25_workspace_size, &op25_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op25_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 120 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_add_nd_qu8( - op26, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 120 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_multiply_nd_qu8( - op27, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 120 }; - const size_t b_shape[] = { 1, 1, 1, 120 }; - status = xnn_reshape_multiply_nd_qu8( - op28, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - size_t op29_workspace_size = 0; - size_t op29_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op29, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op29_workspace_size, &op29_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op29_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #29" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 40 }; - const size_t b_shape[] = { 1, 28, 28, 
40 }; - status = xnn_reshape_add_nd_qu8( - op30, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #30" << std::endl; - return ExecutionPlan(); - } - - size_t op31_workspace_size = 0; - size_t op31_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op31, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op31_workspace_size, &op31_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op31_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #31" << std::endl; - return ExecutionPlan(); - } - - size_t op32_workspace_size = 0; - size_t op32_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op32, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op32_workspace_size, &op32_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op32_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #32" << std::endl; - return ExecutionPlan(); - } - - size_t op33_workspace_size = 0; - size_t op33_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op33, - /*batch_size=*/1, 784 /* width */, - 120 /* channels */, 120 /* input stride */, 120 /* output stride */, - &op33_workspace_size, &op33_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op33_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #33" << std::endl; - return ExecutionPlan(); - } - - size_t op34_workspace_size = 0; - size_t op34_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op34, - 
/*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op34_workspace_size, &op34_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op34_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #34" << std::endl; - return ExecutionPlan(); - } - - size_t op35_workspace_size = 0; - size_t op35_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op35, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op35_workspace_size, &op35_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op35_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #35" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 120 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_add_nd_qu8( - op36, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #36" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 120 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_multiply_nd_qu8( - op37, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #37" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 120 }; - const size_t b_shape[] = { 1, 1, 1, 120 }; - status = xnn_reshape_multiply_nd_qu8( - op38, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #38" << std::endl; - return ExecutionPlan(); - } - - size_t op39_workspace_size = 0; - size_t op39_workspace_alignment = 0; 
- status = xnn_reshape_convolution2d_nhwc_qu8( - op39, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op39_workspace_size, &op39_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op39_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #39" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 40 }; - const size_t b_shape[] = { 1, 28, 28, 40 }; - status = xnn_reshape_add_nd_qu8( - op40, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #40" << std::endl; - return ExecutionPlan(); - } - - size_t op41_workspace_size = 0; - size_t op41_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op41, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op41_workspace_size, &op41_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op41_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #41" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op42, - /*batch_size=*/784, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #42" << std::endl; - return ExecutionPlan(); - } - - size_t op43_workspace_size = 0; - size_t op43_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op43, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op43_workspace_size, &op43_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = 
std::max(max_workspace_size, op43_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op44, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #44" << std::endl; - return ExecutionPlan(); - } - - size_t op45_workspace_size = 0; - size_t op45_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op45, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op45_workspace_size, &op45_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op45_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #45" << std::endl; - return ExecutionPlan(); - } - - size_t op46_workspace_size = 0; - size_t op46_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op46, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op46_workspace_size, &op46_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op46_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op47, - /*batch_size=*/196, - 200 /* channels */, - 200 /* input stride */, - 200 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #47" << std::endl; - return ExecutionPlan(); - } - - size_t op48_workspace_size = 0; - size_t op48_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op48, - 
/*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op48_workspace_size, &op48_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op48_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #48" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op49, - /*batch_size=*/196, - 200 /* channels */, - 200 /* input stride */, - 200 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #49" << std::endl; - return ExecutionPlan(); - } - - size_t op50_workspace_size = 0; - size_t op50_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op50, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op50_workspace_size, &op50_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op50_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #50" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 80 }; - const size_t b_shape[] = { 1, 14, 14, 80 }; - status = xnn_reshape_add_nd_qu8( - op51, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #51" << std::endl; - return ExecutionPlan(); - } - - size_t op52_workspace_size = 0; - size_t op52_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op52, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op52_workspace_size, &op52_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op52_workspace_size); - if 
(status != xnn_status_success) { - std::cerr << "failed to reshape operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op53, - /*batch_size=*/196, - 184 /* channels */, - 184 /* input stride */, - 184 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #53" << std::endl; - return ExecutionPlan(); - } - - size_t op54_workspace_size = 0; - size_t op54_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op54, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op54_workspace_size, &op54_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op54_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op55, - /*batch_size=*/196, - 184 /* channels */, - 184 /* input stride */, - 184 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #55" << std::endl; - return ExecutionPlan(); - } - - size_t op56_workspace_size = 0; - size_t op56_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op56, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op56_workspace_size, &op56_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op56_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #56" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 80 }; - const size_t b_shape[] = { 1, 14, 14, 80 }; - status = xnn_reshape_add_nd_qu8( - op57, - 4, a_shape, 4, b_shape, - 
/*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #57" << std::endl; - return ExecutionPlan(); - } - - size_t op58_workspace_size = 0; - size_t op58_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op58, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op58_workspace_size, &op58_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op58_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op59, - /*batch_size=*/196, - 184 /* channels */, - 184 /* input stride */, - 184 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #59" << std::endl; - return ExecutionPlan(); - } - - size_t op60_workspace_size = 0; - size_t op60_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op60, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op60_workspace_size, &op60_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op60_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op61, - /*batch_size=*/196, - 184 /* channels */, - 184 /* input stride */, - 184 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #61" << std::endl; - return ExecutionPlan(); - } - - size_t op62_workspace_size = 0; - size_t op62_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op62, - /*batch_size=*/1, 
/*input_height=*/14, /*input_width=*/14, - &op62_workspace_size, &op62_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op62_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #62" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 80 }; - const size_t b_shape[] = { 1, 14, 14, 80 }; - status = xnn_reshape_add_nd_qu8( - op63, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #63" << std::endl; - return ExecutionPlan(); - } - - size_t op64_workspace_size = 0; - size_t op64_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op64, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op64_workspace_size, &op64_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op64_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #64" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op65, - /*batch_size=*/196, - 480 /* channels */, - 480 /* input stride */, - 480 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #65" << std::endl; - return ExecutionPlan(); - } - - size_t op66_workspace_size = 0; - size_t op66_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op66, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op66_workspace_size, &op66_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op66_workspace_size); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #66" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op67, - /*batch_size=*/196, - 480 /* channels */, - 480 /* input stride */, - 480 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #67" << std::endl; - return ExecutionPlan(); - } - - size_t op68_workspace_size = 0; - size_t op68_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op68, - /*batch_size=*/1, 196 /* width */, - 480 /* channels */, 480 /* input stride */, 480 /* output stride */, - &op68_workspace_size, &op68_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op68_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #68" << std::endl; - return ExecutionPlan(); - } - - size_t op69_workspace_size = 0; - size_t op69_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op69, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op69_workspace_size, &op69_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op69_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #69" << std::endl; - return ExecutionPlan(); - } - - size_t op70_workspace_size = 0; - size_t op70_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op70, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op70_workspace_size, &op70_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op70_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #70" << std::endl; - 
return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 480 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_add_nd_qu8( - op71, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #71" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 480 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_multiply_nd_qu8( - op72, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #72" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 480 }; - const size_t b_shape[] = { 1, 1, 1, 480 }; - status = xnn_reshape_multiply_nd_qu8( - op73, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #73" << std::endl; - return ExecutionPlan(); - } - - size_t op74_workspace_size = 0; - size_t op74_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op74, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op74_workspace_size, &op74_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op74_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #74" << std::endl; - return ExecutionPlan(); - } - - size_t op75_workspace_size = 0; - size_t op75_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op75, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op75_workspace_size, &op75_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op75_workspace_size); - if (status != xnn_status_success) { - 
std::cerr << "failed to reshape operation #75" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op76, - /*batch_size=*/196, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #76" << std::endl; - return ExecutionPlan(); - } - - size_t op77_workspace_size = 0; - size_t op77_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op77, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op77_workspace_size, &op77_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op77_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #77" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op78, - /*batch_size=*/196, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #78" << std::endl; - return ExecutionPlan(); - } - - size_t op79_workspace_size = 0; - size_t op79_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op79, - /*batch_size=*/1, 196 /* width */, - 672 /* channels */, 672 /* input stride */, 672 /* output stride */, - &op79_workspace_size, &op79_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op79_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #79" << std::endl; - return ExecutionPlan(); - } - - size_t op80_workspace_size = 0; - size_t op80_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op80, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op80_workspace_size, 
&op80_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op80_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #80" << std::endl; - return ExecutionPlan(); - } - - size_t op81_workspace_size = 0; - size_t op81_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op81, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op81_workspace_size, &op81_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op81_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #81" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 672 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_add_nd_qu8( - op82, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #82" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 672 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_multiply_nd_qu8( - op83, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #83" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 672 }; - const size_t b_shape[] = { 1, 1, 1, 672 }; - status = xnn_reshape_multiply_nd_qu8( - op84, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #84" << std::endl; - return ExecutionPlan(); - } - - size_t op85_workspace_size = 0; - size_t op85_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op85, - /*batch_size=*/1, 
/*input_height=*/14, /*input_width=*/14, - &op85_workspace_size, &op85_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op85_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #85" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 112 }; - const size_t b_shape[] = { 1, 14, 14, 112 }; - status = xnn_reshape_add_nd_qu8( - op86, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #86" << std::endl; - return ExecutionPlan(); - } - - size_t op87_workspace_size = 0; - size_t op87_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op87, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op87_workspace_size, &op87_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op87_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #87" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op88, - /*batch_size=*/196, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #88" << std::endl; - return ExecutionPlan(); - } - - size_t op89_workspace_size = 0; - size_t op89_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op89, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op89_workspace_size, &op89_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op89_workspace_size); - if (status != 
xnn_status_success) { - std::cerr << "failed to reshape operation #89" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op90, - /*batch_size=*/49, - 672 /* channels */, - 672 /* input stride */, - 672 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #90" << std::endl; - return ExecutionPlan(); - } - - size_t op91_workspace_size = 0; - size_t op91_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op91, - /*batch_size=*/1, 49 /* width */, - 672 /* channels */, 672 /* input stride */, 672 /* output stride */, - &op91_workspace_size, &op91_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op91_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #91" << std::endl; - return ExecutionPlan(); - } - - size_t op92_workspace_size = 0; - size_t op92_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op92, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op92_workspace_size, &op92_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op92_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #92" << std::endl; - return ExecutionPlan(); - } - - size_t op93_workspace_size = 0; - size_t op93_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op93, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op93_workspace_size, &op93_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op93_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #93" << std::endl; - 
return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 672 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_add_nd_qu8( - op94, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #94" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 672 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_multiply_nd_qu8( - op95, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #95" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 672 }; - const size_t b_shape[] = { 1, 1, 1, 672 }; - status = xnn_reshape_multiply_nd_qu8( - op96, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #96" << std::endl; - return ExecutionPlan(); - } - - size_t op97_workspace_size = 0; - size_t op97_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op97, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op97_workspace_size, &op97_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op97_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #97" << std::endl; - return ExecutionPlan(); - } - - size_t op98_workspace_size = 0; - size_t op98_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op98, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op98_workspace_size, &op98_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op98_workspace_size); - if (status != xnn_status_success) { - 
std::cerr << "failed to reshape operation #98" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op99, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #99" << std::endl; - return ExecutionPlan(); - } - - size_t op100_workspace_size = 0; - size_t op100_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op100, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op100_workspace_size, &op100_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op100_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #100" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op101, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #101" << std::endl; - return ExecutionPlan(); - } - - size_t op102_workspace_size = 0; - size_t op102_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op102, - /*batch_size=*/1, 49 /* width */, - 960 /* channels */, 960 /* input stride */, 960 /* output stride */, - &op102_workspace_size, &op102_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op102_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #102" << std::endl; - return ExecutionPlan(); - } - - size_t op103_workspace_size = 0; - size_t op103_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op103, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op103_workspace_size, 
&op103_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op103_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #103" << std::endl; - return ExecutionPlan(); - } - - size_t op104_workspace_size = 0; - size_t op104_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op104, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op104_workspace_size, &op104_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op104_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #104" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 960 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_add_nd_qu8( - op105, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #105" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 960 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_multiply_nd_qu8( - op106, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #106" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 960 }; - const size_t b_shape[] = { 1, 1, 1, 960 }; - status = xnn_reshape_multiply_nd_qu8( - op107, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #107" << std::endl; - return ExecutionPlan(); - } - - size_t op108_workspace_size = 0; - size_t op108_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op108, - 
/*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op108_workspace_size, &op108_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op108_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #108" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 160 }; - const size_t b_shape[] = { 1, 7, 7, 160 }; - status = xnn_reshape_add_nd_qu8( - op109, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #109" << std::endl; - return ExecutionPlan(); - } - - size_t op110_workspace_size = 0; - size_t op110_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op110, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op110_workspace_size, &op110_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op110_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #110" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op111, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #111" << std::endl; - return ExecutionPlan(); - } - - size_t op112_workspace_size = 0; - size_t op112_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op112, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op112_workspace_size, &op112_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, 
op112_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #112" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op113, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #113" << std::endl; - return ExecutionPlan(); - } - - size_t op114_workspace_size = 0; - size_t op114_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op114, - /*batch_size=*/1, 49 /* width */, - 960 /* channels */, 960 /* input stride */, 960 /* output stride */, - &op114_workspace_size, &op114_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op114_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #114" << std::endl; - return ExecutionPlan(); - } - - size_t op115_workspace_size = 0; - size_t op115_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op115, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op115_workspace_size, &op115_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op115_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #115" << std::endl; - return ExecutionPlan(); - } - - size_t op116_workspace_size = 0; - size_t op116_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op116, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op116_workspace_size, &op116_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op116_workspace_size); - if (status != xnn_status_success) { - 
std::cerr << "failed to reshape operation #116" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 960 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_add_nd_qu8( - op117, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #117" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 960 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_multiply_nd_qu8( - op118, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #118" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 960 }; - const size_t b_shape[] = { 1, 1, 1, 960 }; - status = xnn_reshape_multiply_nd_qu8( - op119, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #119" << std::endl; - return ExecutionPlan(); - } - - size_t op120_workspace_size = 0; - size_t op120_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op120, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op120_workspace_size, &op120_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op120_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #120" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 160 }; - const size_t b_shape[] = { 1, 7, 7, 160 }; - status = xnn_reshape_add_nd_qu8( - op121, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #121" << std::endl; - return ExecutionPlan(); - } - - size_t op122_workspace_size = 0; - size_t 
op122_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op122, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op122_workspace_size, &op122_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op122_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #122" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op123, - /*batch_size=*/49, - 960 /* channels */, - 960 /* input stride */, - 960 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #123" << std::endl; - return ExecutionPlan(); - } - - size_t op124_workspace_size = 0; - size_t op124_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op124, - /*batch_size=*/1, 49 /* width */, - 960 /* channels */, 960 /* input stride */, 960 /* output stride */, - &op124_workspace_size, &op124_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op124_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #124" << std::endl; - return ExecutionPlan(); - } - - size_t op125_workspace_size = 0; - size_t op125_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op125, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op125_workspace_size, &op125_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op125_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #125" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op126, - /*batch_size=*/1, - 1280 /* channels */, - 1280 /* input stride 
*/, - 1280 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #126" << std::endl; - return ExecutionPlan(); - } - - size_t op127_workspace_size = 0; - size_t op127_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op127, - /*batch_size=*/1, 1 /* width */, - 1280 /* channels */, 1280 /* input stride */, 1280 /* output stride */, - &op127_workspace_size, &op127_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op127_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #127" << std::endl; - return ExecutionPlan(); - } - - size_t op128_workspace_size = 0; - size_t op128_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op128, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op128_workspace_size, &op128_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op128_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #128" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op129, - /*batch_size=*/1001, - 1 /* channels */, - 1 /* input stride */, - 1 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #129" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_softmax_nc_qu8( - op130, - /*channels=*/1001, - /*input_stride=*/1001, - /*output_stride=*/1001, - /*batch_size=*/1, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #130" << std::endl; - return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nhwc_qu8( - op0, - 
workspace.data(), /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op1, - /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op2, - workspace.data(), /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op3, - workspace.data(), /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op4, - v4.data() /* a */, v2.data() /* b */, /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op5, - workspace.data(), /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op6, - workspace.data(), /*input=*/v6.data(), /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op7, - workspace.data(), /*input=*/v7.data(), /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op8, - workspace.data(), /*input=*/v8.data(), /*output=*/v9.data()); - if (status 
!= xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op9, - workspace.data(), /*input=*/v9.data(), /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op10, - workspace.data(), /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op11, - v11.data() /* a */, v8.data() /* b */, /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op12, - workspace.data(), /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op13, - workspace.data(), /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op14, - workspace.data(), - /*input=*/v14.data(), /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op15, - workspace.data(), /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op16, - workspace.data(), /*input=*/v16.data(), /*output=*/v17.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op17, - v17.data() /* a */, w158.data() /* b */, /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op18, - v18.data() /* a */, w159.data() /* b */, /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op19, - v14.data() /* a */, v19.data() /* b */, /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op20, - workspace.data(), /*input=*/v20.data(), /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op21, - workspace.data(), /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op22, - workspace.data(), /*input=*/v22.data(), /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op23, - workspace.data(), - /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op24, - workspace.data(), /*input=*/v24.data(), /*output=*/v25.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op25, - workspace.data(), /*input=*/v25.data(), /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op26, - v26.data() /* a */, w170.data() /* b */, /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op27, - v27.data() /* a */, w171.data() /* b */, /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op28, - v23.data() /* a */, v28.data() /* b */, /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op29, - workspace.data(), /*input=*/v29.data(), /*output=*/v30.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op30, - v30.data() /* a */, v21.data() /* b */, /*output=*/v31.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #30" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op31, - workspace.data(), /*input=*/v31.data(), /*output=*/v32.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op32, - workspace.data(), /*input=*/v32.data(), /*output=*/v33.data()); - if (status != xnn_status_success) { - std::cerr 
<< "failed to setup operation #32" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op33, - workspace.data(), - /*input=*/v33.data(), /*output=*/v34.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #33" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op34, - workspace.data(), /*input=*/v34.data(), /*output=*/v35.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op35, - workspace.data(), /*input=*/v35.data(), /*output=*/v36.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op36, - v36.data() /* a */, w182.data() /* b */, /*output=*/v37.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op37, - v37.data() /* a */, w183.data() /* b */, /*output=*/v38.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op38, - v33.data() /* a */, v38.data() /* b */, /*output=*/v39.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op39, - workspace.data(), /*input=*/v39.data(), /*output=*/v40.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op40, - v40.data() /* a */, v31.data() /* b */, /*output=*/v41.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup 
operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op41, - workspace.data(), /*input=*/v41.data(), /*output=*/v42.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #41" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op42, - /*input=*/v42.data(), /*output=*/v43.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op43, - workspace.data(), /*input=*/v43.data(), /*output=*/v44.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op44, - /*input=*/v44.data(), /*output=*/v45.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op45, - workspace.data(), /*input=*/v45.data(), /*output=*/v46.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #45" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op46, - workspace.data(), /*input=*/v46.data(), /*output=*/v47.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op47, - /*input=*/v47.data(), /*output=*/v48.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op48, - workspace.data(), /*input=*/v48.data(), /*output=*/v49.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #48" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_copy_nc_x8( - op49, - /*input=*/v49.data(), /*output=*/v50.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #49" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op50, - workspace.data(), /*input=*/v50.data(), /*output=*/v51.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #50" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op51, - v51.data() /* a */, v46.data() /* b */, /*output=*/v52.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op52, - workspace.data(), /*input=*/v52.data(), /*output=*/v53.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op53, - /*input=*/v53.data(), /*output=*/v54.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op54, - workspace.data(), /*input=*/v54.data(), /*output=*/v55.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op55, - /*input=*/v55.data(), /*output=*/v56.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op56, - workspace.data(), /*input=*/v56.data(), /*output=*/v57.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #56" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op57, - v57.data() /* a */, v52.data() /* b */, /*output=*/v58.data()); - if 
(status != xnn_status_success) { - std::cerr << "failed to setup operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op58, - workspace.data(), /*input=*/v58.data(), /*output=*/v59.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op59, - /*input=*/v59.data(), /*output=*/v60.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #59" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op60, - workspace.data(), /*input=*/v60.data(), /*output=*/v61.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op61, - /*input=*/v61.data(), /*output=*/v62.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #61" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op62, - workspace.data(), /*input=*/v62.data(), /*output=*/v63.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #62" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op63, - v63.data() /* a */, v58.data() /* b */, /*output=*/v64.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #63" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op64, - workspace.data(), /*input=*/v64.data(), /*output=*/v65.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #64" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op65, - /*input=*/v65.data(), /*output=*/v66.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #65" << std::endl; - 
return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op66, - workspace.data(), /*input=*/v66.data(), /*output=*/v67.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #66" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op67, - /*input=*/v67.data(), /*output=*/v68.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #67" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op68, - workspace.data(), - /*input=*/v68.data(), /*output=*/v69.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #68" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op69, - workspace.data(), /*input=*/v69.data(), /*output=*/v70.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #69" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op70, - workspace.data(), /*input=*/v70.data(), /*output=*/v71.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #70" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op71, - v71.data() /* a */, w218.data() /* b */, /*output=*/v72.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #71" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op72, - v72.data() /* a */, w219.data() /* b */, /*output=*/v73.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #72" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op73, - v68.data() /* a */, v73.data() /* b */, /*output=*/v74.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #73" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nhwc_qu8( - op74, - workspace.data(), /*input=*/v74.data(), /*output=*/v75.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #74" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op75, - workspace.data(), /*input=*/v75.data(), /*output=*/v76.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #75" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op76, - /*input=*/v76.data(), /*output=*/v77.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #76" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op77, - workspace.data(), /*input=*/v77.data(), /*output=*/v78.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #77" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op78, - /*input=*/v78.data(), /*output=*/v79.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #78" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op79, - workspace.data(), - /*input=*/v79.data(), /*output=*/v80.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #79" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op80, - workspace.data(), /*input=*/v80.data(), /*output=*/v81.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #80" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op81, - workspace.data(), /*input=*/v81.data(), /*output=*/v82.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #81" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op82, - v82.data() /* a 
*/, w230.data() /* b */, /*output=*/v83.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #82" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op83, - v83.data() /* a */, w231.data() /* b */, /*output=*/v84.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #83" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op84, - v79.data() /* a */, v84.data() /* b */, /*output=*/v85.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #84" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op85, - workspace.data(), /*input=*/v85.data(), /*output=*/v86.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #85" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op86, - v86.data() /* a */, v75.data() /* b */, /*output=*/v87.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #86" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op87, - workspace.data(), /*input=*/v87.data(), /*output=*/v88.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #87" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op88, - /*input=*/v88.data(), /*output=*/v89.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #88" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op89, - workspace.data(), /*input=*/v89.data(), /*output=*/v90.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #89" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op90, - /*input=*/v90.data(), /*output=*/v91.data()); - if (status != xnn_status_success) 
{ - std::cerr << "failed to setup operation #90" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op91, - workspace.data(), - /*input=*/v91.data(), /*output=*/v92.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #91" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op92, - workspace.data(), /*input=*/v92.data(), /*output=*/v93.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #92" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op93, - workspace.data(), /*input=*/v93.data(), /*output=*/v94.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #93" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op94, - v94.data() /* a */, w242.data() /* b */, /*output=*/v95.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #94" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op95, - v95.data() /* a */, w243.data() /* b */, /*output=*/v96.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #95" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op96, - v91.data() /* a */, v96.data() /* b */, /*output=*/v97.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #96" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op97, - workspace.data(), /*input=*/v97.data(), /*output=*/v98.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #97" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op98, - workspace.data(), /*input=*/v98.data(), /*output=*/v99.data()); - if (status != xnn_status_success) { - std::cerr 
<< "failed to setup operation #98" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op99, - /*input=*/v99.data(), /*output=*/v100.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #99" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op100, - workspace.data(), /*input=*/v100.data(), /*output=*/v101.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #100" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op101, - /*input=*/v101.data(), /*output=*/v102.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #101" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op102, - workspace.data(), - /*input=*/v102.data(), /*output=*/v103.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #102" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op103, - workspace.data(), /*input=*/v103.data(), /*output=*/v104.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #103" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op104, - workspace.data(), /*input=*/v104.data(), /*output=*/v105.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #104" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op105, - v105.data() /* a */, w254.data() /* b */, /*output=*/v106.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #105" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op106, - v106.data() /* a */, w255.data() /* b */, /*output=*/v107.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #106" << 
std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op107, - v102.data() /* a */, v107.data() /* b */, /*output=*/v108.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #107" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op108, - workspace.data(), /*input=*/v108.data(), /*output=*/v109.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #108" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op109, - v109.data() /* a */, v98.data() /* b */, /*output=*/v110.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #109" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op110, - workspace.data(), /*input=*/v110.data(), /*output=*/v111.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #110" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op111, - /*input=*/v111.data(), /*output=*/v112.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #111" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op112, - workspace.data(), /*input=*/v112.data(), /*output=*/v113.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #112" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op113, - /*input=*/v113.data(), /*output=*/v114.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #113" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op114, - workspace.data(), - /*input=*/v114.data(), /*output=*/v115.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #114" << std::endl; - return 
ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op115, - workspace.data(), /*input=*/v115.data(), /*output=*/v116.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #115" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op116, - workspace.data(), /*input=*/v116.data(), /*output=*/v117.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #116" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op117, - v117.data() /* a */, w266.data() /* b */, /*output=*/v118.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #117" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op118, - v118.data() /* a */, w267.data() /* b */, /*output=*/v119.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #118" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op119, - v114.data() /* a */, v119.data() /* b */, /*output=*/v120.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #119" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op120, - workspace.data(), /*input=*/v120.data(), /*output=*/v121.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #120" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op121, - v121.data() /* a */, v110.data() /* b */, /*output=*/v122.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #121" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op122, - workspace.data(), /*input=*/v122.data(), /*output=*/v123.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #122" << std::endl; - return 
ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op123, - /*input=*/v123.data(), /*output=*/v124.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #123" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op124, - workspace.data(), - /*input=*/v124.data(), /*output=*/v125.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #124" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op125, - workspace.data(), /*input=*/v125.data(), /*output=*/v126.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #125" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op126, - /*input=*/v126.data(), /*output=*/v127.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #126" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op127, - workspace.data(), - /*input=*/v127.data(), /*output=*/v128.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #127" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op128, - workspace.data(), /*input=*/v128.data(), /*output=*/v129.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #128" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op129, - /*input=*/v129.data(), /*output=*/v130.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #129" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_softmax_nc_qu8( - op130, - /*input=*/v130.data(), /*output=*/v131.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #130" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic 
push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -} // namespace models diff --git a/models/qu8-mobilenet-v3-small.cc b/models/qu8-mobilenet-v3-small.cc deleted file mode 100644 index 4b9d8aef270..00000000000 --- a/models/qu8-mobilenet-v3-small.cc +++ /dev/null @@ -1,5527 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! - -#include "xnnpack.h" - -#include -#include -#include -#include -#include -#include - -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/models.h" - -namespace models { - -ExecutionPlan QU8MobileNetV3Small(pthreadpool_t threadpool) { - alignas(16) static std::array v0; - alignas(16) static std::array v1; - alignas(16) static std::array v2; - alignas(16) static std::array v3; - alignas(16) static std::array v4; - alignas(16) static std::array v5; - alignas(16) static std::array v6; - alignas(16) static std::array v7; - alignas(16) static std::array v8; - alignas(16) static std::array v9; - alignas(16) static std::array v10; - alignas(16) static std::array v11; - alignas(16) static std::array v12; - alignas(16) static std::array v13; - alignas(16) static std::array v14; - alignas(16) static std::array v15; - alignas(16) static std::array v16; - alignas(16) static std::array v17; - alignas(16) static std::array v18; - alignas(16) static std::array v19; - alignas(16) static std::array v20; - alignas(16) static std::array v21; - alignas(16) static std::array v22; - alignas(16) static std::array v23; - alignas(16) static std::array v24; - alignas(16) static std::array v25; - alignas(16) static std::array v26; - alignas(16) static std::array v27; - alignas(16) static std::array v28; - alignas(16) static std::array v29; - alignas(16) static 
std::array v30; - alignas(16) static std::array v31; - alignas(16) static std::array v32; - alignas(16) static std::array v33; - alignas(16) static std::array v34; - alignas(16) static std::array v35; - alignas(16) static std::array v36; - alignas(16) static std::array v37; - alignas(16) static std::array v38; - alignas(16) static std::array v39; - alignas(16) static std::array v40; - alignas(16) static std::array v41; - alignas(16) static std::array v42; - alignas(16) static std::array v43; - alignas(16) static std::array v44; - alignas(16) static std::array v45; - alignas(16) static std::array v46; - alignas(16) static std::array v47; - alignas(16) static std::array v48; - alignas(16) static std::array v49; - alignas(16) static std::array v50; - alignas(16) static std::array v51; - alignas(16) static std::array v52; - alignas(16) static std::array v53; - alignas(16) static std::array v54; - alignas(16) static std::array v55; - alignas(16) static std::array v56; - alignas(16) static std::array v57; - alignas(16) static std::array v58; - alignas(16) static std::array v59; - alignas(16) static std::array v60; - alignas(16) static std::array v61; - alignas(16) static std::array v62; - alignas(16) static std::array v63; - alignas(16) static std::array v64; - alignas(16) static std::array v65; - alignas(16) static std::array v66; - alignas(16) static std::array v67; - alignas(16) static std::array v68; - alignas(16) static std::array v69; - alignas(16) static std::array v70; - alignas(16) static std::array v71; - alignas(16) static std::array v72; - alignas(16) static std::array v73; - alignas(16) static std::array v74; - alignas(16) static std::array v75; - alignas(16) static std::array v76; - alignas(16) static std::array v77; - alignas(16) static std::array v78; - alignas(16) static std::array v79; - alignas(16) static std::array v80; - alignas(16) static std::array v81; - alignas(16) static std::array v82; - alignas(16) static std::array v83; - alignas(16) static 
std::array v84; - alignas(16) static std::array v85; - alignas(16) static std::array v86; - alignas(16) static std::array v87; - alignas(16) static std::array v88; - alignas(16) static std::array v89; - alignas(16) static std::array v90; - alignas(16) static std::array v91; - alignas(16) static std::array v92; - alignas(16) static std::array v93; - alignas(16) static std::array v94; - alignas(16) static std::array v95; - alignas(16) static std::array v96; - alignas(16) static std::array v97; - alignas(16) static std::array v98; - alignas(16) static std::array v99; - alignas(16) static std::array v100; - alignas(16) static std::array v101; - alignas(16) static std::array v102; - alignas(16) static std::array v103; - alignas(16) static std::array v104; - alignas(16) static std::array v105; - alignas(16) static std::array v106; - alignas(16) static std::array v107; - alignas(16) static std::array v108; - alignas(16) static std::array v109; - alignas(16) static std::array v110; - alignas(16) static std::array v111; - alignas(16) static std::array v112; - alignas(16) static std::array v113; - alignas(16) static std::array v114; - alignas(16) static std::array v115; - alignas(16) static std::array v116; - alignas(16) static std::array v117; - alignas(16) static std::array v118; - alignas(16) static std::array v119; - alignas(16) static std::array w120; - alignas(16) static std::array w121; - alignas(16) static std::array w122; - alignas(16) static std::array w123; - alignas(16) static std::array w124; - alignas(16) static std::array w125; - alignas(16) static std::array w126; - alignas(16) static std::array w127; - alignas(16) static std::array w128; - alignas(16) static std::array w129; - alignas(16) static std::array w130; - alignas(16) static std::array w131; - alignas(16) static std::array w132; - alignas(16) static std::array w133; - alignas(16) static std::array w134; - alignas(16) static std::array w135; - alignas(16) static std::array w136; - alignas(16) static 
std::array w137; - alignas(16) static std::array w138; - alignas(16) static std::array w139; - alignas(16) static std::array w140; - alignas(16) static std::array w141; - alignas(16) static std::array w142; - alignas(16) static std::array w143; - alignas(16) static std::array w144; - alignas(16) static std::array w145; - alignas(16) static std::array w146; - alignas(16) static std::array w147; - alignas(16) static std::array w148; - alignas(16) static std::array w149; - alignas(16) static std::array w150; - alignas(16) static std::array w151; - alignas(16) static std::array w152; - alignas(16) static std::array w153; - alignas(16) static std::array w154; - alignas(16) static std::array w155; - alignas(16) static std::array w156; - alignas(16) static std::array w157; - alignas(16) static std::array w158; - alignas(16) static std::array w159; - alignas(16) static std::array w160; - alignas(16) static std::array w161; - alignas(16) static std::array w162; - alignas(16) static std::array w163; - alignas(16) static std::array w164; - alignas(16) static std::array w165; - alignas(16) static std::array w166; - alignas(16) static std::array w167; - alignas(16) static std::array w168; - alignas(16) static std::array w169; - alignas(16) static std::array w170; - alignas(16) static std::array w171; - alignas(16) static std::array w172; - alignas(16) static std::array w173; - alignas(16) static std::array w174; - alignas(16) static std::array w175; - alignas(16) static std::array w176; - alignas(16) static std::array w177; - alignas(16) static std::array w178; - alignas(16) static std::array w179; - alignas(16) static std::array w180; - alignas(16) static std::array w181; - alignas(16) static std::array w182; - alignas(16) static std::array w183; - alignas(16) static std::array w184; - alignas(16) static std::array w185; - alignas(16) static std::array w186; - alignas(16) static std::array w187; - alignas(16) static std::array w188; - alignas(16) static std::array w189; - 
alignas(16) static std::array w190; - alignas(16) static std::array w191; - alignas(16) static std::array w192; - alignas(16) static std::array w193; - alignas(16) static std::array w194; - alignas(16) static std::array w195; - alignas(16) static std::array w196; - alignas(16) static std::array w197; - alignas(16) static std::array w198; - alignas(16) static std::array w199; - alignas(16) static std::array w200; - alignas(16) static std::array w201; - alignas(16) static std::array w202; - alignas(16) static std::array w203; - alignas(16) static std::array w204; - alignas(16) static std::array w205; - alignas(16) static std::array w206; - alignas(16) static std::array w207; - alignas(16) static std::array w208; - alignas(16) static std::array w209; - alignas(16) static std::array w210; - alignas(16) static std::array w211; - alignas(16) static std::array w212; - alignas(16) static std::array w213; - alignas(16) static std::array w214; - alignas(16) static std::array w215; - alignas(16) static std::array w216; - alignas(16) static std::array w217; - alignas(16) static std::array w218; - alignas(16) static std::array w219; - alignas(16) static std::array w220; - alignas(16) static std::array w221; - alignas(16) static std::array w222; - alignas(16) static std::array w223; - alignas(16) static std::array w224; - alignas(16) static std::array w225; - alignas(16) static std::array w226; - alignas(16) static std::array w227; - alignas(16) static std::array w228; - alignas(16) static std::array w229; - alignas(16) static std::array w230; - alignas(16) static std::array w231; - alignas(16) static std::array w232; - alignas(16) static std::array w233; - alignas(16) static std::array w234; - alignas(16) static std::array w235; - alignas(16) static std::array w236; - alignas(16) static std::array w237; - alignas(16) static std::array w238; - alignas(16) static std::array w239; - alignas(16) static std::array w240; - alignas(16) static std::array w241; - alignas(16) static 
std::array w242; - alignas(16) static std::array w243; - alignas(16) static std::array w244; - alignas(16) static std::array w245; - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto qu8rng = std::bind(std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()), std::ref(rng)); - auto qs32rng = std::bind(std::uniform_int_distribution(-10000, 10000), std::ref(rng)); - std::generate(v0.begin(), v0.end(), std::ref(qu8rng)); - std::generate(v1.begin(), v1.end(), std::ref(qu8rng)); - std::generate(v2.begin(), v2.end(), std::ref(qu8rng)); - std::generate(v3.begin(), v3.end(), std::ref(qu8rng)); - std::generate(v4.begin(), v4.end(), std::ref(qu8rng)); - std::generate(v5.begin(), v5.end(), std::ref(qu8rng)); - std::generate(v6.begin(), v6.end(), std::ref(qu8rng)); - std::generate(v7.begin(), v7.end(), std::ref(qu8rng)); - std::generate(v8.begin(), v8.end(), std::ref(qu8rng)); - std::generate(v9.begin(), v9.end(), std::ref(qu8rng)); - std::generate(v10.begin(), v10.end(), std::ref(qu8rng)); - std::generate(v11.begin(), v11.end(), std::ref(qu8rng)); - std::generate(v12.begin(), v12.end(), std::ref(qu8rng)); - std::generate(v13.begin(), v13.end(), std::ref(qu8rng)); - std::generate(v14.begin(), v14.end(), std::ref(qu8rng)); - std::generate(v15.begin(), v15.end(), std::ref(qu8rng)); - std::generate(v16.begin(), v16.end(), std::ref(qu8rng)); - std::generate(v17.begin(), v17.end(), std::ref(qu8rng)); - std::generate(v18.begin(), v18.end(), std::ref(qu8rng)); - std::generate(v19.begin(), v19.end(), std::ref(qu8rng)); - std::generate(v20.begin(), v20.end(), std::ref(qu8rng)); - std::generate(v21.begin(), v21.end(), std::ref(qu8rng)); - std::generate(v22.begin(), v22.end(), std::ref(qu8rng)); - std::generate(v23.begin(), v23.end(), std::ref(qu8rng)); - std::generate(v24.begin(), v24.end(), std::ref(qu8rng)); - std::generate(v25.begin(), v25.end(), std::ref(qu8rng)); - std::generate(v26.begin(), v26.end(), 
std::ref(qu8rng)); - std::generate(v27.begin(), v27.end(), std::ref(qu8rng)); - std::generate(v28.begin(), v28.end(), std::ref(qu8rng)); - std::generate(v29.begin(), v29.end(), std::ref(qu8rng)); - std::generate(v30.begin(), v30.end(), std::ref(qu8rng)); - std::generate(v31.begin(), v31.end(), std::ref(qu8rng)); - std::generate(v32.begin(), v32.end(), std::ref(qu8rng)); - std::generate(v33.begin(), v33.end(), std::ref(qu8rng)); - std::generate(v34.begin(), v34.end(), std::ref(qu8rng)); - std::generate(v35.begin(), v35.end(), std::ref(qu8rng)); - std::generate(v36.begin(), v36.end(), std::ref(qu8rng)); - std::generate(v37.begin(), v37.end(), std::ref(qu8rng)); - std::generate(v38.begin(), v38.end(), std::ref(qu8rng)); - std::generate(v39.begin(), v39.end(), std::ref(qu8rng)); - std::generate(v40.begin(), v40.end(), std::ref(qu8rng)); - std::generate(v41.begin(), v41.end(), std::ref(qu8rng)); - std::generate(v42.begin(), v42.end(), std::ref(qu8rng)); - std::generate(v43.begin(), v43.end(), std::ref(qu8rng)); - std::generate(v44.begin(), v44.end(), std::ref(qu8rng)); - std::generate(v45.begin(), v45.end(), std::ref(qu8rng)); - std::generate(v46.begin(), v46.end(), std::ref(qu8rng)); - std::generate(v47.begin(), v47.end(), std::ref(qu8rng)); - std::generate(v48.begin(), v48.end(), std::ref(qu8rng)); - std::generate(v49.begin(), v49.end(), std::ref(qu8rng)); - std::generate(v50.begin(), v50.end(), std::ref(qu8rng)); - std::generate(v51.begin(), v51.end(), std::ref(qu8rng)); - std::generate(v52.begin(), v52.end(), std::ref(qu8rng)); - std::generate(v53.begin(), v53.end(), std::ref(qu8rng)); - std::generate(v54.begin(), v54.end(), std::ref(qu8rng)); - std::generate(v55.begin(), v55.end(), std::ref(qu8rng)); - std::generate(v56.begin(), v56.end(), std::ref(qu8rng)); - std::generate(v57.begin(), v57.end(), std::ref(qu8rng)); - std::generate(v58.begin(), v58.end(), std::ref(qu8rng)); - std::generate(v59.begin(), v59.end(), std::ref(qu8rng)); - std::generate(v60.begin(), 
v60.end(), std::ref(qu8rng)); - std::generate(v61.begin(), v61.end(), std::ref(qu8rng)); - std::generate(v62.begin(), v62.end(), std::ref(qu8rng)); - std::generate(v63.begin(), v63.end(), std::ref(qu8rng)); - std::generate(v64.begin(), v64.end(), std::ref(qu8rng)); - std::generate(v65.begin(), v65.end(), std::ref(qu8rng)); - std::generate(v66.begin(), v66.end(), std::ref(qu8rng)); - std::generate(v67.begin(), v67.end(), std::ref(qu8rng)); - std::generate(v68.begin(), v68.end(), std::ref(qu8rng)); - std::generate(v69.begin(), v69.end(), std::ref(qu8rng)); - std::generate(v70.begin(), v70.end(), std::ref(qu8rng)); - std::generate(v71.begin(), v71.end(), std::ref(qu8rng)); - std::generate(v72.begin(), v72.end(), std::ref(qu8rng)); - std::generate(v73.begin(), v73.end(), std::ref(qu8rng)); - std::generate(v74.begin(), v74.end(), std::ref(qu8rng)); - std::generate(v75.begin(), v75.end(), std::ref(qu8rng)); - std::generate(v76.begin(), v76.end(), std::ref(qu8rng)); - std::generate(v77.begin(), v77.end(), std::ref(qu8rng)); - std::generate(v78.begin(), v78.end(), std::ref(qu8rng)); - std::generate(v79.begin(), v79.end(), std::ref(qu8rng)); - std::generate(v80.begin(), v80.end(), std::ref(qu8rng)); - std::generate(v81.begin(), v81.end(), std::ref(qu8rng)); - std::generate(v82.begin(), v82.end(), std::ref(qu8rng)); - std::generate(v83.begin(), v83.end(), std::ref(qu8rng)); - std::generate(v84.begin(), v84.end(), std::ref(qu8rng)); - std::generate(v85.begin(), v85.end(), std::ref(qu8rng)); - std::generate(v86.begin(), v86.end(), std::ref(qu8rng)); - std::generate(v87.begin(), v87.end(), std::ref(qu8rng)); - std::generate(v88.begin(), v88.end(), std::ref(qu8rng)); - std::generate(v89.begin(), v89.end(), std::ref(qu8rng)); - std::generate(v90.begin(), v90.end(), std::ref(qu8rng)); - std::generate(v91.begin(), v91.end(), std::ref(qu8rng)); - std::generate(v92.begin(), v92.end(), std::ref(qu8rng)); - std::generate(v93.begin(), v93.end(), std::ref(qu8rng)); - 
std::generate(v94.begin(), v94.end(), std::ref(qu8rng)); - std::generate(v95.begin(), v95.end(), std::ref(qu8rng)); - std::generate(v96.begin(), v96.end(), std::ref(qu8rng)); - std::generate(v97.begin(), v97.end(), std::ref(qu8rng)); - std::generate(v98.begin(), v98.end(), std::ref(qu8rng)); - std::generate(v99.begin(), v99.end(), std::ref(qu8rng)); - std::generate(v100.begin(), v100.end(), std::ref(qu8rng)); - std::generate(v101.begin(), v101.end(), std::ref(qu8rng)); - std::generate(v102.begin(), v102.end(), std::ref(qu8rng)); - std::generate(v103.begin(), v103.end(), std::ref(qu8rng)); - std::generate(v104.begin(), v104.end(), std::ref(qu8rng)); - std::generate(v105.begin(), v105.end(), std::ref(qu8rng)); - std::generate(v106.begin(), v106.end(), std::ref(qu8rng)); - std::generate(v107.begin(), v107.end(), std::ref(qu8rng)); - std::generate(v108.begin(), v108.end(), std::ref(qu8rng)); - std::generate(v109.begin(), v109.end(), std::ref(qu8rng)); - std::generate(v110.begin(), v110.end(), std::ref(qu8rng)); - std::generate(v111.begin(), v111.end(), std::ref(qu8rng)); - std::generate(v112.begin(), v112.end(), std::ref(qu8rng)); - std::generate(v113.begin(), v113.end(), std::ref(qu8rng)); - std::generate(v114.begin(), v114.end(), std::ref(qu8rng)); - std::generate(v115.begin(), v115.end(), std::ref(qu8rng)); - std::generate(v116.begin(), v116.end(), std::ref(qu8rng)); - std::generate(v117.begin(), v117.end(), std::ref(qu8rng)); - std::generate(v118.begin(), v118.end(), std::ref(qu8rng)); - std::generate(v119.begin(), v119.end(), std::ref(qu8rng)); - std::generate(w120.begin(), w120.end(), std::ref(qu8rng)); - std::generate(w121.begin(), w121.end(), std::ref(qs32rng)); - std::generate(w122.begin(), w122.end(), std::ref(qu8rng)); - std::generate(w123.begin(), w123.end(), std::ref(qs32rng)); - std::generate(w124.begin(), w124.end(), std::ref(qu8rng)); - std::generate(w125.begin(), w125.end(), std::ref(qs32rng)); - std::generate(w126.begin(), w126.end(), 
std::ref(qu8rng)); - std::generate(w127.begin(), w127.end(), std::ref(qs32rng)); - std::generate(w128.begin(), w128.end(), std::ref(qu8rng)); - std::generate(w129.begin(), w129.end(), std::ref(qu8rng)); - std::generate(w130.begin(), w130.end(), std::ref(qu8rng)); - std::generate(w131.begin(), w131.end(), std::ref(qs32rng)); - std::generate(w132.begin(), w132.end(), std::ref(qu8rng)); - std::generate(w133.begin(), w133.end(), std::ref(qs32rng)); - std::generate(w134.begin(), w134.end(), std::ref(qu8rng)); - std::generate(w135.begin(), w135.end(), std::ref(qs32rng)); - std::generate(w136.begin(), w136.end(), std::ref(qu8rng)); - std::generate(w137.begin(), w137.end(), std::ref(qs32rng)); - std::generate(w138.begin(), w138.end(), std::ref(qu8rng)); - std::generate(w139.begin(), w139.end(), std::ref(qs32rng)); - std::generate(w140.begin(), w140.end(), std::ref(qu8rng)); - std::generate(w141.begin(), w141.end(), std::ref(qs32rng)); - std::generate(w142.begin(), w142.end(), std::ref(qu8rng)); - std::generate(w143.begin(), w143.end(), std::ref(qs32rng)); - std::generate(w144.begin(), w144.end(), std::ref(qu8rng)); - std::generate(w145.begin(), w145.end(), std::ref(qs32rng)); - std::generate(w146.begin(), w146.end(), std::ref(qu8rng)); - std::generate(w147.begin(), w147.end(), std::ref(qs32rng)); - std::generate(w148.begin(), w148.end(), std::ref(qu8rng)); - std::generate(w149.begin(), w149.end(), std::ref(qs32rng)); - std::generate(w150.begin(), w150.end(), std::ref(qu8rng)); - std::generate(w151.begin(), w151.end(), std::ref(qs32rng)); - std::generate(w152.begin(), w152.end(), std::ref(qu8rng)); - std::generate(w153.begin(), w153.end(), std::ref(qu8rng)); - std::generate(w154.begin(), w154.end(), std::ref(qu8rng)); - std::generate(w155.begin(), w155.end(), std::ref(qs32rng)); - std::generate(w156.begin(), w156.end(), std::ref(qu8rng)); - std::generate(w157.begin(), w157.end(), std::ref(qs32rng)); - std::generate(w158.begin(), w158.end(), std::ref(qu8rng)); - 
std::generate(w159.begin(), w159.end(), std::ref(qs32rng)); - std::generate(w160.begin(), w160.end(), std::ref(qu8rng)); - std::generate(w161.begin(), w161.end(), std::ref(qs32rng)); - std::generate(w162.begin(), w162.end(), std::ref(qu8rng)); - std::generate(w163.begin(), w163.end(), std::ref(qs32rng)); - std::generate(w164.begin(), w164.end(), std::ref(qu8rng)); - std::generate(w165.begin(), w165.end(), std::ref(qu8rng)); - std::generate(w166.begin(), w166.end(), std::ref(qu8rng)); - std::generate(w167.begin(), w167.end(), std::ref(qs32rng)); - std::generate(w168.begin(), w168.end(), std::ref(qu8rng)); - std::generate(w169.begin(), w169.end(), std::ref(qs32rng)); - std::generate(w170.begin(), w170.end(), std::ref(qu8rng)); - std::generate(w171.begin(), w171.end(), std::ref(qs32rng)); - std::generate(w172.begin(), w172.end(), std::ref(qu8rng)); - std::generate(w173.begin(), w173.end(), std::ref(qs32rng)); - std::generate(w174.begin(), w174.end(), std::ref(qu8rng)); - std::generate(w175.begin(), w175.end(), std::ref(qs32rng)); - std::generate(w176.begin(), w176.end(), std::ref(qu8rng)); - std::generate(w177.begin(), w177.end(), std::ref(qu8rng)); - std::generate(w178.begin(), w178.end(), std::ref(qu8rng)); - std::generate(w179.begin(), w179.end(), std::ref(qs32rng)); - std::generate(w180.begin(), w180.end(), std::ref(qu8rng)); - std::generate(w181.begin(), w181.end(), std::ref(qs32rng)); - std::generate(w182.begin(), w182.end(), std::ref(qu8rng)); - std::generate(w183.begin(), w183.end(), std::ref(qs32rng)); - std::generate(w184.begin(), w184.end(), std::ref(qu8rng)); - std::generate(w185.begin(), w185.end(), std::ref(qs32rng)); - std::generate(w186.begin(), w186.end(), std::ref(qu8rng)); - std::generate(w187.begin(), w187.end(), std::ref(qs32rng)); - std::generate(w188.begin(), w188.end(), std::ref(qu8rng)); - std::generate(w189.begin(), w189.end(), std::ref(qu8rng)); - std::generate(w190.begin(), w190.end(), std::ref(qu8rng)); - std::generate(w191.begin(), 
w191.end(), std::ref(qs32rng)); - std::generate(w192.begin(), w192.end(), std::ref(qu8rng)); - std::generate(w193.begin(), w193.end(), std::ref(qs32rng)); - std::generate(w194.begin(), w194.end(), std::ref(qu8rng)); - std::generate(w195.begin(), w195.end(), std::ref(qs32rng)); - std::generate(w196.begin(), w196.end(), std::ref(qu8rng)); - std::generate(w197.begin(), w197.end(), std::ref(qs32rng)); - std::generate(w198.begin(), w198.end(), std::ref(qu8rng)); - std::generate(w199.begin(), w199.end(), std::ref(qs32rng)); - std::generate(w200.begin(), w200.end(), std::ref(qu8rng)); - std::generate(w201.begin(), w201.end(), std::ref(qu8rng)); - std::generate(w202.begin(), w202.end(), std::ref(qu8rng)); - std::generate(w203.begin(), w203.end(), std::ref(qs32rng)); - std::generate(w204.begin(), w204.end(), std::ref(qu8rng)); - std::generate(w205.begin(), w205.end(), std::ref(qs32rng)); - std::generate(w206.begin(), w206.end(), std::ref(qu8rng)); - std::generate(w207.begin(), w207.end(), std::ref(qs32rng)); - std::generate(w208.begin(), w208.end(), std::ref(qu8rng)); - std::generate(w209.begin(), w209.end(), std::ref(qs32rng)); - std::generate(w210.begin(), w210.end(), std::ref(qu8rng)); - std::generate(w211.begin(), w211.end(), std::ref(qs32rng)); - std::generate(w212.begin(), w212.end(), std::ref(qu8rng)); - std::generate(w213.begin(), w213.end(), std::ref(qu8rng)); - std::generate(w214.begin(), w214.end(), std::ref(qu8rng)); - std::generate(w215.begin(), w215.end(), std::ref(qs32rng)); - std::generate(w216.begin(), w216.end(), std::ref(qu8rng)); - std::generate(w217.begin(), w217.end(), std::ref(qs32rng)); - std::generate(w218.begin(), w218.end(), std::ref(qu8rng)); - std::generate(w219.begin(), w219.end(), std::ref(qs32rng)); - std::generate(w220.begin(), w220.end(), std::ref(qu8rng)); - std::generate(w221.begin(), w221.end(), std::ref(qs32rng)); - std::generate(w222.begin(), w222.end(), std::ref(qu8rng)); - std::generate(w223.begin(), w223.end(), std::ref(qs32rng)); - 
std::generate(w224.begin(), w224.end(), std::ref(qu8rng)); - std::generate(w225.begin(), w225.end(), std::ref(qu8rng)); - std::generate(w226.begin(), w226.end(), std::ref(qu8rng)); - std::generate(w227.begin(), w227.end(), std::ref(qs32rng)); - std::generate(w228.begin(), w228.end(), std::ref(qu8rng)); - std::generate(w229.begin(), w229.end(), std::ref(qs32rng)); - std::generate(w230.begin(), w230.end(), std::ref(qu8rng)); - std::generate(w231.begin(), w231.end(), std::ref(qs32rng)); - std::generate(w232.begin(), w232.end(), std::ref(qu8rng)); - std::generate(w233.begin(), w233.end(), std::ref(qs32rng)); - std::generate(w234.begin(), w234.end(), std::ref(qu8rng)); - std::generate(w235.begin(), w235.end(), std::ref(qs32rng)); - std::generate(w236.begin(), w236.end(), std::ref(qu8rng)); - std::generate(w237.begin(), w237.end(), std::ref(qu8rng)); - std::generate(w238.begin(), w238.end(), std::ref(qu8rng)); - std::generate(w239.begin(), w239.end(), std::ref(qs32rng)); - std::generate(w240.begin(), w240.end(), std::ref(qu8rng)); - std::generate(w241.begin(), w241.end(), std::ref(qs32rng)); - std::generate(w242.begin(), w242.end(), std::ref(qu8rng)); - std::generate(w243.begin(), w243.end(), std::ref(qs32rng)); - std::generate(w244.begin(), w244.end(), std::ref(qu8rng)); - std::generate(w245.begin(), w245.end(), std::ref(qs32rng)); - - Operators operators; - xnn_status status; - xnn_code_cache* code_cache_ptr = nullptr; - size_t max_workspace_size = 0; - - xnn_operator_t op0 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/3, - /*group_output_channels=*/16, - /*input_channel_stride=*/3, - /*output_channel_stride=*/16, - /*input_zero_point=*/(uint8_t) 128, - 
/*input_scale=*/0.007874015718698502, - /*kernel_zero_point=*/(uint8_t) 108, - /*kernel_scale=*/0.03232726827263832, - /*kernel=*/w120.data(), /*bias=*/w121.data(), - /*output_zero_point=*/(uint8_t) 99, - /*output_scale=*/0.2920726239681244, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op0); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #0" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op0, xnn_delete_operator); - - xnn_operator_t op1 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op1); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #1" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op1, xnn_delete_operator); - - xnn_operator_t op2 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/16, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/16, - /*output_channel_stride=*/16, - /*input_zero_point=*/(uint8_t) 2, - /*input_scale=*/0.173323854804039, - /*kernel_zero_point=*/(uint8_t) 127, - /*kernel_scale=*/0.0927056297659874, - /*kernel=*/w122.data(), /*bias=*/w123.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.9925559759140015, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op2); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #2" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op2, xnn_delete_operator); - - xnn_operator_t op3 = nullptr; - status = 
xnn_create_global_average_pooling_nwc_qu8( - 0 /* input zero point */, 0.9925559759140015 /* input scale */, - 0 /* output zero point */, 0.9925559759140015 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op3); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #3" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op3, xnn_delete_operator); - - xnn_operator_t op4 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/16, - /*group_output_channels=*/8, - /*input_channel_stride=*/16, - /*output_channel_stride=*/8, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.9925559759140015, - /*kernel_zero_point=*/(uint8_t) 57, - /*kernel_scale=*/0.00045214465353637934, - /*kernel=*/w124.data(), /*bias=*/w125.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.03975478187203407, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op4); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #4" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op4, xnn_delete_operator); - - xnn_operator_t op5 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/8, - /*group_output_channels=*/16, - /*input_channel_stride=*/8, - /*output_channel_stride=*/16, - 
/*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.03975478187203407, - /*kernel_zero_point=*/(uint8_t) 1, - /*kernel_scale=*/0.0007166960276663303, - /*kernel=*/w126.data(), /*bias=*/w127.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.014429666101932526, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op5); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #5" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op5, xnn_delete_operator); - - xnn_operator_t op6 = nullptr; - status = xnn_create_add_nd_qu8( - 0 /* input1 zero point */, 0.014429666101932526 /* input1 scale */, - 0 /* input2 zero point */, 0.0117647061124444 /* input2 scale */, - 0 /* output zero point */, 0.023528477177023888 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op6); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #6" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op6, xnn_delete_operator); - - xnn_operator_t op7 = nullptr; - status = xnn_create_multiply_nd_qu8( - 0 /* input1 zero point */, 0.023528477177023888 /* input1 scale */, - 0 /* input2 zero point */, 0.0006536078290082514 /* input2 scale */, - 0 /* output zero point */, 0.003921509720385075 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op7); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #7" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op7, xnn_delete_operator); - - xnn_operator_t op8 = nullptr; - status = xnn_create_multiply_nd_qu8( - 0 /* input1 zero point */, 0.9925559759140015 /* input1 scale */, - 0 /* input2 zero point */, 0.003921509720385075 /* input2 scale */, - 0 /* output zero point */, 0.9246004223823547 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* 
flags */, - &op8); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #8" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op8, xnn_delete_operator); - - xnn_operator_t op9 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/16, - /*group_output_channels=*/16, - /*input_channel_stride=*/16, - /*output_channel_stride=*/16, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.9246004223823547, - /*kernel_zero_point=*/(uint8_t) 146, - /*kernel_scale=*/0.017008759081363678, - /*kernel=*/w130.data(), /*bias=*/w131.data(), - /*output_zero_point=*/(uint8_t) 130, - /*output_scale=*/2.010422706604004, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op9); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #9" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op9, xnn_delete_operator); - - xnn_operator_t op10 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/16, - /*group_output_channels=*/72, - /*input_channel_stride=*/16, - /*output_channel_stride=*/72, - /*input_zero_point=*/(uint8_t) 130, - /*input_scale=*/2.010422706604004, - /*kernel_zero_point=*/(uint8_t) 123, - /*kernel_scale=*/0.005887787323445082, - /*kernel=*/w132.data(), /*bias=*/w133.data(), - /*output_zero_point=*/(uint8_t) 0, - 
/*output_scale=*/0.9493569135665894, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op10); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #10" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op10, xnn_delete_operator); - - xnn_operator_t op11 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/0, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/72, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/72, - /*output_channel_stride=*/72, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.9493569135665894, - /*kernel_zero_point=*/(uint8_t) 113, - /*kernel_scale=*/0.033502571284770966, - /*kernel=*/w134.data(), /*bias=*/w135.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.6341432929039001, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op11); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #11" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op11, xnn_delete_operator); - - xnn_operator_t op12 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/72, - /*group_output_channels=*/24, - /*input_channel_stride=*/72, - /*output_channel_stride=*/24, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.6341432929039001, 
- /*kernel_zero_point=*/(uint8_t) 153, - /*kernel_scale=*/0.017683790996670723, - /*kernel=*/w136.data(), /*bias=*/w137.data(), - /*output_zero_point=*/(uint8_t) 119, - /*output_scale=*/1.0579205751419067, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op12); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #12" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op12, xnn_delete_operator); - - xnn_operator_t op13 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/24, - /*group_output_channels=*/88, - /*input_channel_stride=*/24, - /*output_channel_stride=*/88, - /*input_zero_point=*/(uint8_t) 119, - /*input_scale=*/1.0579205751419067, - /*kernel_zero_point=*/(uint8_t) 99, - /*kernel_scale=*/0.005299868993461132, - /*kernel=*/w138.data(), /*bias=*/w139.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.20985400676727295, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op13); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #13" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op13, xnn_delete_operator); - - xnn_operator_t op14 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/1, - /*input_padding_bottom=*/1, /*input_padding_left=*/1, - /*kernel_height=*/3, /*kernel_width=*/3, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/88, - 
/*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/88, - /*output_channel_stride=*/88, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.20985400676727295, - /*kernel_zero_point=*/(uint8_t) 144, - /*kernel_scale=*/0.05344513803720474, - /*kernel=*/w140.data(), /*bias=*/w141.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.22873805463314056, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op14); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #14" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op14, xnn_delete_operator); - - xnn_operator_t op15 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/88, - /*group_output_channels=*/24, - /*input_channel_stride=*/88, - /*output_channel_stride=*/24, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.22873805463314056, - /*kernel_zero_point=*/(uint8_t) 139, - /*kernel_scale=*/0.015702862292528152, - /*kernel=*/w142.data(), /*bias=*/w143.data(), - /*output_zero_point=*/(uint8_t) 124, - /*output_scale=*/0.8896244764328003, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op15); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #15" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op15, xnn_delete_operator); - - xnn_operator_t op16 = nullptr; - status = xnn_create_add_nd_qu8( - 124 /* input1 zero point */, 0.8896244764328003 /* input1 scale */, - 119 /* input2 zero point */, 
1.0579205751419067 /* input2 scale */, - 123 /* output zero point */, 1.0426580905914307 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op16); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #16" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op16, xnn_delete_operator); - - xnn_operator_t op17 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/24, - /*group_output_channels=*/96, - /*input_channel_stride=*/24, - /*output_channel_stride=*/96, - /*input_zero_point=*/(uint8_t) 123, - /*input_scale=*/1.0426580905914307, - /*kernel_zero_point=*/(uint8_t) 154, - /*kernel_scale=*/0.002672378672286868, - /*kernel=*/w144.data(), /*bias=*/w145.data(), - /*output_zero_point=*/(uint8_t) 110, - /*output_scale=*/0.3380434811115265, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op17); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #17" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op17, xnn_delete_operator); - - xnn_operator_t op18 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op18); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #18" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op18, xnn_delete_operator); - - xnn_operator_t op19 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/1, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/2, 
/*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/96, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/96, - /*output_channel_stride=*/96, - /*input_zero_point=*/(uint8_t) 2, - /*input_scale=*/0.18497292697429657, - /*kernel_zero_point=*/(uint8_t) 142, - /*kernel_scale=*/0.031311504542827606, - /*kernel=*/w146.data(), /*bias=*/w147.data(), - /*output_zero_point=*/(uint8_t) 134, - /*output_scale=*/0.24109338223934174, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op19); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #19" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op19, xnn_delete_operator); - - xnn_operator_t op20 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op20); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #20" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op20, xnn_delete_operator); - - xnn_operator_t op21 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 3 /* input zero point */, 0.1065792664885521 /* input scale */, - 3 /* output zero point */, 0.1065792664885521 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op21); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #21" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op21, xnn_delete_operator); - - xnn_operator_t op22 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/96, - /*group_output_channels=*/24, - 
/*input_channel_stride=*/96, - /*output_channel_stride=*/24, - /*input_zero_point=*/(uint8_t) 3, - /*input_scale=*/0.1065792664885521, - /*kernel_zero_point=*/(uint8_t) 98, - /*kernel_scale=*/0.005171800963580608, - /*kernel=*/w148.data(), /*bias=*/w149.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.051762163639068604, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op22); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #22" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op22, xnn_delete_operator); - - xnn_operator_t op23 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/24, - /*group_output_channels=*/96, - /*input_channel_stride=*/24, - /*output_channel_stride=*/96, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.051762163639068604, - /*kernel_zero_point=*/(uint8_t) 106, - /*kernel_scale=*/0.005030923057347536, - /*kernel=*/w150.data(), /*bias=*/w151.data(), - /*output_zero_point=*/(uint8_t) 98, - /*output_scale=*/0.03421778604388237, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op23); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #23" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op23, xnn_delete_operator); - - xnn_operator_t op24 = nullptr; - status = xnn_create_add_nd_qu8( - 98 /* input1 zero point */, 0.03421778604388237 /* input1 scale */, - 0 /* input2 zero point */, 0.0117647061124444 /* input2 scale */, - 0 /* output zero point */, 
0.023528477177023888 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op24); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #24" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op24, xnn_delete_operator); - - xnn_operator_t op25 = nullptr; - status = xnn_create_multiply_nd_qu8( - 0 /* input1 zero point */, 0.023528477177023888 /* input1 scale */, - 0 /* input2 zero point */, 0.0006536078290082514 /* input2 scale */, - 0 /* output zero point */, 0.003921509254723787 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op25); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #25" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op25, xnn_delete_operator); - - xnn_operator_t op26 = nullptr; - status = xnn_create_multiply_nd_qu8( - 3 /* input1 zero point */, 0.1065792664885521 /* input1 scale */, - 0 /* input2 zero point */, 0.003921509254723787 /* input2 scale */, - 4 /* output zero point */, 0.07695811986923218 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op26); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #26" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op26, xnn_delete_operator); - - xnn_operator_t op27 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/96, - /*group_output_channels=*/40, - /*input_channel_stride=*/96, - /*output_channel_stride=*/40, - /*input_zero_point=*/(uint8_t) 4, - /*input_scale=*/0.07695811986923218, - /*kernel_zero_point=*/(uint8_t) 128, - /*kernel_scale=*/0.03726894408464432, 
- /*kernel=*/w154.data(), /*bias=*/w155.data(), - /*output_zero_point=*/(uint8_t) 127, - /*output_scale=*/0.3759814500808716, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op27); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #27" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op27, xnn_delete_operator); - - xnn_operator_t op28 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/40, - /*group_output_channels=*/240, - /*input_channel_stride=*/40, - /*output_channel_stride=*/240, - /*input_zero_point=*/(uint8_t) 127, - /*input_scale=*/0.3759814500808716, - /*kernel_zero_point=*/(uint8_t) 159, - /*kernel_scale=*/0.003184415865689516, - /*kernel=*/w156.data(), /*bias=*/w157.data(), - /*output_zero_point=*/(uint8_t) 128, - /*output_scale=*/0.17979219555854797, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op28); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #28" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op28, xnn_delete_operator); - - xnn_operator_t op29 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op29); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #29" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op29, xnn_delete_operator); - - xnn_operator_t op30 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/2, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, 
/*input_padding_left=*/2, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/240, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/240, - /*output_channel_stride=*/240, - /*input_zero_point=*/(uint8_t) 4, - /*input_scale=*/0.08579955250024796, - /*kernel_zero_point=*/(uint8_t) 143, - /*kernel_scale=*/0.1883949190378189, - /*kernel=*/w158.data(), /*bias=*/w159.data(), - /*output_zero_point=*/(uint8_t) 130, - /*output_scale=*/0.49307096004486084, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op30); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #30" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op30, xnn_delete_operator); - - xnn_operator_t op31 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op31); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #31" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op31, xnn_delete_operator); - - xnn_operator_t op32 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 2 /* input zero point */, 0.220099538564682 /* input scale */, - 2 /* output zero point */, 0.220099538564682 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op32); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #32" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op32, xnn_delete_operator); - - xnn_operator_t op33 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, 
/*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/240, - /*group_output_channels=*/64, - /*input_channel_stride=*/240, - /*output_channel_stride=*/64, - /*input_zero_point=*/(uint8_t) 2, - /*input_scale=*/0.220099538564682, - /*kernel_zero_point=*/(uint8_t) 149, - /*kernel_scale=*/0.009354852139949799, - /*kernel=*/w160.data(), /*bias=*/w161.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.016910869628190994, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op33); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #33" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op33, xnn_delete_operator); - - xnn_operator_t op34 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/64, - /*group_output_channels=*/240, - /*input_channel_stride=*/64, - /*output_channel_stride=*/240, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.016910869628190994, - /*kernel_zero_point=*/(uint8_t) 108, - /*kernel_scale=*/0.006087664980441332, - /*kernel=*/w162.data(), /*bias=*/w163.data(), - /*output_zero_point=*/(uint8_t) 144, - /*output_scale=*/0.03480793163180351, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op34); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #34" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op34, xnn_delete_operator); - - xnn_operator_t op35 = nullptr; - status = xnn_create_add_nd_qu8( - 144 /* input1 zero point */, 0.03480793163180351 /* input1 
scale */, - 0 /* input2 zero point */, 0.0117647061124444 /* input2 scale */, - 0 /* output zero point */, 0.023528477177023888 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op35); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #35" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op35, xnn_delete_operator); - - xnn_operator_t op36 = nullptr; - status = xnn_create_multiply_nd_qu8( - 0 /* input1 zero point */, 0.023528477177023888 /* input1 scale */, - 0 /* input2 zero point */, 0.0006536078290082514 /* input2 scale */, - 0 /* output zero point */, 0.003921509254723787 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op36); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #36" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op36, xnn_delete_operator); - - xnn_operator_t op37 = nullptr; - status = xnn_create_multiply_nd_qu8( - 2 /* input1 zero point */, 0.220099538564682 /* input1 scale */, - 0 /* input2 zero point */, 0.003921509254723787 /* input2 scale */, - 20 /* output zero point */, 0.02150452695786953 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op37); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #37" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op37, xnn_delete_operator); - - xnn_operator_t op38 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/240, - /*group_output_channels=*/40, - /*input_channel_stride=*/240, - /*output_channel_stride=*/40, - /*input_zero_point=*/(uint8_t) 20, - 
/*input_scale=*/0.02150452695786953, - /*kernel_zero_point=*/(uint8_t) 115, - /*kernel_scale=*/0.1018327996134758, - /*kernel=*/w166.data(), /*bias=*/w167.data(), - /*output_zero_point=*/(uint8_t) 137, - /*output_scale=*/0.4652852416038513, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op38); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #38" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op38, xnn_delete_operator); - - xnn_operator_t op39 = nullptr; - status = xnn_create_add_nd_qu8( - 137 /* input1 zero point */, 0.4652852416038513 /* input1 scale */, - 127 /* input2 zero point */, 0.3759814500808716 /* input2 scale */, - 132 /* output zero point */, 0.44771137833595276 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op39); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #39" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op39, xnn_delete_operator); - - xnn_operator_t op40 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/40, - /*group_output_channels=*/240, - /*input_channel_stride=*/40, - /*output_channel_stride=*/240, - /*input_zero_point=*/(uint8_t) 132, - /*input_scale=*/0.44771137833595276, - /*kernel_zero_point=*/(uint8_t) 129, - /*kernel_scale=*/0.0009919562144204974, - /*kernel=*/w168.data(), /*bias=*/w169.data(), - /*output_zero_point=*/(uint8_t) 118, - /*output_scale=*/0.12498034536838531, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - 
/*weights_cache=*/nullptr, - &op40); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #40" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op40, xnn_delete_operator); - - xnn_operator_t op41 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op41); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #41" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op41, xnn_delete_operator); - - xnn_operator_t op42 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/2, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/2, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/240, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/240, - /*output_channel_stride=*/240, - /*input_zero_point=*/(uint8_t) 6, - /*input_scale=*/0.06491293758153915, - /*kernel_zero_point=*/(uint8_t) 101, - /*kernel_scale=*/0.13295969367027283, - /*kernel=*/w170.data(), /*bias=*/w171.data(), - /*output_zero_point=*/(uint8_t) 150, - /*output_scale=*/0.29956355690956116, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op42); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #42" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op42, xnn_delete_operator); - - xnn_operator_t op43 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op43); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #43" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op43, xnn_delete_operator); - - xnn_operator_t op44 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 3 /* input zero point */, 
0.11336661875247955 /* input scale */, - 3 /* output zero point */, 0.11336661875247955 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op44); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #44" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op44, xnn_delete_operator); - - xnn_operator_t op45 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/240, - /*group_output_channels=*/64, - /*input_channel_stride=*/240, - /*output_channel_stride=*/64, - /*input_zero_point=*/(uint8_t) 3, - /*input_scale=*/0.11336661875247955, - /*kernel_zero_point=*/(uint8_t) 163, - /*kernel_scale=*/0.007440278306603432, - /*kernel=*/w172.data(), /*bias=*/w173.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.008480816148221493, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op45); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #45" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op45, xnn_delete_operator); - - xnn_operator_t op46 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/64, - /*group_output_channels=*/240, - /*input_channel_stride=*/64, - /*output_channel_stride=*/240, - /*input_zero_point=*/(uint8_t) 0, - 
/*input_scale=*/0.008480816148221493, - /*kernel_zero_point=*/(uint8_t) 110, - /*kernel_scale=*/0.006039419211447239, - /*kernel=*/w174.data(), /*bias=*/w175.data(), - /*output_zero_point=*/(uint8_t) 135, - /*output_scale=*/0.027621593326330185, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op46); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #46" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op46, xnn_delete_operator); - - xnn_operator_t op47 = nullptr; - status = xnn_create_add_nd_qu8( - 135 /* input1 zero point */, 0.027621593326330185 /* input1 scale */, - 0 /* input2 zero point */, 0.0117647061124444 /* input2 scale */, - 0 /* output zero point */, 0.023528477177023888 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op47); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #47" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op47, xnn_delete_operator); - - xnn_operator_t op48 = nullptr; - status = xnn_create_multiply_nd_qu8( - 0 /* input1 zero point */, 0.023528477177023888 /* input1 scale */, - 0 /* input2 zero point */, 0.0006536078290082514 /* input2 scale */, - 0 /* output zero point */, 0.003921374212950468 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op48); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #48" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op48, xnn_delete_operator); - - xnn_operator_t op49 = nullptr; - status = xnn_create_multiply_nd_qu8( - 3 /* input1 zero point */, 0.11336661875247955 /* input1 scale */, - 0 /* input2 zero point */, 0.003921374212950468 /* input2 scale */, - 21 /* output zero point */, 0.0160247553139925 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - 
&op49); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #49" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op49, xnn_delete_operator); - - xnn_operator_t op50 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/240, - /*group_output_channels=*/40, - /*input_channel_stride=*/240, - /*output_channel_stride=*/40, - /*input_zero_point=*/(uint8_t) 21, - /*input_scale=*/0.0160247553139925, - /*kernel_zero_point=*/(uint8_t) 131, - /*kernel_scale=*/0.22305507957935333, - /*kernel=*/w178.data(), /*bias=*/w179.data(), - /*output_zero_point=*/(uint8_t) 139, - /*output_scale=*/0.544162929058075, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op50); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #50" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op50, xnn_delete_operator); - - xnn_operator_t op51 = nullptr; - status = xnn_create_add_nd_qu8( - 139 /* input1 zero point */, 0.544162929058075 /* input1 scale */, - 132 /* input2 zero point */, 0.44771137833595276 /* input2 scale */, - 137 /* output zero point */, 0.6061347723007202 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op51); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #51" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op51, xnn_delete_operator); - - xnn_operator_t op52 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, 
/*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/40, - /*group_output_channels=*/120, - /*input_channel_stride=*/40, - /*output_channel_stride=*/120, - /*input_zero_point=*/(uint8_t) 137, - /*input_scale=*/0.6061347723007202, - /*kernel_zero_point=*/(uint8_t) 90, - /*kernel_scale=*/0.0014072866179049015, - /*kernel=*/w180.data(), /*bias=*/w181.data(), - /*output_zero_point=*/(uint8_t) 117, - /*output_scale=*/0.13909709453582764, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op52); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #52" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op52, xnn_delete_operator); - - xnn_operator_t op53 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op53); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #53" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op53, xnn_delete_operator); - - xnn_operator_t op54 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/2, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/2, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/120, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/120, - /*output_channel_stride=*/120, - /*input_zero_point=*/(uint8_t) 5, - /*input_scale=*/0.0727764368057251, - /*kernel_zero_point=*/(uint8_t) 121, - /*kernel_scale=*/0.09157519787549973, - /*kernel=*/w182.data(), /*bias=*/w183.data(), - /*output_zero_point=*/(uint8_t) 140, - /*output_scale=*/0.28514617681503296, - /*output_min=*/(uint8_t) 0, 
/*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op54); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #54" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op54, xnn_delete_operator); - - xnn_operator_t op55 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op55); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #55" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op55, xnn_delete_operator); - - xnn_operator_t op56 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 3 /* input zero point */, 0.1223522424697876 /* input scale */, - 3 /* output zero point */, 0.1223522424697876 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op56); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #56" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op56, xnn_delete_operator); - - xnn_operator_t op57 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/120, - /*group_output_channels=*/32, - /*input_channel_stride=*/120, - /*output_channel_stride=*/32, - /*input_zero_point=*/(uint8_t) 3, - /*input_scale=*/0.1223522424697876, - /*kernel_zero_point=*/(uint8_t) 40, - /*kernel_scale=*/0.0008257423178292811, - /*kernel=*/w184.data(), /*bias=*/w185.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.025332391262054443, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op57); - if (status != xnn_status_success) { 
- std::cerr << "failed to create operation #57" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op57, xnn_delete_operator); - - xnn_operator_t op58 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/32, - /*group_output_channels=*/120, - /*input_channel_stride=*/32, - /*output_channel_stride=*/120, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.025332391262054443, - /*kernel_zero_point=*/(uint8_t) 220, - /*kernel_scale=*/0.0021832138299942017, - /*kernel=*/w186.data(), /*bias=*/w187.data(), - /*output_zero_point=*/(uint8_t) 139, - /*output_scale=*/0.026293933391571045, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op58); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #58" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op58, xnn_delete_operator); - - xnn_operator_t op59 = nullptr; - status = xnn_create_add_nd_qu8( - 139 /* input1 zero point */, 0.026293933391571045 /* input1 scale */, - 0 /* input2 zero point */, 0.0117647061124444 /* input2 scale */, - 0 /* output zero point */, 0.023528477177023888 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op59); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #59" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op59, xnn_delete_operator); - - xnn_operator_t op60 = nullptr; - status = xnn_create_multiply_nd_qu8( - 0 /* input1 zero point */, 0.023528477177023888 /* input1 scale */, - 0 /* input2 zero point */, 0.0006536078290082514 /* input2 scale */, - 0 /* output 
zero point */, 0.003921508323401213 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op60); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #60" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op60, xnn_delete_operator); - - xnn_operator_t op61 = nullptr; - status = xnn_create_multiply_nd_qu8( - 3 /* input1 zero point */, 0.1223522424697876 /* input1 scale */, - 0 /* input2 zero point */, 0.003921508323401213 /* input2 scale */, - 7 /* output zero point */, 0.04942065477371216 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op61); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #61" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op61, xnn_delete_operator); - - xnn_operator_t op62 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/120, - /*group_output_channels=*/48, - /*input_channel_stride=*/120, - /*output_channel_stride=*/48, - /*input_zero_point=*/(uint8_t) 7, - /*input_scale=*/0.04942065477371216, - /*kernel_zero_point=*/(uint8_t) 101, - /*kernel_scale=*/0.03507576882839203, - /*kernel=*/w190.data(), /*bias=*/w191.data(), - /*output_zero_point=*/(uint8_t) 129, - /*output_scale=*/0.39454951882362366, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op62); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #62" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op62, xnn_delete_operator); - - xnn_operator_t op63 = nullptr; - status = 
xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/48, - /*group_output_channels=*/144, - /*input_channel_stride=*/48, - /*output_channel_stride=*/144, - /*input_zero_point=*/(uint8_t) 129, - /*input_scale=*/0.39454951882362366, - /*kernel_zero_point=*/(uint8_t) 148, - /*kernel_scale=*/0.0015211983118206263, - /*kernel=*/w192.data(), /*bias=*/w193.data(), - /*output_zero_point=*/(uint8_t) 114, - /*output_scale=*/0.18048983812332153, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op63); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #63" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op63, xnn_delete_operator); - - xnn_operator_t op64 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op64); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #64" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op64, xnn_delete_operator); - - xnn_operator_t op65 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/2, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/2, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/144, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/144, - /*output_channel_stride=*/144, - /*input_zero_point=*/(uint8_t) 4, - /*input_scale=*/0.09509307891130447, - /*kernel_zero_point=*/(uint8_t) 115, - /*kernel_scale=*/0.0958247184753418, - /*kernel=*/w194.data(), 
/*bias=*/w195.data(), - /*output_zero_point=*/(uint8_t) 151, - /*output_scale=*/0.3922523558139801, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op65); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #65" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op65, xnn_delete_operator); - - xnn_operator_t op66 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op66); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #66" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op66, xnn_delete_operator); - - xnn_operator_t op67 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 2 /* input zero point */, 0.14624309539794922 /* input scale */, - 2 /* output zero point */, 0.14624309539794922 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op67); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #67" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op67, xnn_delete_operator); - - xnn_operator_t op68 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/144, - /*group_output_channels=*/40, - /*input_channel_stride=*/144, - /*output_channel_stride=*/40, - /*input_zero_point=*/(uint8_t) 2, - /*input_scale=*/0.14624309539794922, - /*kernel_zero_point=*/(uint8_t) 130, - /*kernel_scale=*/0.0060674939304590225, - /*kernel=*/w196.data(), /*bias=*/w197.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.008962834253907204, - /*output_min=*/(uint8_t) 0, 
/*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op68); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #68" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op68, xnn_delete_operator); - - xnn_operator_t op69 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/40, - /*group_output_channels=*/144, - /*input_channel_stride=*/40, - /*output_channel_stride=*/144, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.008962834253907204, - /*kernel_zero_point=*/(uint8_t) 124, - /*kernel_scale=*/0.004431542940437794, - /*kernel=*/w198.data(), /*bias=*/w199.data(), - /*output_zero_point=*/(uint8_t) 134, - /*output_scale=*/0.02729739062488079, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op69); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #69" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op69, xnn_delete_operator); - - xnn_operator_t op70 = nullptr; - status = xnn_create_add_nd_qu8( - 134 /* input1 zero point */, 0.02729739062488079 /* input1 scale */, - 0 /* input2 zero point */, 0.0117647061124444 /* input2 scale */, - 0 /* output zero point */, 0.023528477177023888 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op70); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #70" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op70, xnn_delete_operator); - - xnn_operator_t op71 = nullptr; - status = xnn_create_multiply_nd_qu8( - 
0 /* input1 zero point */, 0.023528477177023888 /* input1 scale */, - 0 /* input2 zero point */, 0.0006536078290082514 /* input2 scale */, - 0 /* output zero point */, 0.003921374212950468 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op71); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #71" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op71, xnn_delete_operator); - - xnn_operator_t op72 = nullptr; - status = xnn_create_multiply_nd_qu8( - 2 /* input1 zero point */, 0.14624309539794922 /* input1 scale */, - 0 /* input2 zero point */, 0.003921374212950468 /* input2 scale */, - 13 /* output zero point */, 0.023374175652861595 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op72); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #72" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op72, xnn_delete_operator); - - xnn_operator_t op73 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/144, - /*group_output_channels=*/48, - /*input_channel_stride=*/144, - /*output_channel_stride=*/48, - /*input_zero_point=*/(uint8_t) 13, - /*input_scale=*/0.023374175652861595, - /*kernel_zero_point=*/(uint8_t) 125, - /*kernel_scale=*/0.13331712782382965, - /*kernel=*/w202.data(), /*bias=*/w203.data(), - /*output_zero_point=*/(uint8_t) 140, - /*output_scale=*/0.42487239837646484, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op73); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #73" << 
std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op73, xnn_delete_operator); - - xnn_operator_t op74 = nullptr; - status = xnn_create_add_nd_qu8( - 140 /* input1 zero point */, 0.42487239837646484 /* input1 scale */, - 129 /* input2 zero point */, 0.39454951882362366 /* input2 scale */, - 137 /* output zero point */, 0.48052287101745605 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op74); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #74" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op74, xnn_delete_operator); - - xnn_operator_t op75 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/48, - /*group_output_channels=*/288, - /*input_channel_stride=*/48, - /*output_channel_stride=*/288, - /*input_zero_point=*/(uint8_t) 137, - /*input_scale=*/0.48052287101745605, - /*kernel_zero_point=*/(uint8_t) 132, - /*kernel_scale=*/0.0014037908986210823, - /*kernel=*/w204.data(), /*bias=*/w205.data(), - /*output_zero_point=*/(uint8_t) 113, - /*output_scale=*/0.14607380330562592, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op75); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #75" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op75, xnn_delete_operator); - - xnn_operator_t op76 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op76); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #76" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op76, 
xnn_delete_operator); - - xnn_operator_t op77 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/1, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/1, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/2, /*subsampling_width=*/2, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/288, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/288, - /*output_channel_stride=*/288, - /*input_zero_point=*/(uint8_t) 5, - /*input_scale=*/0.07805965095758438, - /*kernel_zero_point=*/(uint8_t) 105, - /*kernel_scale=*/0.035035692155361176, - /*kernel=*/w206.data(), /*bias=*/w207.data(), - /*output_zero_point=*/(uint8_t) 83, - /*output_scale=*/0.13729262351989746, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op77); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #77" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op77, xnn_delete_operator); - - xnn_operator_t op78 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op78); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #78" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op78, xnn_delete_operator); - - xnn_operator_t op79 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 4 /* input zero point */, 0.08693098276853561 /* input scale */, - 4 /* output zero point */, 0.08693098276853561 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op79); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #79" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op79, xnn_delete_operator); - - xnn_operator_t op80 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, 
/*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/288, - /*group_output_channels=*/72, - /*input_channel_stride=*/288, - /*output_channel_stride=*/72, - /*input_zero_point=*/(uint8_t) 4, - /*input_scale=*/0.08693098276853561, - /*kernel_zero_point=*/(uint8_t) 120, - /*kernel_scale=*/0.006728844251483679, - /*kernel=*/w208.data(), /*bias=*/w209.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.016604457050561905, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op80); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #80" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op80, xnn_delete_operator); - - xnn_operator_t op81 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/72, - /*group_output_channels=*/288, - /*input_channel_stride=*/72, - /*output_channel_stride=*/288, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.016604457050561905, - /*kernel_zero_point=*/(uint8_t) 152, - /*kernel_scale=*/0.005426046904176474, - /*kernel=*/w210.data(), /*bias=*/w211.data(), - /*output_zero_point=*/(uint8_t) 123, - /*output_scale=*/0.03482932597398758, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op81); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #81" << std::endl; - return 
ExecutionPlan(); - } - operators.emplace_back(op81, xnn_delete_operator); - - xnn_operator_t op82 = nullptr; - status = xnn_create_add_nd_qu8( - 123 /* input1 zero point */, 0.03482932597398758 /* input1 scale */, - 0 /* input2 zero point */, 0.0117647061124444 /* input2 scale */, - 0 /* output zero point */, 0.023528477177023888 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op82); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #82" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op82, xnn_delete_operator); - - xnn_operator_t op83 = nullptr; - status = xnn_create_multiply_nd_qu8( - 0 /* input1 zero point */, 0.023528477177023888 /* input1 scale */, - 0 /* input2 zero point */, 0.0006536078290082514 /* input2 scale */, - 0 /* output zero point */, 0.003921508323401213 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op83); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #83" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op83, xnn_delete_operator); - - xnn_operator_t op84 = nullptr; - status = xnn_create_multiply_nd_qu8( - 4 /* input1 zero point */, 0.08693098276853561 /* input1 scale */, - 0 /* input2 zero point */, 0.003921508323401213 /* input2 scale */, - 10 /* output zero point */, 0.03586701303720474 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op84); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #84" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op84, xnn_delete_operator); - - xnn_operator_t op85 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, 
/*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/288, - /*group_output_channels=*/96, - /*input_channel_stride=*/288, - /*output_channel_stride=*/96, - /*input_zero_point=*/(uint8_t) 10, - /*input_scale=*/0.03586701303720474, - /*kernel_zero_point=*/(uint8_t) 122, - /*kernel_scale=*/0.019641198217868805, - /*kernel=*/w214.data(), /*bias=*/w215.data(), - /*output_zero_point=*/(uint8_t) 130, - /*output_scale=*/0.2735706567764282, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op85); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #85" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op85, xnn_delete_operator); - - xnn_operator_t op86 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/96, - /*group_output_channels=*/576, - /*input_channel_stride=*/96, - /*output_channel_stride=*/576, - /*input_zero_point=*/(uint8_t) 130, - /*input_scale=*/0.2735706567764282, - /*kernel_zero_point=*/(uint8_t) 145, - /*kernel_scale=*/0.0017236428102478385, - /*kernel=*/w216.data(), /*bias=*/w217.data(), - /*output_zero_point=*/(uint8_t) 118, - /*output_scale=*/0.14194171130657196, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op86); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #86" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op86, xnn_delete_operator); - - xnn_operator_t op87 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op87); - if (status != 
xnn_status_success) { - std::cerr << "failed to create operation #87" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op87, xnn_delete_operator); - - xnn_operator_t op88 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/2, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/2, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/576, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/576, - /*output_channel_stride=*/576, - /*input_zero_point=*/(uint8_t) 5, - /*input_scale=*/0.07257640361785889, - /*kernel_zero_point=*/(uint8_t) 96, - /*kernel_scale=*/0.174177348613739, - /*kernel=*/w218.data(), /*bias=*/w219.data(), - /*output_zero_point=*/(uint8_t) 104, - /*output_scale=*/0.23463939130306244, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op88); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #88" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op88, xnn_delete_operator); - - xnn_operator_t op89 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op89); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #89" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op89, xnn_delete_operator); - - xnn_operator_t op90 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 3 /* input zero point */, 0.1241951510310173 /* input scale */, - 3 /* output zero point */, 0.1241951510310173 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op90); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #90" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op90, 
xnn_delete_operator); - - xnn_operator_t op91 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/576, - /*group_output_channels=*/144, - /*input_channel_stride=*/576, - /*output_channel_stride=*/144, - /*input_zero_point=*/(uint8_t) 3, - /*input_scale=*/0.1241951510310173, - /*kernel_zero_point=*/(uint8_t) 115, - /*kernel_scale=*/0.005609261337667704, - /*kernel=*/w220.data(), /*bias=*/w221.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.014928853139281273, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op91); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #91" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op91, xnn_delete_operator); - - xnn_operator_t op92 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/144, - /*group_output_channels=*/576, - /*input_channel_stride=*/144, - /*output_channel_stride=*/576, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.014928853139281273, - /*kernel_zero_point=*/(uint8_t) 91, - /*kernel_scale=*/0.008804556913673878, - /*kernel=*/w222.data(), /*bias=*/w223.data(), - /*output_zero_point=*/(uint8_t) 129, - /*output_scale=*/0.04489157348871231, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - 
/*weights_cache=*/nullptr, - &op92); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #92" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op92, xnn_delete_operator); - - xnn_operator_t op93 = nullptr; - status = xnn_create_add_nd_qu8( - 129 /* input1 zero point */, 0.04489157348871231 /* input1 scale */, - 0 /* input2 zero point */, 0.0117647061124444 /* input2 scale */, - 0 /* output zero point */, 0.023463299497961998 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op93); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #93" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op93, xnn_delete_operator); - - xnn_operator_t op94 = nullptr; - status = xnn_create_multiply_nd_qu8( - 0 /* input1 zero point */, 0.023463299497961998 /* input1 scale */, - 0 /* input2 zero point */, 0.0006536078290082514 /* input2 scale */, - 0 /* output zero point */, 0.003899596631526947 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op94); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #94" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op94, xnn_delete_operator); - - xnn_operator_t op95 = nullptr; - status = xnn_create_multiply_nd_qu8( - 3 /* input1 zero point */, 0.1241951510310173 /* input1 scale */, - 0 /* input2 zero point */, 0.003899596631526947 /* input2 scale */, - 15 /* output zero point */, 0.023340530693531036 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op95); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #95" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op95, xnn_delete_operator); - - xnn_operator_t op96 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, 
/*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/576, - /*group_output_channels=*/96, - /*input_channel_stride=*/576, - /*output_channel_stride=*/96, - /*input_zero_point=*/(uint8_t) 15, - /*input_scale=*/0.023340530693531036, - /*kernel_zero_point=*/(uint8_t) 132, - /*kernel_scale=*/0.11193376779556274, - /*kernel=*/w226.data(), /*bias=*/w227.data(), - /*output_zero_point=*/(uint8_t) 131, - /*output_scale=*/0.3130902945995331, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op96); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #96" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op96, xnn_delete_operator); - - xnn_operator_t op97 = nullptr; - status = xnn_create_add_nd_qu8( - 131 /* input1 zero point */, 0.3130902945995331 /* input1 scale */, - 130 /* input2 zero point */, 0.2735706567764282 /* input2 scale */, - 130 /* output zero point */, 0.3734561800956726 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op97); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #97" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op97, xnn_delete_operator); - - xnn_operator_t op98 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/96, - /*group_output_channels=*/576, - /*input_channel_stride=*/96, - /*output_channel_stride=*/576, - /*input_zero_point=*/(uint8_t) 130, - 
/*input_scale=*/0.3734561800956726, - /*kernel_zero_point=*/(uint8_t) 153, - /*kernel_scale=*/0.0030694138258695602, - /*kernel=*/w228.data(), /*bias=*/w229.data(), - /*output_zero_point=*/(uint8_t) 157, - /*output_scale=*/0.3907496929168701, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op98); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #98" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op98, xnn_delete_operator); - - xnn_operator_t op99 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op99); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #99" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op99, xnn_delete_operator); - - xnn_operator_t op100 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/2, /*input_padding_right=*/2, - /*input_padding_bottom=*/2, /*input_padding_left=*/2, - /*kernel_height=*/5, /*kernel_width=*/5, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/576, - /*group_input_channels=*/1, - /*group_output_channels=*/1, - /*input_channel_stride=*/576, - /*output_channel_stride=*/576, - /*input_zero_point=*/(uint8_t) 3, - /*input_scale=*/0.1398279368877411, - /*kernel_zero_point=*/(uint8_t) 218, - /*kernel_scale=*/2.1697041988372803, - /*kernel=*/w230.data(), /*bias=*/w231.data(), - /*output_zero_point=*/(uint8_t) 110, - /*output_scale=*/0.6755003929138184, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op100); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #100" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op100, xnn_delete_operator); - - xnn_operator_t op101 = nullptr; - status = 
xnn_create_copy_nc_x8( - 0 /* flags */, - &op101); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #101" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op101, xnn_delete_operator); - - xnn_operator_t op102 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 1 /* input zero point */, 0.3347671329975128 /* input scale */, - 1 /* output zero point */, 0.3347671329975128 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op102); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #102" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op102, xnn_delete_operator); - - xnn_operator_t op103 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/576, - /*group_output_channels=*/144, - /*input_channel_stride=*/576, - /*output_channel_stride=*/144, - /*input_zero_point=*/(uint8_t) 1, - /*input_scale=*/0.3347671329975128, - /*kernel_zero_point=*/(uint8_t) 96, - /*kernel_scale=*/0.006274337414652109, - /*kernel=*/w232.data(), /*bias=*/w233.data(), - /*output_zero_point=*/(uint8_t) 0, - /*output_scale=*/0.04336833581328392, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op103); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #103" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op103, xnn_delete_operator); - - xnn_operator_t op104 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, 
/*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/144, - /*group_output_channels=*/576, - /*input_channel_stride=*/144, - /*output_channel_stride=*/576, - /*input_zero_point=*/(uint8_t) 0, - /*input_scale=*/0.04336833581328392, - /*kernel_zero_point=*/(uint8_t) 91, - /*kernel_scale=*/0.008546789176762104, - /*kernel=*/w234.data(), /*bias=*/w235.data(), - /*output_zero_point=*/(uint8_t) 115, - /*output_scale=*/0.09501760452985764, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op104); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #104" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op104, xnn_delete_operator); - - xnn_operator_t op105 = nullptr; - status = xnn_create_add_nd_qu8( - 115 /* input1 zero point */, 0.09501760452985764 /* input1 scale */, - 0 /* input2 zero point */, 0.0117647061124444 /* input2 scale */, - 0 /* output zero point */, 0.023528477177023888 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op105); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #105" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op105, xnn_delete_operator); - - xnn_operator_t op106 = nullptr; - status = xnn_create_multiply_nd_qu8( - 0 /* input1 zero point */, 0.023528477177023888 /* input1 scale */, - 0 /* input2 zero point */, 0.0006536078290082514 /* input2 scale */, - 0 /* output zero point */, 0.003921508323401213 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op106); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #106" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op106, 
xnn_delete_operator); - - xnn_operator_t op107 = nullptr; - status = xnn_create_multiply_nd_qu8( - 1 /* input1 zero point */, 0.3347671329975128 /* input1 scale */, - 0 /* input2 zero point */, 0.003921508323401213 /* input2 scale */, - 2 /* output zero point */, 0.19521307945251465 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op107); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #107" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op107, xnn_delete_operator); - - xnn_operator_t op108 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/576, - /*group_output_channels=*/96, - /*input_channel_stride=*/576, - /*output_channel_stride=*/96, - /*input_zero_point=*/(uint8_t) 2, - /*input_scale=*/0.19521307945251465, - /*kernel_zero_point=*/(uint8_t) 130, - /*kernel_scale=*/0.02609884925186634, - /*kernel=*/w238.data(), /*bias=*/w239.data(), - /*output_zero_point=*/(uint8_t) 129, - /*output_scale=*/0.7081664800643921, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op108); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #108" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op108, xnn_delete_operator); - - xnn_operator_t op109 = nullptr; - status = xnn_create_add_nd_qu8( - 129 /* input1 zero point */, 0.7081664800643921 /* input1 scale */, - 130 /* input2 zero point */, 0.3734561800956726 /* input2 scale */, - 127 /* output zero point */, 0.808801531791687 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - 
&op109); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #109" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op109, xnn_delete_operator); - - xnn_operator_t op110 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/96, - /*group_output_channels=*/576, - /*input_channel_stride=*/96, - /*output_channel_stride=*/576, - /*input_zero_point=*/(uint8_t) 127, - /*input_scale=*/0.808801531791687, - /*kernel_zero_point=*/(uint8_t) 142, - /*kernel_scale=*/0.003396135289222002, - /*kernel=*/w240.data(), /*bias=*/w241.data(), - /*output_zero_point=*/(uint8_t) 131, - /*output_scale=*/0.9106870889663696, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op110); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #110" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op110, xnn_delete_operator); - - xnn_operator_t op111 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op111); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #111" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op111, xnn_delete_operator); - - xnn_operator_t op112 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 1 /* input zero point */, 0.40212398767471313 /* input scale */, - 1 /* output zero point */, 0.40212398767471313 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op112); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #112" << std::endl; - return ExecutionPlan(); - } 
- operators.emplace_back(op112, xnn_delete_operator); - - xnn_operator_t op113 = nullptr; - status = xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/576, - /*group_output_channels=*/1024, - /*input_channel_stride=*/576, - /*output_channel_stride=*/1024, - /*input_zero_point=*/(uint8_t) 1, - /*input_scale=*/0.40212398767471313, - /*kernel_zero_point=*/(uint8_t) 97, - /*kernel_scale=*/0.006370874121785164, - /*kernel=*/w242.data(), /*bias=*/w243.data(), - /*output_zero_point=*/(uint8_t) 170, - /*output_scale=*/0.05783478170633316, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op113); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #113" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op113, xnn_delete_operator); - - xnn_operator_t op114 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op114); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #114" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op114, xnn_delete_operator); - - xnn_operator_t op115 = nullptr; - status = xnn_create_global_average_pooling_nwc_qu8( - 19 /* input zero point */, 0.01954001374542713 /* input scale */, - 19 /* output zero point */, 0.01954001374542713 /* output scale */, - 0 /* output min */, 255 /* output max */, - 0 /* flags */, - &op115); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #115" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op115, xnn_delete_operator); - - xnn_operator_t op116 = nullptr; - status = 
xnn_create_convolution2d_nhwc_qu8( - /*input_padding_top=*/0, /*input_padding_right=*/0, - /*input_padding_bottom=*/0, /*input_padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/1024, - /*group_output_channels=*/1001, - /*input_channel_stride=*/1024, - /*output_channel_stride=*/1001, - /*input_zero_point=*/(uint8_t) 19, - /*input_scale=*/0.01954001374542713, - /*kernel_zero_point=*/(uint8_t) 113, - /*kernel_scale=*/0.0029929860029369593, - /*kernel=*/w244.data(), /*bias=*/w245.data(), - /*output_zero_point=*/(uint8_t) 77, - /*output_scale=*/0.07862140238285065, - /*output_min=*/(uint8_t) 0, /*output_max=*/(uint8_t) 255, - /*flags=*/0, - /*code_cache=*/code_cache_ptr, - /*weights_cache=*/nullptr, - &op116); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #116" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op116, xnn_delete_operator); - - xnn_operator_t op117 = nullptr; - status = xnn_create_copy_nc_x8( - 0 /* flags */, - &op117); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #117" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op117, xnn_delete_operator); - - xnn_operator_t op118 = nullptr; - status = xnn_create_softmax_nc_qu8( - /*input_scale=*/0.07862140238285065, - /*output_zero_point=*/0, - /*output_scale=*/0.00390625, - /*flags=*/0, - &op118); - if (status != xnn_status_success) { - std::cerr << "failed to create operation #118" << std::endl; - return ExecutionPlan(); - } - operators.emplace_back(op118, xnn_delete_operator); - - size_t op0_workspace_size = 0; - size_t op0_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op0, - /*batch_size=*/1, /*input_height=*/224, /*input_width=*/224, - &op0_workspace_size, &op0_workspace_alignment, - /*output_height_out=*/nullptr, 
/*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op0_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op1, - /*batch_size=*/12544, - 16 /* channels */, - 16 /* input stride */, - 16 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #1" << std::endl; - return ExecutionPlan(); - } - - size_t op2_workspace_size = 0; - size_t op2_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op2, - /*batch_size=*/1, /*input_height=*/112, /*input_width=*/112, - &op2_workspace_size, &op2_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op2_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #2" << std::endl; - return ExecutionPlan(); - } - - size_t op3_workspace_size = 0; - size_t op3_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op3, - /*batch_size=*/1, 3136 /* width */, - 16 /* channels */, 16 /* input stride */, 16 /* output stride */, - &op3_workspace_size, &op3_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op3_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #3" << std::endl; - return ExecutionPlan(); - } - - size_t op4_workspace_size = 0; - size_t op4_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op4, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op4_workspace_size, &op4_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = 
std::max(max_workspace_size, op4_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #4" << std::endl; - return ExecutionPlan(); - } - - size_t op5_workspace_size = 0; - size_t op5_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op5, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op5_workspace_size, &op5_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op5_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #5" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 16 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_add_nd_qu8( - op6, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #6" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 16 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_multiply_nd_qu8( - op7, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #7" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 56, 56, 16 }; - const size_t b_shape[] = { 1, 1, 1, 16 }; - status = xnn_reshape_multiply_nd_qu8( - op8, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #8" << std::endl; - return ExecutionPlan(); - } - - size_t op9_workspace_size = 0; - size_t op9_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op9, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op9_workspace_size, &op9_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - 
/*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op9_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #9" << std::endl; - return ExecutionPlan(); - } - - size_t op10_workspace_size = 0; - size_t op10_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op10, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op10_workspace_size, &op10_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op10_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #10" << std::endl; - return ExecutionPlan(); - } - - size_t op11_workspace_size = 0; - size_t op11_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op11, - /*batch_size=*/1, /*input_height=*/56, /*input_width=*/56, - &op11_workspace_size, &op11_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op11_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #11" << std::endl; - return ExecutionPlan(); - } - - size_t op12_workspace_size = 0; - size_t op12_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op12, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op12_workspace_size, &op12_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op12_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #12" << std::endl; - return ExecutionPlan(); - } - - size_t op13_workspace_size = 0; - size_t op13_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op13, - 
/*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op13_workspace_size, &op13_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op13_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #13" << std::endl; - return ExecutionPlan(); - } - - size_t op14_workspace_size = 0; - size_t op14_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op14, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op14_workspace_size, &op14_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op14_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #14" << std::endl; - return ExecutionPlan(); - } - - size_t op15_workspace_size = 0; - size_t op15_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op15, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op15_workspace_size, &op15_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op15_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #15" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 28, 28, 24 }; - const size_t b_shape[] = { 1, 28, 28, 24 }; - status = xnn_reshape_add_nd_qu8( - op16, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #16" << std::endl; - return ExecutionPlan(); - } - - size_t op17_workspace_size = 0; - size_t op17_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op17, - /*batch_size=*/1, /*input_height=*/28, 
/*input_width=*/28, - &op17_workspace_size, &op17_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op17_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op18, - /*batch_size=*/784, - 96 /* channels */, - 96 /* input stride */, - 96 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #18" << std::endl; - return ExecutionPlan(); - } - - size_t op19_workspace_size = 0; - size_t op19_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op19, - /*batch_size=*/1, /*input_height=*/28, /*input_width=*/28, - &op19_workspace_size, &op19_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op19_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op20, - /*batch_size=*/196, - 96 /* channels */, - 96 /* input stride */, - 96 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #20" << std::endl; - return ExecutionPlan(); - } - - size_t op21_workspace_size = 0; - size_t op21_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op21, - /*batch_size=*/1, 196 /* width */, - 96 /* channels */, 96 /* input stride */, 96 /* output stride */, - &op21_workspace_size, &op21_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op21_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #21" << 
std::endl; - return ExecutionPlan(); - } - - size_t op22_workspace_size = 0; - size_t op22_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op22, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op22_workspace_size, &op22_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op22_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #22" << std::endl; - return ExecutionPlan(); - } - - size_t op23_workspace_size = 0; - size_t op23_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op23, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op23_workspace_size, &op23_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op23_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #23" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 96 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_add_nd_qu8( - op24, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #24" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 96 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_multiply_nd_qu8( - op25, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #25" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 96 }; - const size_t b_shape[] = { 1, 1, 1, 96 }; - status = xnn_reshape_multiply_nd_qu8( - op26, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { 
- std::cerr << "failed to reshape operation #26" << std::endl; - return ExecutionPlan(); - } - - size_t op27_workspace_size = 0; - size_t op27_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op27, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op27_workspace_size, &op27_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op27_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #27" << std::endl; - return ExecutionPlan(); - } - - size_t op28_workspace_size = 0; - size_t op28_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op28, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op28_workspace_size, &op28_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op28_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op29, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #29" << std::endl; - return ExecutionPlan(); - } - - size_t op30_workspace_size = 0; - size_t op30_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op30, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op30_workspace_size, &op30_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op30_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #30" << std::endl; - return 
ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op31, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #31" << std::endl; - return ExecutionPlan(); - } - - size_t op32_workspace_size = 0; - size_t op32_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op32, - /*batch_size=*/1, 196 /* width */, - 240 /* channels */, 240 /* input stride */, 240 /* output stride */, - &op32_workspace_size, &op32_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op32_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #32" << std::endl; - return ExecutionPlan(); - } - - size_t op33_workspace_size = 0; - size_t op33_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op33, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op33_workspace_size, &op33_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op33_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #33" << std::endl; - return ExecutionPlan(); - } - - size_t op34_workspace_size = 0; - size_t op34_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op34, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op34_workspace_size, &op34_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op34_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #34" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 240 }; - const size_t 
b_shape[] = { 1 }; - status = xnn_reshape_add_nd_qu8( - op35, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #35" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 240 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_multiply_nd_qu8( - op36, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #36" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 240 }; - const size_t b_shape[] = { 1, 1, 1, 240 }; - status = xnn_reshape_multiply_nd_qu8( - op37, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #37" << std::endl; - return ExecutionPlan(); - } - - size_t op38_workspace_size = 0; - size_t op38_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op38, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op38_workspace_size, &op38_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op38_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #38" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 40 }; - const size_t b_shape[] = { 1, 14, 14, 40 }; - status = xnn_reshape_add_nd_qu8( - op39, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #39" << std::endl; - return ExecutionPlan(); - } - - size_t op40_workspace_size = 0; - size_t op40_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op40, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - 
&op40_workspace_size, &op40_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op40_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op41, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #41" << std::endl; - return ExecutionPlan(); - } - - size_t op42_workspace_size = 0; - size_t op42_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op42, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op42_workspace_size, &op42_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op42_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op43, - /*batch_size=*/196, - 240 /* channels */, - 240 /* input stride */, - 240 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #43" << std::endl; - return ExecutionPlan(); - } - - size_t op44_workspace_size = 0; - size_t op44_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op44, - /*batch_size=*/1, 196 /* width */, - 240 /* channels */, 240 /* input stride */, 240 /* output stride */, - &op44_workspace_size, &op44_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op44_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #44" << std::endl; - 
return ExecutionPlan(); - } - - size_t op45_workspace_size = 0; - size_t op45_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op45, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op45_workspace_size, &op45_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op45_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #45" << std::endl; - return ExecutionPlan(); - } - - size_t op46_workspace_size = 0; - size_t op46_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op46, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op46_workspace_size, &op46_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op46_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #46" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 240 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_add_nd_qu8( - op47, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #47" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 240 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_multiply_nd_qu8( - op48, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #48" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 240 }; - const size_t b_shape[] = { 1, 1, 1, 240 }; - status = xnn_reshape_multiply_nd_qu8( - op49, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - 
std::cerr << "failed to reshape operation #49" << std::endl; - return ExecutionPlan(); - } - - size_t op50_workspace_size = 0; - size_t op50_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op50, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op50_workspace_size, &op50_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op50_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #50" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 40 }; - const size_t b_shape[] = { 1, 14, 14, 40 }; - status = xnn_reshape_add_nd_qu8( - op51, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #51" << std::endl; - return ExecutionPlan(); - } - - size_t op52_workspace_size = 0; - size_t op52_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op52, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op52_workspace_size, &op52_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op52_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op53, - /*batch_size=*/196, - 120 /* channels */, - 120 /* input stride */, - 120 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #53" << std::endl; - return ExecutionPlan(); - } - - size_t op54_workspace_size = 0; - size_t op54_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op54, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - 
&op54_workspace_size, &op54_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op54_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op55, - /*batch_size=*/196, - 120 /* channels */, - 120 /* input stride */, - 120 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #55" << std::endl; - return ExecutionPlan(); - } - - size_t op56_workspace_size = 0; - size_t op56_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op56, - /*batch_size=*/1, 196 /* width */, - 120 /* channels */, 120 /* input stride */, 120 /* output stride */, - &op56_workspace_size, &op56_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op56_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #56" << std::endl; - return ExecutionPlan(); - } - - size_t op57_workspace_size = 0; - size_t op57_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op57, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op57_workspace_size, &op57_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op57_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #57" << std::endl; - return ExecutionPlan(); - } - - size_t op58_workspace_size = 0; - size_t op58_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op58, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op58_workspace_size, &op58_workspace_alignment, - /*output_height_out=*/nullptr, 
/*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op58_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #58" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 120 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_add_nd_qu8( - op59, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #59" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 120 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_multiply_nd_qu8( - op60, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #60" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 120 }; - const size_t b_shape[] = { 1, 1, 1, 120 }; - status = xnn_reshape_multiply_nd_qu8( - op61, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #61" << std::endl; - return ExecutionPlan(); - } - - size_t op62_workspace_size = 0; - size_t op62_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op62, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op62_workspace_size, &op62_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op62_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #62" << std::endl; - return ExecutionPlan(); - } - - size_t op63_workspace_size = 0; - size_t op63_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op63, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op63_workspace_size, 
&op63_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op63_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #63" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op64, - /*batch_size=*/196, - 144 /* channels */, - 144 /* input stride */, - 144 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #64" << std::endl; - return ExecutionPlan(); - } - - size_t op65_workspace_size = 0; - size_t op65_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op65, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op65_workspace_size, &op65_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op65_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #65" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op66, - /*batch_size=*/196, - 144 /* channels */, - 144 /* input stride */, - 144 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #66" << std::endl; - return ExecutionPlan(); - } - - size_t op67_workspace_size = 0; - size_t op67_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op67, - /*batch_size=*/1, 196 /* width */, - 144 /* channels */, 144 /* input stride */, 144 /* output stride */, - &op67_workspace_size, &op67_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op67_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #67" << std::endl; - return ExecutionPlan(); 
- } - - size_t op68_workspace_size = 0; - size_t op68_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op68, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op68_workspace_size, &op68_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op68_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #68" << std::endl; - return ExecutionPlan(); - } - - size_t op69_workspace_size = 0; - size_t op69_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op69, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op69_workspace_size, &op69_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op69_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #69" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 144 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_add_nd_qu8( - op70, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #70" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 144 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_multiply_nd_qu8( - op71, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #71" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 144 }; - const size_t b_shape[] = { 1, 1, 1, 144 }; - status = xnn_reshape_multiply_nd_qu8( - op72, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to 
reshape operation #72" << std::endl; - return ExecutionPlan(); - } - - size_t op73_workspace_size = 0; - size_t op73_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op73, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op73_workspace_size, &op73_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op73_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #73" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 14, 14, 48 }; - const size_t b_shape[] = { 1, 14, 14, 48 }; - status = xnn_reshape_add_nd_qu8( - op74, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #74" << std::endl; - return ExecutionPlan(); - } - - size_t op75_workspace_size = 0; - size_t op75_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op75, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op75_workspace_size, &op75_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op75_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #75" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op76, - /*batch_size=*/196, - 288 /* channels */, - 288 /* input stride */, - 288 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #76" << std::endl; - return ExecutionPlan(); - } - - size_t op77_workspace_size = 0; - size_t op77_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op77, - /*batch_size=*/1, /*input_height=*/14, /*input_width=*/14, - &op77_workspace_size, 
&op77_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op77_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #77" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op78, - /*batch_size=*/49, - 288 /* channels */, - 288 /* input stride */, - 288 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #78" << std::endl; - return ExecutionPlan(); - } - - size_t op79_workspace_size = 0; - size_t op79_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op79, - /*batch_size=*/1, 49 /* width */, - 288 /* channels */, 288 /* input stride */, 288 /* output stride */, - &op79_workspace_size, &op79_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op79_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #79" << std::endl; - return ExecutionPlan(); - } - - size_t op80_workspace_size = 0; - size_t op80_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op80, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op80_workspace_size, &op80_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op80_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #80" << std::endl; - return ExecutionPlan(); - } - - size_t op81_workspace_size = 0; - size_t op81_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op81, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op81_workspace_size, &op81_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, 
- /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op81_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #81" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 288 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_add_nd_qu8( - op82, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #82" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 288 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_multiply_nd_qu8( - op83, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #83" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 288 }; - const size_t b_shape[] = { 1, 1, 1, 288 }; - status = xnn_reshape_multiply_nd_qu8( - op84, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #84" << std::endl; - return ExecutionPlan(); - } - - size_t op85_workspace_size = 0; - size_t op85_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op85, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op85_workspace_size, &op85_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op85_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #85" << std::endl; - return ExecutionPlan(); - } - - size_t op86_workspace_size = 0; - size_t op86_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op86, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op86_workspace_size, &op86_workspace_alignment, - 
/*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op86_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #86" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op87, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #87" << std::endl; - return ExecutionPlan(); - } - - size_t op88_workspace_size = 0; - size_t op88_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op88, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op88_workspace_size, &op88_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op88_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #88" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op89, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #89" << std::endl; - return ExecutionPlan(); - } - - size_t op90_workspace_size = 0; - size_t op90_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op90, - /*batch_size=*/1, 49 /* width */, - 576 /* channels */, 576 /* input stride */, 576 /* output stride */, - &op90_workspace_size, &op90_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op90_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #90" << std::endl; - return ExecutionPlan(); - } - - size_t op91_workspace_size 
= 0; - size_t op91_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op91, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op91_workspace_size, &op91_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op91_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #91" << std::endl; - return ExecutionPlan(); - } - - size_t op92_workspace_size = 0; - size_t op92_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op92, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op92_workspace_size, &op92_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op92_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #92" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 576 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_add_nd_qu8( - op93, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #93" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 576 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_multiply_nd_qu8( - op94, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #94" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 576 }; - const size_t b_shape[] = { 1, 1, 1, 576 }; - status = xnn_reshape_multiply_nd_qu8( - op95, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #95" << std::endl; - 
return ExecutionPlan(); - } - - size_t op96_workspace_size = 0; - size_t op96_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op96, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op96_workspace_size, &op96_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op96_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #96" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 96 }; - const size_t b_shape[] = { 1, 7, 7, 96 }; - status = xnn_reshape_add_nd_qu8( - op97, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #97" << std::endl; - return ExecutionPlan(); - } - - size_t op98_workspace_size = 0; - size_t op98_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op98, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op98_workspace_size, &op98_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op98_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #98" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op99, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #99" << std::endl; - return ExecutionPlan(); - } - - size_t op100_workspace_size = 0; - size_t op100_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op100, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op100_workspace_size, &op100_workspace_alignment, - 
/*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op100_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #100" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op101, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #101" << std::endl; - return ExecutionPlan(); - } - - size_t op102_workspace_size = 0; - size_t op102_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op102, - /*batch_size=*/1, 49 /* width */, - 576 /* channels */, 576 /* input stride */, 576 /* output stride */, - &op102_workspace_size, &op102_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op102_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #102" << std::endl; - return ExecutionPlan(); - } - - size_t op103_workspace_size = 0; - size_t op103_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op103, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op103_workspace_size, &op103_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op103_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #103" << std::endl; - return ExecutionPlan(); - } - - size_t op104_workspace_size = 0; - size_t op104_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op104, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op104_workspace_size, &op104_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - 
/*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op104_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #104" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 576 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_add_nd_qu8( - op105, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #105" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 1, 1, 576 }; - const size_t b_shape[] = { 1 }; - status = xnn_reshape_multiply_nd_qu8( - op106, - 4, a_shape, 1, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #106" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 576 }; - const size_t b_shape[] = { 1, 1, 1, 576 }; - status = xnn_reshape_multiply_nd_qu8( - op107, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #107" << std::endl; - return ExecutionPlan(); - } - - size_t op108_workspace_size = 0; - size_t op108_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op108, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op108_workspace_size, &op108_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op108_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #108" << std::endl; - return ExecutionPlan(); - } - - { - const size_t a_shape[] = { 1, 7, 7, 96 }; - const size_t b_shape[] = { 1, 7, 7, 96 }; - status = xnn_reshape_add_nd_qu8( - op109, - 4, a_shape, 4, b_shape, - /*threadpool=*/threadpool); - } - if (status != xnn_status_success) { 
- std::cerr << "failed to reshape operation #109" << std::endl; - return ExecutionPlan(); - } - - size_t op110_workspace_size = 0; - size_t op110_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op110, - /*batch_size=*/1, /*input_height=*/7, /*input_width=*/7, - &op110_workspace_size, &op110_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op110_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #110" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op111, - /*batch_size=*/49, - 576 /* channels */, - 576 /* input stride */, - 576 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #111" << std::endl; - return ExecutionPlan(); - } - - size_t op112_workspace_size = 0; - size_t op112_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op112, - /*batch_size=*/1, 49 /* width */, - 576 /* channels */, 576 /* input stride */, 576 /* output stride */, - &op112_workspace_size, &op112_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op112_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #112" << std::endl; - return ExecutionPlan(); - } - - size_t op113_workspace_size = 0; - size_t op113_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op113, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op113_workspace_size, &op113_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op113_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #113" << std::endl; - 
return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op114, - /*batch_size=*/1, - 1024 /* channels */, - 1024 /* input stride */, - 1024 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #114" << std::endl; - return ExecutionPlan(); - } - - size_t op115_workspace_size = 0; - size_t op115_workspace_alignment = 0; - status = xnn_reshape_global_average_pooling_nwc_qu8( - op115, - /*batch_size=*/1, 1 /* width */, - 1024 /* channels */, 1024 /* input stride */, 1024 /* output stride */, - &op115_workspace_size, &op115_workspace_alignment, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op115_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #115" << std::endl; - return ExecutionPlan(); - } - - size_t op116_workspace_size = 0; - size_t op116_workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - op116, - /*batch_size=*/1, /*input_height=*/1, /*input_width=*/1, - &op116_workspace_size, &op116_workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/threadpool); - max_workspace_size = std::max(max_workspace_size, op116_workspace_size); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #116" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_copy_nc_x8( - op117, - /*batch_size=*/1001, - 1 /* channels */, - 1 /* input stride */, - 1 /* output stride */, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #117" << std::endl; - return ExecutionPlan(); - } - - status = xnn_reshape_softmax_nc_qu8( - op118, - /*channels=*/1001, - /*input_stride=*/1001, - /*output_stride=*/1001, - /*batch_size=*/1, - /*threadpool=*/threadpool); - if (status != xnn_status_success) { - std::cerr << "failed to reshape operation #118" << std::endl; - 
return ExecutionPlan(); - } - - Workspace workspace(max_workspace_size); - - status = xnn_setup_convolution2d_nhwc_qu8( - op0, - workspace.data(), /*input=*/v0.data(), /*output=*/v1.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #0" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op1, - /*input=*/v1.data(), /*output=*/v2.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #1" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op2, - workspace.data(), /*input=*/v2.data(), /*output=*/v3.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #2" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op3, - workspace.data(), - /*input=*/v3.data(), /*output=*/v4.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #3" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op4, - workspace.data(), /*input=*/v4.data(), /*output=*/v5.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #4" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op5, - workspace.data(), /*input=*/v5.data(), /*output=*/v6.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #5" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op6, - v6.data() /* a */, w128.data() /* b */, /*output=*/v7.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #6" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op7, - v7.data() /* a */, w129.data() /* b */, /*output=*/v8.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #7" << std::endl; - return ExecutionPlan(); 
- } - - status = xnn_setup_multiply_nd_qu8( - op8, - v3.data() /* a */, v8.data() /* b */, /*output=*/v9.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #8" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op9, - workspace.data(), /*input=*/v9.data(), /*output=*/v10.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #9" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op10, - workspace.data(), /*input=*/v10.data(), /*output=*/v11.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #10" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op11, - workspace.data(), /*input=*/v11.data(), /*output=*/v12.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #11" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op12, - workspace.data(), /*input=*/v12.data(), /*output=*/v13.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #12" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op13, - workspace.data(), /*input=*/v13.data(), /*output=*/v14.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #13" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op14, - workspace.data(), /*input=*/v14.data(), /*output=*/v15.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #14" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op15, - workspace.data(), /*input=*/v15.data(), /*output=*/v16.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #15" << std::endl; - return ExecutionPlan(); - } - - status 
= xnn_setup_add_nd_qu8( - op16, - v16.data() /* a */, v13.data() /* b */, /*output=*/v17.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #16" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op17, - workspace.data(), /*input=*/v17.data(), /*output=*/v18.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #17" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op18, - /*input=*/v18.data(), /*output=*/v19.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #18" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op19, - workspace.data(), /*input=*/v19.data(), /*output=*/v20.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #19" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op20, - /*input=*/v20.data(), /*output=*/v21.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #20" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op21, - workspace.data(), - /*input=*/v21.data(), /*output=*/v22.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #21" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op22, - workspace.data(), /*input=*/v22.data(), /*output=*/v23.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #22" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op23, - workspace.data(), /*input=*/v23.data(), /*output=*/v24.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #23" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op24, - v24.data() /* a */, 
w152.data() /* b */, /*output=*/v25.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #24" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op25, - v25.data() /* a */, w153.data() /* b */, /*output=*/v26.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #25" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op26, - v21.data() /* a */, v26.data() /* b */, /*output=*/v27.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #26" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op27, - workspace.data(), /*input=*/v27.data(), /*output=*/v28.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #27" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op28, - workspace.data(), /*input=*/v28.data(), /*output=*/v29.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #28" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op29, - /*input=*/v29.data(), /*output=*/v30.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #29" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op30, - workspace.data(), /*input=*/v30.data(), /*output=*/v31.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #30" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op31, - /*input=*/v31.data(), /*output=*/v32.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #31" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op32, - workspace.data(), - /*input=*/v32.data(), /*output=*/v33.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #32" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op33, - workspace.data(), /*input=*/v33.data(), /*output=*/v34.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #33" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op34, - workspace.data(), /*input=*/v34.data(), /*output=*/v35.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #34" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op35, - v35.data() /* a */, w164.data() /* b */, /*output=*/v36.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #35" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op36, - v36.data() /* a */, w165.data() /* b */, /*output=*/v37.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #36" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op37, - v32.data() /* a */, v37.data() /* b */, /*output=*/v38.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #37" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op38, - workspace.data(), /*input=*/v38.data(), /*output=*/v39.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #38" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op39, - v39.data() /* a */, v28.data() /* b */, /*output=*/v40.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #39" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op40, - workspace.data(), /*input=*/v40.data(), /*output=*/v41.data()); - if (status != xnn_status_success) { - std::cerr 
<< "failed to setup operation #40" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op41, - /*input=*/v41.data(), /*output=*/v42.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #41" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op42, - workspace.data(), /*input=*/v42.data(), /*output=*/v43.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #42" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op43, - /*input=*/v43.data(), /*output=*/v44.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #43" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op44, - workspace.data(), - /*input=*/v44.data(), /*output=*/v45.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #44" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op45, - workspace.data(), /*input=*/v45.data(), /*output=*/v46.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #45" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op46, - workspace.data(), /*input=*/v46.data(), /*output=*/v47.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #46" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op47, - v47.data() /* a */, w176.data() /* b */, /*output=*/v48.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #47" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op48, - v48.data() /* a */, w177.data() /* b */, /*output=*/v49.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #48" << std::endl; - return 
ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op49, - v44.data() /* a */, v49.data() /* b */, /*output=*/v50.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #49" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op50, - workspace.data(), /*input=*/v50.data(), /*output=*/v51.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #50" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op51, - v51.data() /* a */, v40.data() /* b */, /*output=*/v52.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #51" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op52, - workspace.data(), /*input=*/v52.data(), /*output=*/v53.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #52" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op53, - /*input=*/v53.data(), /*output=*/v54.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #53" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op54, - workspace.data(), /*input=*/v54.data(), /*output=*/v55.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #54" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op55, - /*input=*/v55.data(), /*output=*/v56.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #55" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op56, - workspace.data(), - /*input=*/v56.data(), /*output=*/v57.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #56" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( 
- op57, - workspace.data(), /*input=*/v57.data(), /*output=*/v58.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #57" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op58, - workspace.data(), /*input=*/v58.data(), /*output=*/v59.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #58" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op59, - v59.data() /* a */, w188.data() /* b */, /*output=*/v60.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #59" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op60, - v60.data() /* a */, w189.data() /* b */, /*output=*/v61.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #60" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op61, - v56.data() /* a */, v61.data() /* b */, /*output=*/v62.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #61" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op62, - workspace.data(), /*input=*/v62.data(), /*output=*/v63.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #62" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op63, - workspace.data(), /*input=*/v63.data(), /*output=*/v64.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #63" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op64, - /*input=*/v64.data(), /*output=*/v65.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #64" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op65, - workspace.data(), /*input=*/v65.data(), 
/*output=*/v66.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #65" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op66, - /*input=*/v66.data(), /*output=*/v67.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #66" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op67, - workspace.data(), - /*input=*/v67.data(), /*output=*/v68.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #67" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op68, - workspace.data(), /*input=*/v68.data(), /*output=*/v69.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #68" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op69, - workspace.data(), /*input=*/v69.data(), /*output=*/v70.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #69" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op70, - v70.data() /* a */, w200.data() /* b */, /*output=*/v71.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #70" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op71, - v71.data() /* a */, w201.data() /* b */, /*output=*/v72.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #71" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op72, - v67.data() /* a */, v72.data() /* b */, /*output=*/v73.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #72" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op73, - workspace.data(), /*input=*/v73.data(), /*output=*/v74.data()); - if (status != 
xnn_status_success) { - std::cerr << "failed to setup operation #73" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op74, - v74.data() /* a */, v63.data() /* b */, /*output=*/v75.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #74" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op75, - workspace.data(), /*input=*/v75.data(), /*output=*/v76.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #75" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op76, - /*input=*/v76.data(), /*output=*/v77.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #76" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op77, - workspace.data(), /*input=*/v77.data(), /*output=*/v78.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #77" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op78, - /*input=*/v78.data(), /*output=*/v79.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #78" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op79, - workspace.data(), - /*input=*/v79.data(), /*output=*/v80.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #79" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op80, - workspace.data(), /*input=*/v80.data(), /*output=*/v81.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #80" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op81, - workspace.data(), /*input=*/v81.data(), /*output=*/v82.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup 
operation #81" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op82, - v82.data() /* a */, w212.data() /* b */, /*output=*/v83.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #82" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op83, - v83.data() /* a */, w213.data() /* b */, /*output=*/v84.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #83" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op84, - v79.data() /* a */, v84.data() /* b */, /*output=*/v85.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #84" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op85, - workspace.data(), /*input=*/v85.data(), /*output=*/v86.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #85" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op86, - workspace.data(), /*input=*/v86.data(), /*output=*/v87.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #86" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op87, - /*input=*/v87.data(), /*output=*/v88.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #87" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op88, - workspace.data(), /*input=*/v88.data(), /*output=*/v89.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #88" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op89, - /*input=*/v89.data(), /*output=*/v90.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #89" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_global_average_pooling_nwc_qu8( - op90, - workspace.data(), - /*input=*/v90.data(), /*output=*/v91.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #90" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op91, - workspace.data(), /*input=*/v91.data(), /*output=*/v92.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #91" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op92, - workspace.data(), /*input=*/v92.data(), /*output=*/v93.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #92" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op93, - v93.data() /* a */, w224.data() /* b */, /*output=*/v94.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #93" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op94, - v94.data() /* a */, w225.data() /* b */, /*output=*/v95.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #94" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op95, - v90.data() /* a */, v95.data() /* b */, /*output=*/v96.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #95" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op96, - workspace.data(), /*input=*/v96.data(), /*output=*/v97.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #96" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op97, - v97.data() /* a */, v86.data() /* b */, /*output=*/v98.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #97" << std::endl; - return ExecutionPlan(); - } - - status = 
xnn_setup_convolution2d_nhwc_qu8( - op98, - workspace.data(), /*input=*/v98.data(), /*output=*/v99.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #98" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op99, - /*input=*/v99.data(), /*output=*/v100.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #99" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op100, - workspace.data(), /*input=*/v100.data(), /*output=*/v101.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #100" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op101, - /*input=*/v101.data(), /*output=*/v102.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #101" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op102, - workspace.data(), - /*input=*/v102.data(), /*output=*/v103.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #102" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op103, - workspace.data(), /*input=*/v103.data(), /*output=*/v104.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #103" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op104, - workspace.data(), /*input=*/v104.data(), /*output=*/v105.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #104" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op105, - v105.data() /* a */, w236.data() /* b */, /*output=*/v106.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #105" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op106, 
- v106.data() /* a */, w237.data() /* b */, /*output=*/v107.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #106" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_multiply_nd_qu8( - op107, - v102.data() /* a */, v107.data() /* b */, /*output=*/v108.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #107" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op108, - workspace.data(), /*input=*/v108.data(), /*output=*/v109.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #108" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_add_nd_qu8( - op109, - v109.data() /* a */, v98.data() /* b */, /*output=*/v110.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #109" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op110, - workspace.data(), /*input=*/v110.data(), /*output=*/v111.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #110" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op111, - /*input=*/v111.data(), /*output=*/v112.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #111" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op112, - workspace.data(), - /*input=*/v112.data(), /*output=*/v113.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #112" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op113, - workspace.data(), /*input=*/v113.data(), /*output=*/v114.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #113" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op114, - 
/*input=*/v114.data(), /*output=*/v115.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #114" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_global_average_pooling_nwc_qu8( - op115, - workspace.data(), - /*input=*/v115.data(), /*output=*/v116.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #115" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_convolution2d_nhwc_qu8( - op116, - workspace.data(), /*input=*/v116.data(), /*output=*/v117.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #116" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_copy_nc_x8( - op117, - /*input=*/v117.data(), /*output=*/v118.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #117" << std::endl; - return ExecutionPlan(); - } - - status = xnn_setup_softmax_nc_qu8( - op118, - /*input=*/v118.data(), /*output=*/v119.data()); - if (status != xnn_status_success) { - std::cerr << "failed to setup operation #118" << std::endl; - return ExecutionPlan(); - } - - XNN_PRAGMA_CLANG("clang diagnostic push") - XNN_PRAGMA_CLANG("clang diagnostic ignored \"-Wpessimizing-move\"") - return ExecutionPlan{operators, workspace}; - XNN_PRAGMA_CLANG("clang diagnostic pop") -} - -} // namespace models diff --git a/src/configs/dwconv-config.c b/src/configs/dwconv-config.c index 539c3874915..076c85a9ee3 100644 --- a/src/configs/dwconv-config.c +++ b/src/configs/dwconv-config.c @@ -1251,7 +1251,7 @@ static void init_qu8_dwconv_config(void) { #endif } -struct xnn_dwconv_config* xnn_init_f16_dwconv_config() { +const struct xnn_dwconv_config* xnn_init_f16_dwconv_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) { return NULL; @@ -1260,7 +1260,7 @@ struct xnn_dwconv_config* 
xnn_init_f16_dwconv_config() { return f16_dwconv_config; } -struct xnn_dwconv_config* xnn_init_f32_dwconv_config() { +const struct xnn_dwconv_config* xnn_init_f32_dwconv_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { return NULL; @@ -1269,7 +1269,7 @@ struct xnn_dwconv_config* xnn_init_f32_dwconv_config() { return f32_dwconv_config; } -struct xnn_dwconv_config* xnn_init_qs8_qc8w_dwconv_config() { +const struct xnn_dwconv_config* xnn_init_qs8_qc8w_dwconv_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { return NULL; @@ -1278,7 +1278,7 @@ struct xnn_dwconv_config* xnn_init_qs8_qc8w_dwconv_config() { return qs8_qc8w_dwconv_config; } -struct xnn_dwconv_config* xnn_init_qs8_dwconv_config() { +const struct xnn_dwconv_config* xnn_init_qs8_dwconv_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { return NULL; @@ -1287,7 +1287,7 @@ struct xnn_dwconv_config* xnn_init_qs8_dwconv_config() { return qs8_dwconv_config; } -struct xnn_dwconv_config* xnn_init_qu8_dwconv_config() { +const struct xnn_dwconv_config* xnn_init_qu8_dwconv_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { return NULL; diff --git a/src/configs/gemm-config.c b/src/configs/gemm-config.c index 7e6599b7a91..6561a32b260 100644 --- a/src/configs/gemm-config.c +++ b/src/configs/gemm-config.c @@ -3767,7 +3767,7 @@ static void init_qu8_gemm_config(void) { #endif } -struct xnn_gemm_config* xnn_init_f16_gemm_config() { +const struct xnn_gemm_config* xnn_init_f16_gemm_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) { return NULL; @@ -3776,7 +3776,7 @@ struct xnn_gemm_config* xnn_init_f16_gemm_config() { return 
&f16_gemm_config; } -struct xnn_gemm_config* xnn_init_f32_gemm_config() { +const struct xnn_gemm_config* xnn_init_f32_gemm_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { return NULL; @@ -3785,7 +3785,7 @@ struct xnn_gemm_config* xnn_init_f32_gemm_config() { return &f32_gemm_config; } -struct xnn_gemm_config* xnn_init_f32_gemm_nr2_config() { +const struct xnn_gemm_config* xnn_init_f32_gemm_nr2_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { return NULL; @@ -3794,7 +3794,7 @@ struct xnn_gemm_config* xnn_init_f32_gemm_nr2_config() { return &f32_gemm_nr2_config; } -struct xnn_gemm_config* xnn_init_f32_qc4w_gemm_config() { +const struct xnn_gemm_config* xnn_init_f32_qc4w_gemm_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { return NULL; @@ -3803,7 +3803,7 @@ struct xnn_gemm_config* xnn_init_f32_qc4w_gemm_config() { return &f32_qc4w_gemm_config; } -struct xnn_gemm_config* xnn_init_f32_qc8w_gemm_config() { +const struct xnn_gemm_config* xnn_init_f32_qc8w_gemm_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { return NULL; @@ -3812,7 +3812,7 @@ struct xnn_gemm_config* xnn_init_f32_qc8w_gemm_config() { return &f32_qc8w_gemm_config; } -struct xnn_gemm_config* xnn_init_qd8_f16_qc8w_gemm_config() { +const struct xnn_gemm_config* xnn_init_qd8_f16_qc8w_gemm_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) { return NULL; @@ -3821,7 +3821,7 @@ struct xnn_gemm_config* xnn_init_qd8_f16_qc8w_gemm_config() { return &qd8_f16_qc8w_gemm_config; } -struct xnn_gemm_config* xnn_init_qd8_f16_qc4w_gemm_config() { +const struct xnn_gemm_config* xnn_init_qd8_f16_qc4w_gemm_config() { const 
struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) { return NULL; @@ -3830,7 +3830,7 @@ struct xnn_gemm_config* xnn_init_qd8_f16_qc4w_gemm_config() { return &qd8_f16_qc4w_gemm_config; } -struct xnn_gemm_config* xnn_init_qd8_f16_qb4w_gemm_config() { +const struct xnn_gemm_config* xnn_init_qd8_f16_qb4w_gemm_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) { return NULL; @@ -3839,7 +3839,7 @@ struct xnn_gemm_config* xnn_init_qd8_f16_qb4w_gemm_config() { return &qd8_f16_qb4w_gemm_config; } -struct xnn_gemm_config* xnn_init_qd8_f32_qc4w_gemm_config() { +const struct xnn_gemm_config* xnn_init_qd8_f32_qc4w_gemm_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { return NULL; @@ -3848,7 +3848,7 @@ struct xnn_gemm_config* xnn_init_qd8_f32_qc4w_gemm_config() { return &qd8_f32_qc4w_gemm_config; } -struct xnn_gemm_config* xnn_init_qd8_f32_qb4w_gemm_config() { +const struct xnn_gemm_config* xnn_init_qd8_f32_qb4w_gemm_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { return NULL; @@ -3857,7 +3857,7 @@ struct xnn_gemm_config* xnn_init_qd8_f32_qb4w_gemm_config() { return &qd8_f32_qb4w_gemm_config; } -struct xnn_gemm_config* xnn_init_qd8_f32_qc8w_gemm_config() { +const struct xnn_gemm_config* xnn_init_qd8_f32_qc8w_gemm_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { return NULL; @@ -3866,7 +3866,7 @@ struct xnn_gemm_config* xnn_init_qd8_f32_qc8w_gemm_config() { return &qd8_f32_qc8w_gemm_config; } -struct xnn_gemm_config* xnn_init_qp8_f32_qc4w_gemm_config() { +const struct xnn_gemm_config* xnn_init_qp8_f32_qc4w_gemm_config() { const struct 
xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { @@ -3880,7 +3880,7 @@ XNN_INIT_ONCE(qp8_f32_qc4w_gemm); return NULL; } -struct xnn_gemm_config* xnn_init_qs8_qc8w_gemm_config() { +const struct xnn_gemm_config* xnn_init_qs8_qc8w_gemm_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { return NULL; @@ -3889,7 +3889,7 @@ struct xnn_gemm_config* xnn_init_qs8_qc8w_gemm_config() { return &qs8_qc8w_gemm_config; } -struct xnn_gemm_config* xnn_init_qu8_gemm_config() { +const struct xnn_gemm_config* xnn_init_qu8_gemm_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { return NULL; diff --git a/src/xnnpack/config.h b/src/xnnpack/config.h index 9dfa5a1e927..21efac46990 100644 --- a/src/xnnpack/config.h +++ b/src/xnnpack/config.h @@ -137,11 +137,11 @@ XNN_INTERNAL const struct xnn_gavgpool_cw_config* xnn_init_f32_gavgpool_cw_confi #define XNN_MAX_QS8_DWCONV_UKERNELS 2 #define XNN_MAX_QU8_DWCONV_UKERNELS 2 -XNN_INTERNAL struct xnn_dwconv_config* xnn_init_f16_dwconv_config(); -XNN_INTERNAL struct xnn_dwconv_config* xnn_init_f32_dwconv_config(); -XNN_INTERNAL struct xnn_dwconv_config* xnn_init_qs8_qc8w_dwconv_config(); -XNN_INTERNAL struct xnn_dwconv_config* xnn_init_qs8_dwconv_config(); -XNN_INTERNAL struct xnn_dwconv_config* xnn_init_qu8_dwconv_config(); +XNN_INTERNAL const struct xnn_dwconv_config* xnn_init_f16_dwconv_config(); +XNN_INTERNAL const struct xnn_dwconv_config* xnn_init_f32_dwconv_config(); +XNN_INTERNAL const struct xnn_dwconv_config* xnn_init_qs8_qc8w_dwconv_config(); +XNN_INTERNAL const struct xnn_dwconv_config* xnn_init_qs8_dwconv_config(); +XNN_INTERNAL const struct xnn_dwconv_config* xnn_init_qu8_dwconv_config(); // Bilinear interpolation (2D). 
XNN_INTERNAL const struct xnn_ibilinear_config* xnn_init_f16_ibilinear_config(); @@ -234,20 +234,20 @@ static inline bool xnn_is_hmp_igemm_ukernel(struct xnn_hmp_igemm_ukernel ukernel #endif } -XNN_INTERNAL struct xnn_gemm_config* xnn_init_f16_gemm_config(); -XNN_INTERNAL struct xnn_gemm_config* xnn_init_f32_gemm_config(); -XNN_INTERNAL struct xnn_gemm_config* xnn_init_f32_gemm_nr2_config(); -XNN_INTERNAL struct xnn_gemm_config* xnn_init_f32_qc8w_gemm_config(); -XNN_INTERNAL struct xnn_gemm_config* xnn_init_f32_qc4w_gemm_config(); -XNN_INTERNAL struct xnn_gemm_config* xnn_init_qd8_f16_qb4w_gemm_config(); -XNN_INTERNAL struct xnn_gemm_config* xnn_init_qd8_f16_qc4w_gemm_config(); -XNN_INTERNAL struct xnn_gemm_config* xnn_init_qd8_f16_qc8w_gemm_config(); -XNN_INTERNAL struct xnn_gemm_config* xnn_init_qd8_f32_qb4w_gemm_config(); -XNN_INTERNAL struct xnn_gemm_config* xnn_init_qd8_f32_qc4w_gemm_config(); -XNN_INTERNAL struct xnn_gemm_config* xnn_init_qd8_f32_qc8w_gemm_config(); -XNN_INTERNAL struct xnn_gemm_config* xnn_init_qp8_f32_qc4w_gemm_config(); -XNN_INTERNAL struct xnn_gemm_config* xnn_init_qs8_qc8w_gemm_config(); -XNN_INTERNAL struct xnn_gemm_config* xnn_init_qu8_gemm_config(); +XNN_INTERNAL const struct xnn_gemm_config* xnn_init_f16_gemm_config(); +XNN_INTERNAL const struct xnn_gemm_config* xnn_init_f32_gemm_config(); +XNN_INTERNAL const struct xnn_gemm_config* xnn_init_f32_gemm_nr2_config(); +XNN_INTERNAL const struct xnn_gemm_config* xnn_init_f32_qc8w_gemm_config(); +XNN_INTERNAL const struct xnn_gemm_config* xnn_init_f32_qc4w_gemm_config(); +XNN_INTERNAL const struct xnn_gemm_config* xnn_init_qd8_f16_qb4w_gemm_config(); +XNN_INTERNAL const struct xnn_gemm_config* xnn_init_qd8_f16_qc4w_gemm_config(); +XNN_INTERNAL const struct xnn_gemm_config* xnn_init_qd8_f16_qc8w_gemm_config(); +XNN_INTERNAL const struct xnn_gemm_config* xnn_init_qd8_f32_qb4w_gemm_config(); +XNN_INTERNAL const struct xnn_gemm_config* xnn_init_qd8_f32_qc4w_gemm_config(); +XNN_INTERNAL const 
struct xnn_gemm_config* xnn_init_qd8_f32_qc8w_gemm_config(); +XNN_INTERNAL const struct xnn_gemm_config* xnn_init_qp8_f32_qc4w_gemm_config(); +XNN_INTERNAL const struct xnn_gemm_config* xnn_init_qs8_qc8w_gemm_config(); +XNN_INTERNAL const struct xnn_gemm_config* xnn_init_qu8_gemm_config(); XNN_INTERNAL const struct xnn_maxpool_config* xnn_init_f16_maxpool_config(); XNN_INTERNAL const struct xnn_maxpool_config* xnn_init_f32_maxpool_config(); diff --git a/src/xnnpack/models.h b/src/xnnpack/models.h deleted file mode 100644 index 8242e4e3f29..00000000000 --- a/src/xnnpack/models.h +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#pragma once - -#include -#include -#include - -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/common.h" - -// align a size up to XNN_EXTRA_BYTES -#define XNN_PAD_EXTRA_BYTES(s, t) (((s) + XNN_EXTRA_BYTES / sizeof(t) - 1) & ~(XNN_EXTRA_BYTES / sizeof(t) - 1)) - -namespace models { - -typedef std::vector> Operators; -typedef std::vector> Workspace; - -// Helper class for holding a list of operators and associated workspace. -// Workspace needs to live as long as the operators. -class ExecutionPlan { - public: - ExecutionPlan() = default; - // Takes ownership of operators and workspace. 
- ExecutionPlan(Operators& operators, Workspace& workspace) - : operators_(std::move(operators)), workspace_(std::move(workspace)) {} - - bool empty() const { - return operators_.empty(); - } - Operators::iterator begin() { return operators_.begin(); } - Operators::iterator end() { return operators_.end(); } - - private: - Operators operators_; - std::vector> workspace_; -}; - -typedef ExecutionPlan (*ExecutionPlanFactory)(pthreadpool_t threadpool); - -ExecutionPlan FP32MobileNetV1(pthreadpool_t threadpool); -ExecutionPlan FP32MobileNetV2(pthreadpool_t threadpool); -ExecutionPlan FP32MobileNetV3Large(pthreadpool_t threadpool); -ExecutionPlan FP32MobileNetV3Small(pthreadpool_t threadpool); - -ExecutionPlan FP32SparseMobileNetV1(float sparsity, pthreadpool_t threadpool); -ExecutionPlan FP32SparseMobileNetV2(float sparsity, pthreadpool_t threadpool); -ExecutionPlan FP32SparseMobileNetV3Large(float sparsity, pthreadpool_t threadpool); -ExecutionPlan FP32SparseMobileNetV3Small(float sparsity, pthreadpool_t threadpool); - -ExecutionPlan FP16MobileNetV1(pthreadpool_t threadpool); -ExecutionPlan FP16MobileNetV2(pthreadpool_t threadpool); -ExecutionPlan FP16MobileNetV3Large(pthreadpool_t threadpool); -ExecutionPlan FP16MobileNetV3Small(pthreadpool_t threadpool); - -ExecutionPlan FP16SparseMobileNetV1(float sparsity, pthreadpool_t threadpool); -ExecutionPlan FP16SparseMobileNetV2(float sparsity, pthreadpool_t threadpool); -ExecutionPlan FP16SparseMobileNetV3Large(float sparsity, pthreadpool_t threadpool); -ExecutionPlan FP16SparseMobileNetV3Small(float sparsity, pthreadpool_t threadpool); - -ExecutionPlan QC8MobileNetV1(pthreadpool_t threadpool); -ExecutionPlan QC8MobileNetV2(pthreadpool_t threadpool); - -ExecutionPlan QS8MobileNetV1(pthreadpool_t threadpool); -ExecutionPlan QS8MobileNetV2(pthreadpool_t threadpool); - -ExecutionPlan QU8MobileNetV1(pthreadpool_t threadpool); -ExecutionPlan QU8MobileNetV2(pthreadpool_t threadpool); -ExecutionPlan 
QU8MobileNetV3Large(pthreadpool_t threadpool); -ExecutionPlan QU8MobileNetV3Small(pthreadpool_t threadpool); - -} // namespace models From 8d72126cd64e5c6fe4f18cd6b0461381282335e0 Mon Sep 17 00:00:00 2001 From: Alan Kelly Date: Mon, 23 Sep 2024 05:41:49 -0700 Subject: [PATCH 25/50] Blast fc op with clang-format PiperOrigin-RevId: 677754238 --- src/subgraph/fully-connected.c | 992 +++++++++++++++------------------ 1 file changed, 442 insertions(+), 550 deletions(-) diff --git a/src/subgraph/fully-connected.c b/src/subgraph/fully-connected.c index 12dcd8733b8..48a6f24e5a3 100644 --- a/src/subgraph/fully-connected.c +++ b/src/subgraph/fully-connected.c @@ -22,13 +22,9 @@ #include "pthreadpool.h" static enum xnn_status create_fully_connected_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ + const struct xnn_node* node, const struct xnn_value* values, + size_t num_values, struct xnn_operator_data* opdata, + struct xnn_code_cache* code_cache, xnn_weights_cache_t weights_cache) { assert(node->num_inputs >= 2); assert(node->num_inputs <= 3); const uint32_t input_id = node->inputs[0]; @@ -52,7 +48,9 @@ static enum xnn_status create_fully_connected_operator( input_channels = values[node->inputs[1]].shape.dim[1]; } - const void* kernel_data = values[filter_id].fp32_data != NULL ? values[filter_id].fp32_data : values[filter_id].data; + const void* kernel_data = values[filter_id].fp32_data != NULL + ? values[filter_id].fp32_data + : values[filter_id].data; bool has_non_static_weights = (kernel_data == NULL); const void* bias_data = NULL; @@ -61,7 +59,8 @@ static enum xnn_status create_fully_connected_operator( assert(bias_id != XNN_INVALID_VALUE_ID); assert(bias_id < num_values); - bias_data = values[bias_id].fp32_data != NULL ? 
values[bias_id].fp32_data : values[bias_id].data; + bias_data = values[bias_id].fp32_data != NULL ? values[bias_id].fp32_data + : values[bias_id].data; has_non_static_weights |= (bias_data == NULL); } @@ -75,99 +74,66 @@ static enum xnn_status create_fully_connected_operator( case xnn_datatype_fp16: if (has_non_static_weights) { status = xnn_create_dynamic_fully_connected_nc_f16( - node->activation.output_min, - node->activation.output_max, - /*flags=*/node->flags, - &opdata->operator_objects[0]); + node->activation.output_min, node->activation.output_max, + /*flags=*/node->flags, &opdata->operator_objects[0]); } else { status = xnn_create_fully_connected_nc_f16( - input_channels, - output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - kernel_data, - bias_data, - node->activation.output_min, - node->activation.output_max, - node->flags, - code_cache, - weights_cache, - &opdata->operator_objects[0]); + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, kernel_data, bias_data, + node->activation.output_min, node->activation.output_max, + node->flags, code_cache, weights_cache, + &opdata->operator_objects[0]); } break; case xnn_datatype_fp32: if (has_non_static_weights) { status = xnn_create_dynamic_fully_connected_nc_f16( - node->activation.output_min, - node->activation.output_max, - node->flags | XNN_FLAG_FP32_STATIC_WEIGHTS, - &opdata->operator_objects[0]); + node->activation.output_min, node->activation.output_max, + node->flags | XNN_FLAG_FP32_STATIC_WEIGHTS, + &opdata->operator_objects[0]); } else { status = xnn_create_fully_connected_nc_f16( - input_channels, - output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - kernel_data, - bias_data, - node->activation.output_min, - node->activation.output_max, - node->flags | XNN_FLAG_FP32_STATIC_WEIGHTS, - code_cache, - weights_cache, - &opdata->operator_objects[0]); + input_channels, 
output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, kernel_data, bias_data, + node->activation.output_min, node->activation.output_max, + node->flags | XNN_FLAG_FP32_STATIC_WEIGHTS, code_cache, + weights_cache, &opdata->operator_objects[0]); } break; case xnn_datatype_qcint4: status = xnn_create_fully_connected_nc_qd8_f16_qc4w( - input_channels, - output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - /*kernel_zero_point=*/values[filter_id].quantization.zero_point, - values[filter_id].quantization.channelwise_scale, - kernel_data, - bias_data, - node->activation.output_min, - node->activation.output_max, - node->flags, - code_cache, - weights_cache, - &opdata->operator_objects[0]); + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + /*kernel_zero_point=*/values[filter_id].quantization.zero_point, + values[filter_id].quantization.channelwise_scale, kernel_data, + bias_data, node->activation.output_min, + node->activation.output_max, node->flags, code_cache, + weights_cache, &opdata->operator_objects[0]); break; case xnn_datatype_qbint4: status = xnn_create_fully_connected_nc_qd8_f16_qb4w( - input_channels, - output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - /*block_size=*/values[filter_id].quantization.block_size, - /*kernel_zero_point=*/values[filter_id].quantization.zero_point, - (const uint16_t*) values[filter_id].quantization.blockwise_scale, - kernel_data, - bias_data, - node->activation.output_min, - node->activation.output_max, - node->flags, - code_cache, - weights_cache, - &opdata->operator_objects[0]); + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + /*block_size=*/values[filter_id].quantization.block_size, + /*kernel_zero_point=*/values[filter_id].quantization.zero_point, + (const 
uint16_t*)values[filter_id].quantization.blockwise_scale, + kernel_data, bias_data, node->activation.output_min, + node->activation.output_max, node->flags, code_cache, + weights_cache, &opdata->operator_objects[0]); break; case xnn_datatype_qcint8: status = xnn_create_fully_connected_nc_qd8_f16_qc8w( - input_channels, - output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - values[filter_id].quantization.channelwise_scale, - kernel_data, - bias_data, - node->activation.output_min, - node->activation.output_max, - node->flags, - code_cache, - weights_cache, - &opdata->operator_objects[0]); + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + values[filter_id].quantization.channelwise_scale, kernel_data, + bias_data, node->activation.output_min, + node->activation.output_max, node->flags, code_cache, + weights_cache, &opdata->operator_objects[0]); break; default: XNN_UNREACHABLE; @@ -178,86 +144,63 @@ static enum xnn_status create_fully_connected_operator( case xnn_datatype_fp32: if (has_non_static_weights) { status = xnn_create_dynamic_fully_connected_nc_f32( - node->activation.output_min, - node->activation.output_max, - /*flags=*/node->flags, - &opdata->operator_objects[0]); + node->activation.output_min, node->activation.output_max, + /*flags=*/node->flags, &opdata->operator_objects[0]); } else { status = xnn_create_fully_connected_nc_f32( - input_channels, - output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - kernel_data, - bias_data, - node->activation.output_min, - node->activation.output_max, - /*flags=*/node->flags, - code_cache, - weights_cache, - &opdata->operator_objects[0]); + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, kernel_data, bias_data, + node->activation.output_min, node->activation.output_max, + /*flags=*/node->flags, code_cache, weights_cache, + 
&opdata->operator_objects[0]); } break; case xnn_datatype_qbint4: status = xnn_create_fully_connected_nc_qd8_f32_qb4w( - input_channels, - output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - /*block_size=*/values[filter_id].quantization.block_size, - /*kernel_zero_point=*/values[filter_id].quantization.zero_point, - (const uint16_t*) values[filter_id].quantization.blockwise_scale, - kernel_data, - bias_data, - node->activation.output_min, - node->activation.output_max, - node->flags, - code_cache, - weights_cache, - &opdata->operator_objects[0]); + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + /*block_size=*/values[filter_id].quantization.block_size, + /*kernel_zero_point=*/values[filter_id].quantization.zero_point, + (const uint16_t*)values[filter_id].quantization.blockwise_scale, + kernel_data, bias_data, node->activation.output_min, + node->activation.output_max, node->flags, code_cache, + weights_cache, &opdata->operator_objects[0]); break; case xnn_datatype_qcint4: switch (input_datatype) { case xnn_datatype_fp32: status = xnn_create_fully_connected_nc_f32_qc4w( - input_channels, - output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - values[filter_id].quantization.zero_point, - values[filter_id].quantization.channelwise_scale, - kernel_data, - bias_data, - node->activation.output_min, - node->activation.output_max, - /*flags=*/node->flags, - code_cache, - weights_cache, - &opdata->operator_objects[0]); + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + values[filter_id].quantization.zero_point, + values[filter_id].quantization.channelwise_scale, kernel_data, + bias_data, node->activation.output_min, + node->activation.output_max, + /*flags=*/node->flags, code_cache, weights_cache, + &opdata->operator_objects[0]); break; case xnn_datatype_qdint8: status = 
xnn_create_fully_connected_nc_qd8_f32_qc4w( - input_channels, - output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - /*kernel_zero_point=*/values[filter_id].quantization.zero_point, - values[filter_id].quantization.channelwise_scale, - kernel_data, - bias_data, - node->activation.output_min, - node->activation.output_max, - node->flags, - code_cache, - weights_cache, - &opdata->operator_objects[0]); + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + /*kernel_zero_point=*/ + values[filter_id].quantization.zero_point, + values[filter_id].quantization.channelwise_scale, kernel_data, + bias_data, node->activation.output_min, + node->activation.output_max, node->flags, code_cache, + weights_cache, &opdata->operator_objects[0]); break; case xnn_datatype_qpint8: status = xnn_create_fully_connected_nc_qp8_f32_qc4w( input_channels, output_channels, /*input_stride=*/input_channels, /*output_stride=*/output_channels, - /*kernel_zero_point=*/values[filter_id].quantization.zero_point, + /*kernel_zero_point=*/ + values[filter_id].quantization.zero_point, values[filter_id].quantization.channelwise_scale, kernel_data, bias_data, node->activation.output_min, node->activation.output_max, node->flags, code_cache, @@ -271,42 +214,31 @@ static enum xnn_status create_fully_connected_operator( switch (input_datatype) { case xnn_datatype_fp32: status = xnn_create_fully_connected_nc_f32_qc8w( - input_channels, - output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - values[filter_id].quantization.channelwise_scale, - kernel_data, - bias_data, - node->activation.output_min, - node->activation.output_max, - /*flags=*/node->flags, - code_cache, - weights_cache, - &opdata->operator_objects[0]); + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + values[filter_id].quantization.channelwise_scale, kernel_data, 
+ bias_data, node->activation.output_min, + node->activation.output_max, + /*flags=*/node->flags, code_cache, weights_cache, + &opdata->operator_objects[0]); break; - case xnn_datatype_qdint8: - status = xnn_create_fully_connected_nc_qd8_f32_qc8w( - input_channels, - output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - values[filter_id].quantization.channelwise_scale, - kernel_data, - bias_data, - node->activation.output_min, - node->activation.output_max, - node->flags, - code_cache, - weights_cache, - &opdata->operator_objects[0]); - break; - default: - XNN_UNREACHABLE; + case xnn_datatype_qdint8: + status = xnn_create_fully_connected_nc_qd8_f32_qc8w( + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + values[filter_id].quantization.channelwise_scale, kernel_data, + bias_data, node->activation.output_min, + node->activation.output_max, node->flags, code_cache, + weights_cache, &opdata->operator_objects[0]); + break; + default: + XNN_UNREACHABLE; } - break; - default: - XNN_UNREACHABLE; + break; + default: + XNN_UNREACHABLE; } break; case xnn_datatype_qint8: @@ -316,81 +248,71 @@ static enum xnn_status create_fully_connected_operator( assert(kernel_data != NULL); assert(values[filter_id].datatype == xnn_datatype_qcint8); const float output_scale = values[output_id].quantization.scale; - const int32_t output_zero_point = values[output_id].quantization.zero_point; - const int8_t output_min = xnn_qs8_quantize(node->activation.output_min, output_scale, output_zero_point); - const int8_t output_max = xnn_qs8_quantize(node->activation.output_max, output_scale, output_zero_point); + const int32_t output_zero_point = + values[output_id].quantization.zero_point; + const int8_t output_min = xnn_qs8_quantize( + node->activation.output_min, output_scale, output_zero_point); + const int8_t output_max = xnn_qs8_quantize( + node->activation.output_max, output_scale, output_zero_point); 
status = xnn_create_fully_connected_nc_qs8_qc8w( - input_channels, - output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - (int8_t) values[input_id].quantization.zero_point, - values[input_id].quantization.scale, - values[filter_id].quantization.channelwise_scale, - kernel_data, - bias_data, - (int8_t) output_zero_point, - output_scale, output_min, output_max, - /*flags=*/node->flags, - code_cache, - weights_cache, - &opdata->operator_objects[0]); + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + (int8_t)values[input_id].quantization.zero_point, + values[input_id].quantization.scale, + values[filter_id].quantization.channelwise_scale, kernel_data, + bias_data, (int8_t)output_zero_point, output_scale, output_min, + output_max, + /*flags=*/node->flags, code_cache, weights_cache, + &opdata->operator_objects[0]); break; - case xnn_datatype_qint8: - { + case xnn_datatype_qint8: { assert(!has_non_static_weights); assert(kernel_data != NULL); const float output_scale = values[output_id].quantization.scale; - const int32_t output_zero_point = values[output_id].quantization.zero_point; - const int8_t output_min = xnn_qs8_quantize(node->activation.output_min, output_scale, output_zero_point); - const int8_t output_max = xnn_qs8_quantize(node->activation.output_max, output_scale, output_zero_point); + const int32_t output_zero_point = + values[output_id].quantization.zero_point; + const int8_t output_min = xnn_qs8_quantize( + node->activation.output_min, output_scale, output_zero_point); + const int8_t output_max = xnn_qs8_quantize( + node->activation.output_max, output_scale, output_zero_point); status = xnn_create_fully_connected_nc_qs8( - input_channels, - output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - (int8_t) values[input_id].quantization.zero_point, - values[input_id].quantization.scale, - values[filter_id].quantization.scale, - 
kernel_data, - bias_data, - (int8_t) output_zero_point, - output_scale, output_min, output_max, - /*flags=*/node->flags, - code_cache, - weights_cache, - &opdata->operator_objects[0]); + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + (int8_t)values[input_id].quantization.zero_point, + values[input_id].quantization.scale, + values[filter_id].quantization.scale, kernel_data, bias_data, + (int8_t)output_zero_point, output_scale, output_min, output_max, + /*flags=*/node->flags, code_cache, weights_cache, + &opdata->operator_objects[0]); break; } default: XNN_UNREACHABLE; } break; - case xnn_datatype_quint8: - { + case xnn_datatype_quint8: { assert(!has_non_static_weights); assert(kernel_data != NULL); const float output_scale = values[output_id].quantization.scale; - const int32_t output_zero_point = values[output_id].quantization.zero_point; - const uint8_t output_min = xnn_qu8_quantize(node->activation.output_min, output_scale, output_zero_point); - const uint8_t output_max = xnn_qu8_quantize(node->activation.output_max, output_scale, output_zero_point); + const int32_t output_zero_point = + values[output_id].quantization.zero_point; + const uint8_t output_min = xnn_qu8_quantize( + node->activation.output_min, output_scale, output_zero_point); + const uint8_t output_max = xnn_qu8_quantize( + node->activation.output_max, output_scale, output_zero_point); status = xnn_create_fully_connected_nc_qu8( - input_channels, - output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - (uint8_t) values[input_id].quantization.zero_point, - values[input_id].quantization.scale, - (uint8_t) values[filter_id].quantization.zero_point, - values[filter_id].quantization.scale, - kernel_data, - bias_data, - (uint8_t) output_zero_point, - output_scale, output_min, output_max, - /*flags=*/node->flags, - code_cache, - weights_cache, - &opdata->operator_objects[0]); + input_channels, output_channels, + 
/*input_stride=*/input_channels, + /*output_stride=*/output_channels, + (uint8_t)values[input_id].quantization.zero_point, + values[input_id].quantization.scale, + (uint8_t)values[filter_id].quantization.zero_point, + values[filter_id].quantization.scale, kernel_data, bias_data, + (uint8_t)output_zero_point, output_scale, output_min, output_max, + /*flags=*/node->flags, code_cache, weights_cache, + &opdata->operator_objects[0]); break; } default: @@ -400,17 +322,13 @@ static enum xnn_status create_fully_connected_operator( } enum xnn_status resize_fully_connected_output_tensor( - const struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - size_t old_workspace_size, - pthreadpool_t threadpool) -{ + const struct xnn_operator_data* opdata, struct xnn_value* values, + size_t num_values, size_t old_workspace_size, pthreadpool_t threadpool) { const uint32_t filter_id = opdata->inputs[1]; const struct xnn_value* filter = &values[filter_id]; const uint32_t output_id = opdata->outputs[0]; - struct xnn_value* output = (struct xnn_value*) &values[output_id]; + struct xnn_value* output = (struct xnn_value*)&values[output_id]; const uint32_t input_id = opdata->inputs[0]; const struct xnn_value* input = &values[input_id]; @@ -422,16 +340,20 @@ enum xnn_status resize_fully_connected_output_tensor( output->shape.num_dims = input->shape.num_dims; } // Infer output channels. - const uint32_t filter_output_channel_index = (opdata->flags & XNN_FLAG_TRANSPOSE_WEIGHTS) ? 1 : 0; - output->shape.dim[output->shape.num_dims - 1] = filter->shape.dim[filter_output_channel_index]; + const uint32_t filter_output_channel_index = + (opdata->flags & XNN_FLAG_TRANSPOSE_WEIGHTS) ? 1 : 0; + output->shape.dim[output->shape.num_dims - 1] = + filter->shape.dim[filter_output_channel_index]; if (reshape_2d) { - const uint32_t filter_input_channel_index = (opdata->flags & XNN_FLAG_TRANSPOSE_WEIGHTS) ? 
0 : 1; - const size_t num_input_elements = xnn_shape_multiply_all_dims(&input->shape); + const uint32_t filter_input_channel_index = + (opdata->flags & XNN_FLAG_TRANSPOSE_WEIGHTS) ? 0 : 1; + const size_t num_input_elements = + xnn_shape_multiply_all_dims(&input->shape); // propogate the input shape to output. - output->shape.dim[0] = num_input_elements / filter->shape.dim[filter_input_channel_index]; - } - else { + output->shape.dim[0] = + num_input_elements / filter->shape.dim[filter_input_channel_index]; + } else { // Propagate input shape to output. for (size_t cur_dim = 0; cur_dim < input->shape.num_dims - 1; cur_dim++) { output->shape.dim[cur_dim] = input->shape.dim[cur_dim]; @@ -448,16 +370,14 @@ enum xnn_status resize_fully_connected_output_tensor( } static enum xnn_status reshape_fully_connected_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ + struct xnn_operator_data* opdata, struct xnn_value* values, + size_t num_values, pthreadpool_t threadpool) { const uint32_t input_id = opdata->inputs[0]; assert(input_id < num_values); const uint32_t filter_id = opdata->inputs[1]; assert(filter_id < num_values); - const size_t num_input_elements = xnn_shape_multiply_all_dims(&values[input_id].shape); + const size_t num_input_elements = + xnn_shape_multiply_all_dims(&values[input_id].shape); size_t output_channels, input_channels; if (opdata->flags & XNN_FLAG_TRANSPOSE_WEIGHTS) { input_channels = values[filter_id].shape.dim[0]; @@ -473,105 +393,71 @@ static enum xnn_status reshape_fully_connected_operator( switch (opdata->operator_objects[0]->type) { case xnn_operator_type_dynamic_fully_connected_nc_f16: status = xnn_reshape_dynamic_fully_connected_nc_f16( - opdata->operator_objects[0], - batch_size, - input_channels, output_channels, - input_channels, output_channels, - &opdata->workspace_size, &opdata->workspace_alignment, - threadpool); + opdata->operator_objects[0], batch_size, 
input_channels, + output_channels, input_channels, output_channels, + &opdata->workspace_size, &opdata->workspace_alignment, threadpool); break; case xnn_operator_type_dynamic_fully_connected_nc_f32: status = xnn_reshape_dynamic_fully_connected_nc_f32( - opdata->operator_objects[0], - batch_size, - input_channels, output_channels, - input_channels, output_channels, - &opdata->workspace_size, &opdata->workspace_alignment, - threadpool); + opdata->operator_objects[0], batch_size, input_channels, + output_channels, input_channels, output_channels, + &opdata->workspace_size, &opdata->workspace_alignment, threadpool); break; case xnn_operator_type_fully_connected_nc_f16: - status = xnn_reshape_fully_connected_nc_f16( - opdata->operator_objects[0], - batch_size, - threadpool); + status = xnn_reshape_fully_connected_nc_f16(opdata->operator_objects[0], + batch_size, threadpool); break; case xnn_operator_type_fully_connected_nc_f32: - status = xnn_reshape_fully_connected_nc_f32( - opdata->operator_objects[0], - batch_size, - threadpool); + status = xnn_reshape_fully_connected_nc_f32(opdata->operator_objects[0], + batch_size, threadpool); break; case xnn_operator_type_fully_connected_nc_f32_qc4w: status = xnn_reshape_fully_connected_nc_f32_qc4w( - opdata->operator_objects[0], - batch_size, - threadpool); + opdata->operator_objects[0], batch_size, threadpool); break; case xnn_operator_type_fully_connected_nc_f32_qc8w: status = xnn_reshape_fully_connected_nc_f32_qc8w( - opdata->operator_objects[0], - batch_size, - threadpool); + opdata->operator_objects[0], batch_size, threadpool); break; case xnn_operator_type_fully_connected_nc_qd8_f32_qc4w: status = xnn_reshape_fully_connected_nc_qd8_f32_qc4w( - opdata->operator_objects[0], - batch_size, - threadpool); + opdata->operator_objects[0], batch_size, threadpool); break; case xnn_operator_type_fully_connected_nc_qd8_f16_qc4w: status = xnn_reshape_fully_connected_nc_qd8_f16_qc4w( - opdata->operator_objects[0], - batch_size, - 
threadpool); + opdata->operator_objects[0], batch_size, threadpool); break; case xnn_operator_type_fully_connected_nc_qd8_f16_qb4w: status = xnn_reshape_fully_connected_nc_qd8_f16_qb4w( - opdata->operator_objects[0], - batch_size, - threadpool); + opdata->operator_objects[0], batch_size, threadpool); break; case xnn_operator_type_fully_connected_nc_qd8_f32_qb4w: status = xnn_reshape_fully_connected_nc_qd8_f32_qb4w( - opdata->operator_objects[0], - batch_size, - threadpool); + opdata->operator_objects[0], batch_size, threadpool); break; case xnn_operator_type_fully_connected_nc_qd8_f16_qc8w: status = xnn_reshape_fully_connected_nc_qd8_f16_qc8w( - opdata->operator_objects[0], - batch_size, - threadpool); + opdata->operator_objects[0], batch_size, threadpool); break; case xnn_operator_type_fully_connected_nc_qd8_f32_qc8w: status = xnn_reshape_fully_connected_nc_qd8_f32_qc8w( - opdata->operator_objects[0], - batch_size, - threadpool); + opdata->operator_objects[0], batch_size, threadpool); break; case xnn_operator_type_fully_connected_nc_qp8_f32_qc4w: status = xnn_reshape_fully_connected_nc_qp8_f32_qc4w( - opdata->operator_objects[0], - batch_size, - threadpool); + opdata->operator_objects[0], batch_size, threadpool); break; case xnn_operator_type_fully_connected_nc_qs8: - status = xnn_reshape_fully_connected_nc_qs8( - opdata->operator_objects[0], - batch_size, - threadpool); + status = xnn_reshape_fully_connected_nc_qs8(opdata->operator_objects[0], + batch_size, threadpool); break; case xnn_operator_type_fully_connected_nc_qs8_qc8w: status = xnn_reshape_fully_connected_nc_qs8_qc8w( - opdata->operator_objects[0], - batch_size, - threadpool); + opdata->operator_objects[0], batch_size, threadpool); break; case xnn_operator_type_fully_connected_nc_qu8: - status = xnn_reshape_fully_connected_nc_qu8( - opdata->operator_objects[0], - batch_size, - threadpool); + status = xnn_reshape_fully_connected_nc_qu8(opdata->operator_objects[0], + batch_size, threadpool); break; 
default: XNN_UNREACHABLE; @@ -580,15 +466,13 @@ static enum xnn_status reshape_fully_connected_operator( if (status != xnn_status_success) { return status; } - return resize_fully_connected_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); + return resize_fully_connected_output_tensor(opdata, values, num_values, + old_workspace_size, threadpool); } static enum xnn_status setup_fully_connected_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ + const struct xnn_operator_data* opdata, const struct xnn_value* values, + size_t num_values, pthreadpool_t threadpool) { const uint32_t input_id = opdata->inputs[0]; assert(input_id != XNN_INVALID_VALUE_ID); assert(input_id < num_values); @@ -608,15 +492,20 @@ static enum xnn_status setup_fully_connected_operator( assert(input_data != NULL); const struct xnn_value* kernel_value = values + filter_id; - bool has_dynamic_weights = kernel_value->allocation_type != xnn_allocation_type_static; - const void* kernel_data = kernel_value->allocation_type == xnn_allocation_type_static ? NULL : kernel_value->data; + bool has_dynamic_weights = + kernel_value->allocation_type != xnn_allocation_type_static; + const void* kernel_data = + kernel_value->allocation_type == xnn_allocation_type_static + ? 
NULL + : kernel_value->data; const void* bias_data = NULL; if (opdata->num_inputs > 2) { assert(bias_id != XNN_INVALID_VALUE_ID); assert(bias_id < num_values); const struct xnn_value* bias_value = values + bias_id; - has_dynamic_weights |= bias_value->allocation_type != xnn_allocation_type_static; + has_dynamic_weights |= + bias_value->allocation_type != xnn_allocation_type_static; if (has_dynamic_weights) { kernel_data = kernel_value->data; bias_data = bias_value->data; @@ -631,164 +520,131 @@ static enum xnn_status setup_fully_connected_operator( case xnn_operator_type_dynamic_fully_connected_nc_f16: assert(kernel_data != NULL); return xnn_setup_dynamic_fully_connected_nc_f16( - opdata->operator_objects[0], - opdata->workspace, input_data, kernel_data, bias_data, output_data); + opdata->operator_objects[0], opdata->workspace, input_data, + kernel_data, bias_data, output_data); case xnn_operator_type_dynamic_fully_connected_nc_f32: assert(kernel_data != NULL); return xnn_setup_dynamic_fully_connected_nc_f32( - opdata->operator_objects[0], - opdata->workspace, input_data, kernel_data, bias_data, output_data); + opdata->operator_objects[0], opdata->workspace, input_data, + kernel_data, bias_data, output_data); case xnn_operator_type_fully_connected_nc_f16: assert(kernel_data == NULL); assert(bias_data == NULL); - return xnn_setup_fully_connected_nc_f16( - opdata->operator_objects[0], - input_data, - output_data); + return xnn_setup_fully_connected_nc_f16(opdata->operator_objects[0], + input_data, output_data); case xnn_operator_type_fully_connected_nc_f32: assert(kernel_data == NULL); assert(bias_data == NULL); - return xnn_setup_fully_connected_nc_f32( - opdata->operator_objects[0], - input_data, - output_data); + return xnn_setup_fully_connected_nc_f32(opdata->operator_objects[0], + input_data, output_data); case xnn_operator_type_fully_connected_nc_f32_qc4w: assert(kernel_data == NULL); assert(bias_data == NULL); - return xnn_setup_fully_connected_nc_f32_qc4w( - 
opdata->operator_objects[0], - input_data, - output_data); + return xnn_setup_fully_connected_nc_f32_qc4w(opdata->operator_objects[0], + input_data, output_data); case xnn_operator_type_fully_connected_nc_f32_qc8w: assert(kernel_data == NULL); assert(bias_data == NULL); - return xnn_setup_fully_connected_nc_f32_qc8w( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_fully_connected_nc_qd8_f32_qc4w: - { - const void* quantization_params = input_value->quantization.dynamic_params; + return xnn_setup_fully_connected_nc_f32_qc8w(opdata->operator_objects[0], + input_data, output_data); + case xnn_operator_type_fully_connected_nc_qd8_f32_qc4w: { + const void* quantization_params = + input_value->quantization.dynamic_params; assert(kernel_data == NULL); assert(bias_data == NULL); assert(quantization_params != NULL); return xnn_setup_fully_connected_nc_qd8_f32_qc4w( - opdata->operator_objects[0], - input_data, - output_data, - quantization_params); + opdata->operator_objects[0], input_data, output_data, + quantization_params); } - case xnn_operator_type_fully_connected_nc_qd8_f16_qc4w: - { - const void* quantization_params = input_value->quantization.dynamic_params; + case xnn_operator_type_fully_connected_nc_qd8_f16_qc4w: { + const void* quantization_params = + input_value->quantization.dynamic_params; assert(kernel_data == NULL); assert(bias_data == NULL); assert(quantization_params != NULL); return xnn_setup_fully_connected_nc_qd8_f16_qc4w( - opdata->operator_objects[0], - input_data, - output_data, - quantization_params); + opdata->operator_objects[0], input_data, output_data, + quantization_params); } - case xnn_operator_type_fully_connected_nc_qd8_f32_qb4w: - { - const void* quantization_params = input_value->quantization.dynamic_params; + case xnn_operator_type_fully_connected_nc_qd8_f32_qb4w: { + const void* quantization_params = + input_value->quantization.dynamic_params; assert(kernel_data == NULL); assert(bias_data == NULL); 
assert(quantization_params != NULL); return xnn_setup_fully_connected_nc_qd8_f32_qb4w( - opdata->operator_objects[0], - input_data, - output_data, - quantization_params); + opdata->operator_objects[0], input_data, output_data, + quantization_params); } - case xnn_operator_type_fully_connected_nc_qd8_f16_qb4w: - { - const void* quantization_params = input_value->quantization.dynamic_params; + case xnn_operator_type_fully_connected_nc_qd8_f16_qb4w: { + const void* quantization_params = + input_value->quantization.dynamic_params; assert(kernel_data == NULL); assert(bias_data == NULL); assert(quantization_params != NULL); return xnn_setup_fully_connected_nc_qd8_f16_qb4w( - opdata->operator_objects[0], - input_data, - output_data, - quantization_params); + opdata->operator_objects[0], input_data, output_data, + quantization_params); } - case xnn_operator_type_fully_connected_nc_qd8_f16_qc8w: - { - const void* quantization_params = input_value->quantization.dynamic_params; + case xnn_operator_type_fully_connected_nc_qd8_f16_qc8w: { + const void* quantization_params = + input_value->quantization.dynamic_params; assert(kernel_data == NULL); assert(bias_data == NULL); assert(quantization_params != NULL); return xnn_setup_fully_connected_nc_qd8_f16_qc8w( - opdata->operator_objects[0], - input_data, - output_data, - quantization_params); + opdata->operator_objects[0], input_data, output_data, + quantization_params); } - case xnn_operator_type_fully_connected_nc_qd8_f32_qc8w: - { - const void* quantization_params = input_value->quantization.dynamic_params; + case xnn_operator_type_fully_connected_nc_qd8_f32_qc8w: { + const void* quantization_params = + input_value->quantization.dynamic_params; assert(kernel_data == NULL); assert(bias_data == NULL); assert(quantization_params != NULL); return xnn_setup_fully_connected_nc_qd8_f32_qc8w( - opdata->operator_objects[0], - input_data, - output_data, - quantization_params); + opdata->operator_objects[0], input_data, output_data, + 
quantization_params); } - case xnn_operator_type_fully_connected_nc_qp8_f32_qc4w: - { + case xnn_operator_type_fully_connected_nc_qp8_f32_qc4w: { assert(kernel_data == NULL); assert(bias_data == NULL); return xnn_setup_fully_connected_nc_qp8_f32_qc4w( - opdata->operator_objects[0], - input_data, - output_data); + opdata->operator_objects[0], input_data, output_data); } case xnn_operator_type_fully_connected_nc_qs8: assert(kernel_data == NULL); assert(bias_data == NULL); - return xnn_setup_fully_connected_nc_qs8( - opdata->operator_objects[0], - input_data, - output_data); + return xnn_setup_fully_connected_nc_qs8(opdata->operator_objects[0], + input_data, output_data); case xnn_operator_type_fully_connected_nc_qs8_qc8w: assert(kernel_data == NULL); assert(bias_data == NULL); - return xnn_setup_fully_connected_nc_qs8_qc8w( - opdata->operator_objects[0], - input_data, - output_data); + return xnn_setup_fully_connected_nc_qs8_qc8w(opdata->operator_objects[0], + input_data, output_data); case xnn_operator_type_fully_connected_nc_qu8: assert(kernel_data == NULL); assert(bias_data == NULL); - return xnn_setup_fully_connected_nc_qu8( - opdata->operator_objects[0], - input_data, - output_data); + return xnn_setup_fully_connected_nc_qu8(opdata->operator_objects[0], + input_data, output_data); default: XNN_UNREACHABLE; } } static inline enum xnn_compute_type validate_datatypes_with_bias( - enum xnn_datatype input_datatype, - enum xnn_datatype kernel_datatype, - enum xnn_datatype bias_datatype, - enum xnn_datatype output_datatype) -{ + enum xnn_datatype input_datatype, enum xnn_datatype kernel_datatype, + enum xnn_datatype bias_datatype, enum xnn_datatype output_datatype) { switch (kernel_datatype) { case xnn_datatype_fp32: if (input_datatype == xnn_datatype_fp32 && bias_datatype == xnn_datatype_fp32 && - output_datatype == xnn_datatype_fp32) - { + output_datatype == xnn_datatype_fp32) { return xnn_compute_type_fp32; } else if (input_datatype == xnn_datatype_fp16 && - 
bias_datatype == xnn_datatype_fp32 && - output_datatype == xnn_datatype_fp16) { + bias_datatype == xnn_datatype_fp32 && + output_datatype == xnn_datatype_fp16) { // Flag: XNN_FLAG_FP32_STATIC_WEIGHTS return xnn_compute_type_fp16; } @@ -796,21 +652,18 @@ static inline enum xnn_compute_type validate_datatypes_with_bias( case xnn_datatype_fp16: if (input_datatype == xnn_datatype_fp16 && bias_datatype == xnn_datatype_fp16 && - output_datatype == xnn_datatype_fp16) - { + output_datatype == xnn_datatype_fp16) { return xnn_compute_type_fp16; } break; case xnn_datatype_qcint4: if (input_datatype == xnn_datatype_fp32 && bias_datatype == xnn_datatype_fp32 && - output_datatype == xnn_datatype_fp32) - { + output_datatype == xnn_datatype_fp32) { return xnn_compute_type_fp32; } else if (input_datatype == xnn_datatype_qdint8 && - bias_datatype == xnn_datatype_fp32 && - output_datatype == xnn_datatype_fp32) - { + bias_datatype == xnn_datatype_fp32 && + output_datatype == xnn_datatype_fp32) { return xnn_compute_type_qd8_to_fp32; } else if (input_datatype == xnn_datatype_qpint8 && bias_datatype == xnn_datatype_fp32 && @@ -825,26 +678,22 @@ static inline enum xnn_compute_type validate_datatypes_with_bias( case xnn_datatype_qbint4: if (input_datatype == xnn_datatype_qdint8 && bias_datatype == xnn_datatype_fp32 && - output_datatype == xnn_datatype_fp32) - { + output_datatype == xnn_datatype_fp32) { return xnn_compute_type_qd8_to_fp32; } else if (input_datatype == xnn_datatype_qdint8 && - bias_datatype == xnn_datatype_fp32 && - output_datatype == xnn_datatype_fp16) - { + bias_datatype == xnn_datatype_fp32 && + output_datatype == xnn_datatype_fp16) { return xnn_compute_type_qd8_to_fp16; } break; case xnn_datatype_qcint8: if (input_datatype == xnn_datatype_fp32 && bias_datatype == xnn_datatype_fp32 && - output_datatype == xnn_datatype_fp32) - { + output_datatype == xnn_datatype_fp32) { return xnn_compute_type_fp32; } else if (input_datatype == xnn_datatype_qdint8 && - bias_datatype == 
xnn_datatype_fp32 && - output_datatype == xnn_datatype_fp32) - { + bias_datatype == xnn_datatype_fp32 && + output_datatype == xnn_datatype_fp32) { return xnn_compute_type_qd8_to_fp32; } else if (input_datatype == xnn_datatype_qpint8 && bias_datatype == xnn_datatype_fp32 && @@ -863,16 +712,14 @@ static inline enum xnn_compute_type validate_datatypes_with_bias( case xnn_datatype_qint8: if (input_datatype == xnn_datatype_qint8 && bias_datatype == xnn_datatype_qint32 && - output_datatype == xnn_datatype_qint8) - { + output_datatype == xnn_datatype_qint8) { return xnn_compute_type_qs8; } break; case xnn_datatype_quint8: if (input_datatype == xnn_datatype_quint8 && bias_datatype == xnn_datatype_qint32 && - output_datatype == xnn_datatype_quint8) - { + output_datatype == xnn_datatype_quint8) { return xnn_compute_type_qu8; } break; @@ -883,28 +730,31 @@ static inline enum xnn_compute_type validate_datatypes_with_bias( } static inline enum xnn_compute_type validate_datatypes_without_bias( - enum xnn_datatype input_datatype, - enum xnn_datatype kernel_datatype, - enum xnn_datatype output_datatype) -{ + enum xnn_datatype input_datatype, enum xnn_datatype kernel_datatype, + enum xnn_datatype output_datatype) { switch (kernel_datatype) { case xnn_datatype_fp32: - if (input_datatype == xnn_datatype_fp32 && output_datatype == xnn_datatype_fp32) { + if (input_datatype == xnn_datatype_fp32 && + output_datatype == xnn_datatype_fp32) { return xnn_compute_type_fp32; - } else if (input_datatype == xnn_datatype_fp16 && output_datatype == xnn_datatype_fp16) { + } else if (input_datatype == xnn_datatype_fp16 && + output_datatype == xnn_datatype_fp16) { // Flag: XNN_FLAG_FP32_STATIC_WEIGHTS return xnn_compute_type_fp16; } break; case xnn_datatype_fp16: - if (input_datatype == xnn_datatype_fp16 && output_datatype == xnn_datatype_fp16) { + if (input_datatype == xnn_datatype_fp16 && + output_datatype == xnn_datatype_fp16) { return xnn_compute_type_fp16; } break; case xnn_datatype_qcint4: - if 
(input_datatype == xnn_datatype_fp32 && output_datatype == xnn_datatype_fp32) { + if (input_datatype == xnn_datatype_fp32 && + output_datatype == xnn_datatype_fp32) { return xnn_compute_type_fp32; - } else if (input_datatype == xnn_datatype_qdint8 && output_datatype == xnn_datatype_fp32) { + } else if (input_datatype == xnn_datatype_qdint8 && + output_datatype == xnn_datatype_fp32) { return xnn_compute_type_qd8_to_fp32; } else if (input_datatype == xnn_datatype_qpint8 && output_datatype == xnn_datatype_fp32) { @@ -915,16 +765,20 @@ static inline enum xnn_compute_type validate_datatypes_without_bias( } break; case xnn_datatype_qbint4: - if (input_datatype == xnn_datatype_qdint8 && output_datatype == xnn_datatype_fp32) { + if (input_datatype == xnn_datatype_qdint8 && + output_datatype == xnn_datatype_fp32) { return xnn_compute_type_qd8_to_fp32; - } else if (input_datatype == xnn_datatype_qdint8 && output_datatype == xnn_datatype_fp16) { + } else if (input_datatype == xnn_datatype_qdint8 && + output_datatype == xnn_datatype_fp16) { return xnn_compute_type_qd8_to_fp16; } break; case xnn_datatype_qcint8: - if (input_datatype == xnn_datatype_fp32 && output_datatype == xnn_datatype_fp32) { + if (input_datatype == xnn_datatype_fp32 && + output_datatype == xnn_datatype_fp32) { return xnn_compute_type_fp32; - } else if (input_datatype == xnn_datatype_qdint8 && output_datatype == xnn_datatype_fp32) { + } else if (input_datatype == xnn_datatype_qdint8 && + output_datatype == xnn_datatype_fp32) { return xnn_compute_type_qd8_to_fp32; } else if (input_datatype == xnn_datatype_qpint8 && output_datatype == xnn_datatype_fp32) { @@ -938,12 +792,14 @@ static inline enum xnn_compute_type validate_datatypes_without_bias( } break; case xnn_datatype_qint8: - if (input_datatype == xnn_datatype_qint8 && output_datatype == xnn_datatype_qint8) { + if (input_datatype == xnn_datatype_qint8 && + output_datatype == xnn_datatype_qint8) { return xnn_compute_type_qs8; } break; case 
xnn_datatype_quint8: - if (input_datatype == xnn_datatype_quint8 && output_datatype == xnn_datatype_quint8) { + if (input_datatype == xnn_datatype_quint8 && + output_datatype == xnn_datatype_quint8) { return xnn_compute_type_qu8; } break; @@ -953,33 +809,32 @@ static inline enum xnn_compute_type validate_datatypes_without_bias( return xnn_compute_type_invalid; } -enum xnn_status xnn_define_fully_connected( - xnn_subgraph_t subgraph, - float output_min, - float output_max, - uint32_t input_id, - uint32_t filter_id, - uint32_t bias_id, - uint32_t output_id, - uint32_t flags) -{ +enum xnn_status xnn_define_fully_connected(xnn_subgraph_t subgraph, + float output_min, float output_max, + uint32_t input_id, + uint32_t filter_id, uint32_t bias_id, + uint32_t output_id, uint32_t flags) { enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_fully_connected)) != xnn_status_success) { + if ((status = xnn_subgraph_check_xnnpack_initialized( + xnn_node_type_fully_connected)) != xnn_status_success) { return status; } - status = xnn_subgraph_check_output_min_max(xnn_node_type_fully_connected, output_min, output_max); + status = xnn_subgraph_check_output_min_max(xnn_node_type_fully_connected, + output_min, output_max); if (status != xnn_status_success) { return status; } - if ((status = xnn_subgraph_check_input_node_id(xnn_node_type_fully_connected, input_id, subgraph->num_values)) != + if ((status = xnn_subgraph_check_input_node_id( + xnn_node_type_fully_connected, input_id, subgraph->num_values)) != xnn_status_success) { return status; } const struct xnn_value* input_value = &subgraph->values[input_id]; - status = xnn_subgraph_check_input_type_dense(xnn_node_type_fully_connected, input_id, input_value); + status = xnn_subgraph_check_input_type_dense(xnn_node_type_fully_connected, + input_id, input_value); if (status != xnn_status_success) { return status; } @@ -992,35 +847,40 @@ enum xnn_status xnn_define_fully_connected( case 
xnn_datatype_qpint8: break; case xnn_datatype_qdint8: - if (input_value->quantization.num_nonbatch_dims > input_value->shape.num_dims) { - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ": num_nonbatch_dims (%zu) must be " - "<= num_dims (%zu)", - xnn_node_type_to_string(xnn_node_type_fully_connected), input_id, - input_value->quantization.num_nonbatch_dims, input_value->shape.num_dims); + if (input_value->quantization.num_nonbatch_dims > + input_value->shape.num_dims) { + xnn_log_error("failed to define %s operator with input ID #%" PRIu32 + ": num_nonbatch_dims (%zu) must be " + "<= num_dims (%zu)", + xnn_node_type_to_string(xnn_node_type_fully_connected), + input_id, input_value->quantization.num_nonbatch_dims, + input_value->shape.num_dims); return xnn_status_invalid_parameter; } break; default: - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_fully_connected), input_id, - xnn_datatype_to_string(input_value->datatype), input_value->datatype); + xnn_log_error("failed to define %s operator with input ID #%" PRIu32 + ": unsupported Value datatype %s (%d)", + xnn_node_type_to_string(xnn_node_type_fully_connected), + input_id, xnn_datatype_to_string(input_value->datatype), + input_value->datatype); return xnn_status_invalid_parameter; } if (filter_id >= subgraph->num_values) { - xnn_log_error( - "failed to define %s operator with filter ID #%" PRIu32 ": invalid Value ID", - xnn_node_type_to_string(xnn_node_type_fully_connected), filter_id); + xnn_log_error("failed to define %s operator with filter ID #%" PRIu32 + ": invalid Value ID", + xnn_node_type_to_string(xnn_node_type_fully_connected), + filter_id); return xnn_status_invalid_parameter; } const struct xnn_value* kernel_value = &subgraph->values[filter_id]; if (kernel_value->type != xnn_value_type_dense_tensor) { - xnn_log_error( - "failed to define %s operator with filter ID #%" 
PRIu32 ": unsupported Value type %d (expected dense tensor)", - xnn_node_type_to_string(xnn_node_type_fully_connected), filter_id, kernel_value->type); + xnn_log_error("failed to define %s operator with filter ID #%" PRIu32 + ": unsupported Value type %d (expected dense tensor)", + xnn_node_type_to_string(xnn_node_type_fully_connected), + filter_id, kernel_value->type); return xnn_status_invalid_parameter; } @@ -1030,9 +890,10 @@ enum xnn_status xnn_define_fully_connected( break; // non-static kernel is supported default: if (kernel_value->data == NULL) { - xnn_log_error( - "failed to define %s operator with filter ID #%" PRIu32 ": non-static Value", - xnn_node_type_to_string(xnn_node_type_fully_connected), filter_id); + xnn_log_error("failed to define %s operator with filter ID #%" PRIu32 + ": non-static Value", + xnn_node_type_to_string(xnn_node_type_fully_connected), + filter_id); return xnn_status_invalid_parameter; } break; @@ -1045,12 +906,15 @@ enum xnn_status xnn_define_fully_connected( break; case xnn_datatype_qbint4: case xnn_datatype_qcint4: - if (kernel_value->quantization.zero_point != 8 && kernel_value->quantization.zero_point != 0) { - xnn_log_error( - "failed to define %s operator with filter ID #%" PRIu32 ": unsupported quantization zero point %" PRId32 - " for datatype %s, must be equals to 8 (unsigned weights) or 0 (signed weights) ", - xnn_node_type_to_string(xnn_node_type_fully_connected), filter_id, kernel_value->quantization.zero_point, - xnn_datatype_to_string(kernel_value->datatype)); + if (kernel_value->quantization.zero_point != 8 && + kernel_value->quantization.zero_point != 0) { + xnn_log_error("failed to define %s operator with filter ID #%" PRIu32 + ": unsupported quantization zero point %" PRId32 + " for datatype %s, must be equals to 8 (unsigned " + "weights) or 0 (signed weights) ", + xnn_node_type_to_string(xnn_node_type_fully_connected), + filter_id, kernel_value->quantization.zero_point, + 
xnn_datatype_to_string(kernel_value->datatype)); return xnn_status_invalid_parameter; } break; @@ -1058,54 +922,68 @@ enum xnn_status xnn_define_fully_connected( break; case xnn_datatype_qint8: if (kernel_value->quantization.zero_point != 0) { - xnn_log_error( - "failed to define %s operator with filter ID #%" PRIu32 ": unsupported quantization zero point %" PRId32 " for datatype %s", - xnn_node_type_to_string(xnn_node_type_fully_connected), filter_id, - kernel_value->quantization.zero_point, xnn_datatype_to_string(kernel_value->datatype)); + xnn_log_error("failed to define %s operator with filter ID #%" PRIu32 + ": unsupported quantization zero point %" PRId32 + " for datatype %s", + xnn_node_type_to_string(xnn_node_type_fully_connected), + filter_id, kernel_value->quantization.zero_point, + xnn_datatype_to_string(kernel_value->datatype)); } break; case xnn_datatype_quint8: break; default: - xnn_log_error( - "failed to define %s operator with filter ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_fully_connected), filter_id, - xnn_datatype_to_string(kernel_value->datatype), kernel_value->datatype); + xnn_log_error("failed to define %s operator with filter ID #%" PRIu32 + ": unsupported Value datatype %s (%d)", + xnn_node_type_to_string(xnn_node_type_fully_connected), + filter_id, xnn_datatype_to_string(kernel_value->datatype), + kernel_value->datatype); return xnn_status_invalid_parameter; } const bool is_channelwise_quantized = - kernel_value->datatype == xnn_datatype_qcint8 || kernel_value->datatype == xnn_datatype_qcint4; + kernel_value->datatype == xnn_datatype_qcint8 || + kernel_value->datatype == xnn_datatype_qcint4; if (is_channelwise_quantized) { - const size_t output_channels_dim = ((flags & XNN_FLAG_TRANSPOSE_WEIGHTS) != 0) ? 1 : 0; + const size_t output_channels_dim = + ((flags & XNN_FLAG_TRANSPOSE_WEIGHTS) != 0) ? 
1 : 0; if (kernel_value->quantization.channel_dimension != output_channels_dim) { - xnn_log_error( - "failed to define %s operator with filter ID #%" PRIu32 ": invalid channel dimension %zu", - xnn_node_type_to_string(xnn_node_type_fully_connected), input_id, kernel_value->quantization.channel_dimension); + xnn_log_error("failed to define %s operator with filter ID #%" PRIu32 + ": invalid channel dimension %zu", + xnn_node_type_to_string(xnn_node_type_fully_connected), + input_id, kernel_value->quantization.channel_dimension); return xnn_status_invalid_parameter; } } - const bool is_blockwise_quantized = kernel_value->datatype == xnn_datatype_qbint4; + const bool is_blockwise_quantized = + kernel_value->datatype == xnn_datatype_qbint4; if (is_blockwise_quantized) { // TODO: Unsupported features - assert ((flags & XNN_FLAG_TRANSPOSE_WEIGHTS) == 0); + assert((flags & XNN_FLAG_TRANSPOSE_WEIGHTS) == 0); - const size_t input_channels_dim = ((flags & XNN_FLAG_TRANSPOSE_WEIGHTS) != 0) ? 0 : 1; - const size_t output_channels_dim = ((flags & XNN_FLAG_TRANSPOSE_WEIGHTS) != 0) ? 1 : 0; - if (kernel_value->quantization.channel_dimension_blockwise != output_channels_dim) { - xnn_log_error( - "failed to define %s operator with filter ID #%" PRIu32 ": invalid channel dimension %zu", - xnn_node_type_to_string(xnn_node_type_fully_connected), input_id, kernel_value->quantization.channel_dimension_blockwise); + const size_t input_channels_dim = + ((flags & XNN_FLAG_TRANSPOSE_WEIGHTS) != 0) ? 0 : 1; + const size_t output_channels_dim = + ((flags & XNN_FLAG_TRANSPOSE_WEIGHTS) != 0) ? 
1 : 0; + if (kernel_value->quantization.channel_dimension_blockwise != + output_channels_dim) { + xnn_log_error("failed to define %s operator with filter ID #%" PRIu32 + ": invalid channel dimension %zu", + xnn_node_type_to_string(xnn_node_type_fully_connected), + input_id, + kernel_value->quantization.channel_dimension_blockwise); return xnn_status_invalid_parameter; } const size_t input_channels = kernel_value->shape.dim[input_channels_dim]; if (input_channels % kernel_value->quantization.block_size) { - xnn_log_error( - "failed to define %s operator with filter ID #%" PRIu32 ": invalid block size %zu, input_channels %zu", - xnn_node_type_to_string(xnn_node_type_fully_connected), input_id, kernel_value->quantization.block_size, input_channels); + xnn_log_error("failed to define %s operator with filter ID #%" PRIu32 + ": invalid block size %zu, input_channels %zu", + xnn_node_type_to_string(xnn_node_type_fully_connected), + input_id, kernel_value->quantization.block_size, + input_channels); return xnn_status_invalid_parameter; } } @@ -1113,17 +991,19 @@ enum xnn_status xnn_define_fully_connected( const struct xnn_value* bias_value = NULL; if (bias_id != XNN_INVALID_VALUE_ID) { if (bias_id >= subgraph->num_values) { - xnn_log_error( - "failed to define %s operator with bias ID #%" PRIu32 ": invalid Value ID", - xnn_node_type_to_string(xnn_node_type_fully_connected), bias_id); + xnn_log_error("failed to define %s operator with bias ID #%" PRIu32 + ": invalid Value ID", + xnn_node_type_to_string(xnn_node_type_fully_connected), + bias_id); return xnn_status_invalid_parameter; } bias_value = &subgraph->values[bias_id]; if (bias_value->type != xnn_value_type_dense_tensor) { - xnn_log_error( - "failed to define %s operator with bias ID #%" PRIu32 ": unsupported Value type %d (expected dense tensor)", - xnn_node_type_to_string(xnn_node_type_fully_connected), bias_id, bias_value->type); + xnn_log_error("failed to define %s operator with bias ID #%" PRIu32 + ": unsupported 
Value type %d (expected dense tensor)", + xnn_node_type_to_string(xnn_node_type_fully_connected), + bias_id, bias_value->type); return xnn_status_invalid_parameter; } @@ -1131,17 +1011,19 @@ enum xnn_status xnn_define_fully_connected( switch (bias_value->datatype) { case xnn_datatype_fp32: if (is_channelwise_quantized && bias_value->data == NULL) { - xnn_log_error( - "failed to define %s operator with bias ID #%" PRIu32 ": non-static Value", - xnn_node_type_to_string(xnn_node_type_fully_connected), bias_id); + xnn_log_error("failed to define %s operator with bias ID #%" PRIu32 + ": non-static Value", + xnn_node_type_to_string(xnn_node_type_fully_connected), + bias_id); return xnn_status_invalid_parameter; } break; // non-static bias is supported default: if (bias_value->data == NULL) { - xnn_log_error( - "failed to define %s operator with bias ID #%" PRIu32 ": non-static Value", - xnn_node_type_to_string(xnn_node_type_fully_connected), bias_id); + xnn_log_error("failed to define %s operator with bias ID #%" PRIu32 + ": non-static Value", + xnn_node_type_to_string(xnn_node_type_fully_connected), + bias_id); return xnn_status_invalid_parameter; } break; @@ -1154,21 +1036,24 @@ enum xnn_status xnn_define_fully_connected( case xnn_datatype_qcint32: break; default: - xnn_log_error( - "failed to define %s operator with bias ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_fully_connected), bias_id, - xnn_datatype_to_string(bias_value->datatype), bias_value->datatype); + xnn_log_error("failed to define %s operator with bias ID #%" PRIu32 + ": unsupported Value datatype %s (%d)", + xnn_node_type_to_string(xnn_node_type_fully_connected), + bias_id, xnn_datatype_to_string(bias_value->datatype), + bias_value->datatype); return xnn_status_invalid_parameter; } } - status = xnn_subgraph_check_output_node_id(xnn_node_type_fully_connected, output_id, subgraph->num_values); + status = 
xnn_subgraph_check_output_node_id(xnn_node_type_fully_connected, + output_id, subgraph->num_values); if (status != xnn_status_success) { return status; } const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_fully_connected, output_id, output_value); + status = xnn_subgraph_check_output_type_dense(xnn_node_type_fully_connected, + output_id, output_value); if (status != xnn_status_success) { return status; } @@ -1180,39 +1065,46 @@ enum xnn_status xnn_define_fully_connected( case xnn_datatype_quint8: break; default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_fully_connected), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); + xnn_log_error("failed to define %s operator with output ID #%" PRIu32 + ": unsupported Value datatype %s (%d)", + xnn_node_type_to_string(xnn_node_type_fully_connected), + output_id, xnn_datatype_to_string(output_value->datatype), + output_value->datatype); return xnn_status_invalid_parameter; } enum xnn_compute_type compute_type = xnn_compute_type_invalid; if (bias_value != NULL) { compute_type = validate_datatypes_with_bias( - input_value->datatype, kernel_value->datatype, bias_value->datatype, output_value->datatype); + input_value->datatype, kernel_value->datatype, bias_value->datatype, + output_value->datatype); if (compute_type == xnn_compute_type_invalid) { - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ", filter ID #%" PRIu32 ", bias ID #%" PRIu32 ", and output ID #%" PRIu32 - ": mismatching datatypes across input (%s), filter (%s), bias (%s), and output (%s)", - xnn_node_type_to_string(xnn_node_type_fully_connected), input_id, filter_id, bias_id, output_id, - xnn_datatype_to_string(input_value->datatype), - xnn_datatype_to_string(kernel_value->datatype), - 
xnn_datatype_to_string(bias_value->datatype), - xnn_datatype_to_string(output_value->datatype)); + xnn_log_error("failed to define %s operator with input ID #%" PRIu32 + ", filter ID #%" PRIu32 ", bias ID #%" PRIu32 + ", and output ID #%" PRIu32 + ": mismatching datatypes across input (%s), filter (%s), " + "bias (%s), and output (%s)", + xnn_node_type_to_string(xnn_node_type_fully_connected), + input_id, filter_id, bias_id, output_id, + xnn_datatype_to_string(input_value->datatype), + xnn_datatype_to_string(kernel_value->datatype), + xnn_datatype_to_string(bias_value->datatype), + xnn_datatype_to_string(output_value->datatype)); return xnn_status_invalid_parameter; } } else { compute_type = validate_datatypes_without_bias( - input_value->datatype, kernel_value->datatype, output_value->datatype); + input_value->datatype, kernel_value->datatype, output_value->datatype); if (compute_type == xnn_compute_type_invalid) { - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ", filter ID #%" PRIu32 ", and output ID #%" PRIu32 - ": mismatching datatypes across input (%s), filter (%s), and output (%s)", - xnn_node_type_to_string(xnn_node_type_fully_connected), input_id, filter_id, output_id, - xnn_datatype_to_string(input_value->datatype), - xnn_datatype_to_string(kernel_value->datatype), - xnn_datatype_to_string(output_value->datatype)); + xnn_log_error("failed to define %s operator with input ID #%" PRIu32 + ", filter ID #%" PRIu32 ", and output ID #%" PRIu32 + ": mismatching datatypes across input (%s), filter (%s), " + "and output (%s)", + xnn_node_type_to_string(xnn_node_type_fully_connected), + input_id, filter_id, output_id, + xnn_datatype_to_string(input_value->datatype), + xnn_datatype_to_string(kernel_value->datatype), + xnn_datatype_to_string(output_value->datatype)); return xnn_status_invalid_parameter; } } @@ -1226,7 +1118,7 @@ enum xnn_status xnn_define_fully_connected( node->compute_type = compute_type; node->activation.output_min = 
output_min; node->activation.output_max = output_max; - node->num_inputs = 2 + (size_t) (bias_id != XNN_INVALID_VALUE_ID); + node->num_inputs = 2 + (size_t)(bias_id != XNN_INVALID_VALUE_ID); node->inputs[0] = input_id; node->inputs[1] = filter_id; node->inputs[2] = bias_id; From f993051601a0a8c189642414fa89252c7f2c7438 Mon Sep 17 00:00:00 2001 From: Alan Kelly Date: Mon, 23 Sep 2024 07:11:01 -0700 Subject: [PATCH 26/50] Re-factor fc op type detection into a function PiperOrigin-RevId: 677779781 --- src/subgraph/fully-connected.c | 505 +++++++++++++++++++-------------- 1 file changed, 290 insertions(+), 215 deletions(-) diff --git a/src/subgraph/fully-connected.c b/src/subgraph/fully-connected.c index 48a6f24e5a3..c11f7b181f3 100644 --- a/src/subgraph/fully-connected.c +++ b/src/subgraph/fully-connected.c @@ -21,120 +21,67 @@ #include "xnnpack/subgraph.h" #include "pthreadpool.h" -static enum xnn_status create_fully_connected_operator( - const struct xnn_node* node, const struct xnn_value* values, - size_t num_values, struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, xnn_weights_cache_t weights_cache) { - assert(node->num_inputs >= 2); - assert(node->num_inputs <= 3); - const uint32_t input_id = node->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - const uint32_t filter_id = node->inputs[1]; - assert(filter_id != XNN_INVALID_VALUE_ID); - assert(filter_id < num_values); +// Format is input_type, weights type, output type, (dynamic)? 
+enum fully_connected_op_type { + fc_type_invalid = 0, + fc_type_f16_f16_f16 = 1, + fc_type_f16_f16_f16_dynamic = 2, + fc_type_f16_f32_f16 = 3, + fc_type_f16_f32_f16_dynamic = 4, + fc_type_qd8_f16_qc4w = 5, + fc_type_qd8_f16_qb4w = 6, + fc_type_qd8_f16_qc8w = 7, + fc_type_f32_f32_f32 = 8, + fc_type_f32_f32_f32_dynamic = 9, + fc_type_qd8_f32_qb4w = 10, + fc_type_f32_f32_qc4w = 11, + fc_type_qd8_f32_qc4w = 12, + fc_type_qp8_f32_qc4w = 13, + fc_type_f32_f32_qc8w = 14, + fc_type_qd8_f32_qc8w = 15, + fc_type_qs8_qs8_qc8w = 16, + fc_type_qs8_qs8_qs8 = 17, + fc_type_qu8_qu8_qu8 = 18, +}; - assert(node->num_outputs == 1); - const uint32_t output_id = node->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - size_t output_channels, input_channels; - if (node->flags & XNN_FLAG_TRANSPOSE_WEIGHTS) { - input_channels = values[node->inputs[1]].shape.dim[0]; - output_channels = values[node->inputs[1]].shape.dim[1]; - } else { - output_channels = values[node->inputs[1]].shape.dim[0]; - input_channels = values[node->inputs[1]].shape.dim[1]; - } - - const void* kernel_data = values[filter_id].fp32_data != NULL - ? values[filter_id].fp32_data - : values[filter_id].data; - bool has_non_static_weights = (kernel_data == NULL); - - const void* bias_data = NULL; - if (node->num_inputs > 2) { - const uint32_t bias_id = node->inputs[2]; - assert(bias_id != XNN_INVALID_VALUE_ID); - assert(bias_id < num_values); - - bias_data = values[bias_id].fp32_data != NULL ? values[bias_id].fp32_data - : values[bias_id].data; +enum fully_connected_op_type get_fully_connected_op_type( + const struct xnn_value* input_value, const struct xnn_value* filter_value, + const struct xnn_value* bias_value, const struct xnn_value* output_value) { + const void* filter_data = filter_value->fp32_data != NULL + ? 
filter_value->fp32_data + : filter_value->data; + bool has_non_static_weights = (filter_data == NULL); + if (bias_value) { + const void* bias_data = bias_value->fp32_data != NULL + ? bias_value->fp32_data + : bias_value->data; has_non_static_weights |= (bias_data == NULL); } - const enum xnn_datatype input_datatype = values[input_id].datatype; - const enum xnn_datatype filter_datatype = values[filter_id].datatype; - const enum xnn_datatype output_datatype = values[output_id].datatype; - enum xnn_status status; + const enum xnn_datatype input_datatype = input_value->datatype; + const enum xnn_datatype filter_datatype = filter_value->datatype; + const enum xnn_datatype output_datatype = output_value->datatype; switch (output_datatype) { case xnn_datatype_fp16: switch (filter_datatype) { case xnn_datatype_fp16: if (has_non_static_weights) { - status = xnn_create_dynamic_fully_connected_nc_f16( - node->activation.output_min, node->activation.output_max, - /*flags=*/node->flags, &opdata->operator_objects[0]); + return fc_type_f16_f16_f16_dynamic; } else { - status = xnn_create_fully_connected_nc_f16( - input_channels, output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, kernel_data, bias_data, - node->activation.output_min, node->activation.output_max, - node->flags, code_cache, weights_cache, - &opdata->operator_objects[0]); + return fc_type_f16_f16_f16; } - break; case xnn_datatype_fp32: if (has_non_static_weights) { - status = xnn_create_dynamic_fully_connected_nc_f16( - node->activation.output_min, node->activation.output_max, - node->flags | XNN_FLAG_FP32_STATIC_WEIGHTS, - &opdata->operator_objects[0]); + return fc_type_f16_f32_f16_dynamic; } else { - status = xnn_create_fully_connected_nc_f16( - input_channels, output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, kernel_data, bias_data, - node->activation.output_min, node->activation.output_max, - node->flags | XNN_FLAG_FP32_STATIC_WEIGHTS, 
code_cache, - weights_cache, &opdata->operator_objects[0]); + return fc_type_f16_f32_f16; } - break; case xnn_datatype_qcint4: - status = xnn_create_fully_connected_nc_qd8_f16_qc4w( - input_channels, output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - /*kernel_zero_point=*/values[filter_id].quantization.zero_point, - values[filter_id].quantization.channelwise_scale, kernel_data, - bias_data, node->activation.output_min, - node->activation.output_max, node->flags, code_cache, - weights_cache, &opdata->operator_objects[0]); - break; + return fc_type_qd8_f16_qc4w; case xnn_datatype_qbint4: - status = xnn_create_fully_connected_nc_qd8_f16_qb4w( - input_channels, output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - /*block_size=*/values[filter_id].quantization.block_size, - /*kernel_zero_point=*/values[filter_id].quantization.zero_point, - (const uint16_t*)values[filter_id].quantization.blockwise_scale, - kernel_data, bias_data, node->activation.output_min, - node->activation.output_max, node->flags, code_cache, - weights_cache, &opdata->operator_objects[0]); - break; + return fc_type_qd8_f16_qb4w; case xnn_datatype_qcint8: - status = xnn_create_fully_connected_nc_qd8_f16_qc8w( - input_channels, output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - values[filter_id].quantization.channelwise_scale, kernel_data, - bias_data, node->activation.output_min, - node->activation.output_max, node->flags, code_cache, - weights_cache, &opdata->operator_objects[0]); - break; + return fc_type_qd8_f16_qc8w; default: XNN_UNREACHABLE; } @@ -143,69 +90,20 @@ static enum xnn_status create_fully_connected_operator( switch (filter_datatype) { case xnn_datatype_fp32: if (has_non_static_weights) { - status = xnn_create_dynamic_fully_connected_nc_f32( - node->activation.output_min, node->activation.output_max, - /*flags=*/node->flags, &opdata->operator_objects[0]); + return 
fc_type_f32_f32_f32_dynamic; } else { - status = xnn_create_fully_connected_nc_f32( - input_channels, output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, kernel_data, bias_data, - node->activation.output_min, node->activation.output_max, - /*flags=*/node->flags, code_cache, weights_cache, - &opdata->operator_objects[0]); + return fc_type_f32_f32_f32; } - break; case xnn_datatype_qbint4: - status = xnn_create_fully_connected_nc_qd8_f32_qb4w( - input_channels, output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - /*block_size=*/values[filter_id].quantization.block_size, - /*kernel_zero_point=*/values[filter_id].quantization.zero_point, - (const uint16_t*)values[filter_id].quantization.blockwise_scale, - kernel_data, bias_data, node->activation.output_min, - node->activation.output_max, node->flags, code_cache, - weights_cache, &opdata->operator_objects[0]); - break; + return fc_type_qd8_f32_qb4w; case xnn_datatype_qcint4: switch (input_datatype) { case xnn_datatype_fp32: - status = xnn_create_fully_connected_nc_f32_qc4w( - input_channels, output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - values[filter_id].quantization.zero_point, - values[filter_id].quantization.channelwise_scale, kernel_data, - bias_data, node->activation.output_min, - node->activation.output_max, - /*flags=*/node->flags, code_cache, weights_cache, - &opdata->operator_objects[0]); - break; + return fc_type_f32_f32_qc4w; case xnn_datatype_qdint8: - status = xnn_create_fully_connected_nc_qd8_f32_qc4w( - input_channels, output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - /*kernel_zero_point=*/ - values[filter_id].quantization.zero_point, - values[filter_id].quantization.channelwise_scale, kernel_data, - bias_data, node->activation.output_min, - node->activation.output_max, node->flags, code_cache, - weights_cache, &opdata->operator_objects[0]); - 
break; + return fc_type_qd8_f32_qc4w; case xnn_datatype_qpint8: - status = xnn_create_fully_connected_nc_qp8_f32_qc4w( - input_channels, output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - /*kernel_zero_point=*/ - values[filter_id].quantization.zero_point, - values[filter_id].quantization.channelwise_scale, kernel_data, - bias_data, node->activation.output_min, - node->activation.output_max, node->flags, code_cache, - weights_cache, &opdata->operator_objects[0]); - break; + return fc_type_qp8_f32_qc4w; default: XNN_UNREACHABLE; } @@ -213,26 +111,9 @@ static enum xnn_status create_fully_connected_operator( case xnn_datatype_qcint8: switch (input_datatype) { case xnn_datatype_fp32: - status = xnn_create_fully_connected_nc_f32_qc8w( - input_channels, output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - values[filter_id].quantization.channelwise_scale, kernel_data, - bias_data, node->activation.output_min, - node->activation.output_max, - /*flags=*/node->flags, code_cache, weights_cache, - &opdata->operator_objects[0]); - break; + return fc_type_f32_f32_qc8w; case xnn_datatype_qdint8: - status = xnn_create_fully_connected_nc_qd8_f32_qc8w( - input_channels, output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - values[filter_id].quantization.channelwise_scale, kernel_data, - bias_data, node->activation.output_min, - node->activation.output_max, node->flags, code_cache, - weights_cache, &opdata->operator_objects[0]); - break; + return fc_type_qd8_f32_qc8w; default: XNN_UNREACHABLE; } @@ -244,55 +125,249 @@ static enum xnn_status create_fully_connected_operator( case xnn_datatype_qint8: switch (filter_datatype) { case xnn_datatype_qcint8: - assert(!has_non_static_weights); - assert(kernel_data != NULL); - assert(values[filter_id].datatype == xnn_datatype_qcint8); - const float output_scale = values[output_id].quantization.scale; - const int32_t 
output_zero_point = - values[output_id].quantization.zero_point; - const int8_t output_min = xnn_qs8_quantize( - node->activation.output_min, output_scale, output_zero_point); - const int8_t output_max = xnn_qs8_quantize( - node->activation.output_max, output_scale, output_zero_point); - status = xnn_create_fully_connected_nc_qs8_qc8w( - input_channels, output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - (int8_t)values[input_id].quantization.zero_point, - values[input_id].quantization.scale, - values[filter_id].quantization.channelwise_scale, kernel_data, - bias_data, (int8_t)output_zero_point, output_scale, output_min, - output_max, - /*flags=*/node->flags, code_cache, weights_cache, - &opdata->operator_objects[0]); - break; - case xnn_datatype_qint8: { - assert(!has_non_static_weights); - assert(kernel_data != NULL); - const float output_scale = values[output_id].quantization.scale; - const int32_t output_zero_point = - values[output_id].quantization.zero_point; - const int8_t output_min = xnn_qs8_quantize( - node->activation.output_min, output_scale, output_zero_point); - const int8_t output_max = xnn_qs8_quantize( - node->activation.output_max, output_scale, output_zero_point); - status = xnn_create_fully_connected_nc_qs8( - input_channels, output_channels, - /*input_stride=*/input_channels, - /*output_stride=*/output_channels, - (int8_t)values[input_id].quantization.zero_point, - values[input_id].quantization.scale, - values[filter_id].quantization.scale, kernel_data, bias_data, - (int8_t)output_zero_point, output_scale, output_min, output_max, - /*flags=*/node->flags, code_cache, weights_cache, - &opdata->operator_objects[0]); - break; - } + return fc_type_qs8_qs8_qc8w; + case xnn_datatype_qint8: + return fc_type_qs8_qs8_qs8; default: XNN_UNREACHABLE; } break; - case xnn_datatype_quint8: { + case xnn_datatype_quint8: + return fc_type_qu8_qu8_qu8; + default: + XNN_UNREACHABLE; + } +} + +static enum xnn_status 
create_fully_connected_operator( + const struct xnn_node* node, const struct xnn_value* values, + size_t num_values, struct xnn_operator_data* opdata, + struct xnn_code_cache* code_cache, xnn_weights_cache_t weights_cache) { + assert(node->num_inputs >= 2); + assert(node->num_inputs <= 3); + const uint32_t input_id = node->inputs[0]; + assert(input_id != XNN_INVALID_VALUE_ID); + assert(input_id < num_values); + const uint32_t filter_id = node->inputs[1]; + assert(filter_id != XNN_INVALID_VALUE_ID); + assert(filter_id < num_values); + + assert(node->num_outputs == 1); + const uint32_t output_id = node->outputs[0]; + assert(output_id != XNN_INVALID_VALUE_ID); + assert(output_id < num_values); + + size_t output_channels, input_channels; + if (node->flags & XNN_FLAG_TRANSPOSE_WEIGHTS) { + input_channels = values[node->inputs[1]].shape.dim[0]; + output_channels = values[node->inputs[1]].shape.dim[1]; + } else { + output_channels = values[node->inputs[1]].shape.dim[0]; + input_channels = values[node->inputs[1]].shape.dim[1]; + } + + const void* kernel_data = values[filter_id].fp32_data != NULL + ? values[filter_id].fp32_data + : values[filter_id].data; + bool has_non_static_weights = (kernel_data == NULL); + + const void* bias_data = NULL; + const struct xnn_value* bias_value = NULL; + if (node->num_inputs > 2) { + const uint32_t bias_id = node->inputs[2]; + assert(bias_id != XNN_INVALID_VALUE_ID); + assert(bias_id < num_values); + + bias_data = values[bias_id].fp32_data != NULL ? 
values[bias_id].fp32_data + : values[bias_id].data; + has_non_static_weights |= (bias_data == NULL); + bias_value = &values[bias_id]; + } + + enum xnn_status status; + enum fully_connected_op_type op_type = get_fully_connected_op_type( + &values[input_id], &values[filter_id], bias_value, &values[output_id]); + switch (op_type) { + case fc_type_f16_f16_f16_dynamic: + status = xnn_create_dynamic_fully_connected_nc_f16( + node->activation.output_min, node->activation.output_max, + /*flags=*/node->flags, &opdata->operator_objects[0]); + break; + case fc_type_f16_f16_f16: + status = xnn_create_fully_connected_nc_f16( + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, kernel_data, bias_data, + node->activation.output_min, node->activation.output_max, node->flags, + code_cache, weights_cache, &opdata->operator_objects[0]); + break; + case fc_type_f16_f32_f16_dynamic: + status = xnn_create_dynamic_fully_connected_nc_f16( + node->activation.output_min, node->activation.output_max, + node->flags | XNN_FLAG_FP32_STATIC_WEIGHTS, + &opdata->operator_objects[0]); + break; + case fc_type_f16_f32_f16: + status = xnn_create_fully_connected_nc_f16( + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, kernel_data, bias_data, + node->activation.output_min, node->activation.output_max, + node->flags | XNN_FLAG_FP32_STATIC_WEIGHTS, code_cache, weights_cache, + &opdata->operator_objects[0]); + break; + case fc_type_qd8_f16_qc4w: + status = xnn_create_fully_connected_nc_qd8_f16_qc4w( + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + /*kernel_zero_point=*/values[filter_id].quantization.zero_point, + values[filter_id].quantization.channelwise_scale, kernel_data, + bias_data, node->activation.output_min, node->activation.output_max, + node->flags, code_cache, weights_cache, &opdata->operator_objects[0]); + break; + case 
fc_type_qd8_f16_qb4w: + status = xnn_create_fully_connected_nc_qd8_f16_qb4w( + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + /*block_size=*/values[filter_id].quantization.block_size, + /*kernel_zero_point=*/values[filter_id].quantization.zero_point, + (const uint16_t*)values[filter_id].quantization.blockwise_scale, + kernel_data, bias_data, node->activation.output_min, + node->activation.output_max, node->flags, code_cache, weights_cache, + &opdata->operator_objects[0]); + break; + case fc_type_qd8_f16_qc8w: + status = xnn_create_fully_connected_nc_qd8_f16_qc8w( + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + values[filter_id].quantization.channelwise_scale, kernel_data, + bias_data, node->activation.output_min, node->activation.output_max, + node->flags, code_cache, weights_cache, &opdata->operator_objects[0]); + break; + case fc_type_f32_f32_f32_dynamic: + status = xnn_create_dynamic_fully_connected_nc_f32( + node->activation.output_min, node->activation.output_max, + /*flags=*/node->flags, &opdata->operator_objects[0]); + break; + case fc_type_f32_f32_f32: + status = xnn_create_fully_connected_nc_f32( + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, kernel_data, bias_data, + node->activation.output_min, node->activation.output_max, + /*flags=*/node->flags, code_cache, weights_cache, + &opdata->operator_objects[0]); + break; + case fc_type_qd8_f32_qb4w: + status = xnn_create_fully_connected_nc_qd8_f32_qb4w( + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + /*block_size=*/values[filter_id].quantization.block_size, + /*kernel_zero_point=*/values[filter_id].quantization.zero_point, + (const uint16_t*)values[filter_id].quantization.blockwise_scale, + kernel_data, bias_data, node->activation.output_min, + node->activation.output_max, 
node->flags, code_cache, weights_cache, + &opdata->operator_objects[0]); + break; + case fc_type_f32_f32_qc4w: + status = xnn_create_fully_connected_nc_f32_qc4w( + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + values[filter_id].quantization.zero_point, + values[filter_id].quantization.channelwise_scale, kernel_data, + bias_data, node->activation.output_min, node->activation.output_max, + /*flags=*/node->flags, code_cache, weights_cache, + &opdata->operator_objects[0]); + break; + case fc_type_qd8_f32_qc4w: + status = xnn_create_fully_connected_nc_qd8_f32_qc4w( + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + /*kernel_zero_point=*/values[filter_id].quantization.zero_point, + values[filter_id].quantization.channelwise_scale, kernel_data, + bias_data, node->activation.output_min, node->activation.output_max, + node->flags, code_cache, weights_cache, &opdata->operator_objects[0]); + break; + case fc_type_qp8_f32_qc4w: + status = xnn_create_fully_connected_nc_qp8_f32_qc4w( + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + /*kernel_zero_point=*/values[filter_id].quantization.zero_point, + values[filter_id].quantization.channelwise_scale, kernel_data, + bias_data, node->activation.output_min, node->activation.output_max, + node->flags, code_cache, weights_cache, &opdata->operator_objects[0]); + break; + case fc_type_f32_f32_qc8w: + status = xnn_create_fully_connected_nc_f32_qc8w( + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + values[filter_id].quantization.channelwise_scale, kernel_data, + bias_data, node->activation.output_min, node->activation.output_max, + /*flags=*/node->flags, code_cache, weights_cache, + &opdata->operator_objects[0]); + break; + case fc_type_qd8_f32_qc8w: + status = xnn_create_fully_connected_nc_qd8_f32_qc8w( + 
input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + values[filter_id].quantization.channelwise_scale, kernel_data, + bias_data, node->activation.output_min, node->activation.output_max, + node->flags, code_cache, weights_cache, &opdata->operator_objects[0]); + break; + case fc_type_qs8_qs8_qc8w: + assert(!has_non_static_weights); + assert(kernel_data != NULL); + assert(values[filter_id].datatype == xnn_datatype_qcint8); + const float output_scale = values[output_id].quantization.scale; + const int32_t output_zero_point = + values[output_id].quantization.zero_point; + const int8_t output_min = xnn_qs8_quantize( + node->activation.output_min, output_scale, output_zero_point); + const int8_t output_max = xnn_qs8_quantize( + node->activation.output_max, output_scale, output_zero_point); + status = xnn_create_fully_connected_nc_qs8_qc8w( + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + (int8_t)values[input_id].quantization.zero_point, + values[input_id].quantization.scale, + values[filter_id].quantization.channelwise_scale, kernel_data, + bias_data, (int8_t)output_zero_point, output_scale, output_min, + output_max, + /*flags=*/node->flags, code_cache, weights_cache, + &opdata->operator_objects[0]); + break; + case fc_type_qs8_qs8_qs8: { + assert(!has_non_static_weights); + assert(kernel_data != NULL); + const float output_scale = values[output_id].quantization.scale; + const int32_t output_zero_point = + values[output_id].quantization.zero_point; + const int8_t output_min = xnn_qs8_quantize( + node->activation.output_min, output_scale, output_zero_point); + const int8_t output_max = xnn_qs8_quantize( + node->activation.output_max, output_scale, output_zero_point); + status = xnn_create_fully_connected_nc_qs8( + input_channels, output_channels, + /*input_stride=*/input_channels, + /*output_stride=*/output_channels, + 
(int8_t)values[input_id].quantization.zero_point, + values[input_id].quantization.scale, + values[filter_id].quantization.scale, kernel_data, bias_data, + (int8_t)output_zero_point, output_scale, output_min, output_max, + /*flags=*/node->flags, code_cache, weights_cache, + &opdata->operator_objects[0]); + } break; + case fc_type_qu8_qu8_qu8: { assert(!has_non_static_weights); assert(kernel_data != NULL); const float output_scale = values[output_id].quantization.scale; From 8bed40e0e7f65d5a3f58640f2a04cdf0956cc741 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Mon, 23 Sep 2024 07:44:06 -0700 Subject: [PATCH 27/50] Stop fusing clamps with binary operators We are going to split these later anyways. PiperOrigin-RevId: 677790032 --- src/subgraph.c | 4 -- test/fusion.cc | 120 ------------------------------------------------- 2 files changed, 124 deletions(-) diff --git a/src/subgraph.c b/src/subgraph.c index 625c0de6e8d..54c74529207 100644 --- a/src/subgraph.c +++ b/src/subgraph.c @@ -1153,17 +1153,13 @@ enum xnn_status xnn_subgraph_fusion( // Try to fuse Clamp Node upstream into producer Node if (consumer->type == xnn_node_type_clamp) { switch (producer->type) { - case xnn_node_type_add2: case xnn_node_type_average_pooling_2d: case xnn_node_type_clamp: case xnn_node_type_convolution_2d: - case xnn_node_type_divide: case xnn_node_type_deconvolution_2d: case xnn_node_type_depthwise_convolution_2d: case xnn_node_type_fully_connected: - case xnn_node_type_multiply2: case xnn_node_type_max_pooling_2d: - case xnn_node_type_subtract: xnn_log_info("fuse Clamp Node #%"PRIu32" into upstream Node #%"PRIu32, consumer_id, producer_id); assert(producer->num_outputs == 1); assert(consumer->num_inputs == 1); diff --git a/test/fusion.cc b/test/fusion.cc index 5d717d1bff9..bccf3f55781 100644 --- a/test/fusion.cc +++ b/test/fusion.cc @@ -17,36 +17,6 @@ namespace xnnpack { -TEST(ADD_THEN_CLAMP, fusion) { - RuntimeTester tester(4); - float output_min = -0.5f; - float output_max = 0.5f; 
- uint32_t input1_id = 0; - uint32_t input2_id = 1; - uint32_t intermediate_id = 2; - uint32_t output_id = 3; - tester - .AddInputTensorF32({1, 2, 2, 3}, input1_id) - .AddInputTensorF32({1, 2, 2, 3}, input2_id) - .AddDynamicTensorF32({1, 2, 2, 3}, intermediate_id) - .AddOutputTensorF32({1, 2, 2, 3}, output_id) - .AddAddition(input1_id, input2_id, intermediate_id) - .AddClamp(output_min, output_max, intermediate_id, output_id); - - std::vector unoptimized_output = tester.RunWithoutFusion(); - ASSERT_EQ(tester.NumOperators(), 2); - - std::vector optimized_output = tester.RunWithFusion(); - - ASSERT_EQ(tester.NumOperators(), 1); - ASSERT_EQ(tester.Node(0)->activation.output_min, output_min); - ASSERT_EQ(tester.Node(0)->activation.output_max, output_max); - ASSERT_EQ(tester.Node(0)->outputs[0], output_id); - ASSERT_EQ(tester.Node(1)->compute_type, xnn_compute_type_invalid); - - ASSERT_EQ(unoptimized_output, optimized_output); -} - TEST(AVERAGE_POOLING_2D_THEN_CLAMP, fusion) { RuntimeTester tester(3); float output_min = -0.5f; @@ -148,36 +118,6 @@ TEST(CONVOLUTION_2D_THEN_CLAMP, fusion) { ASSERT_EQ(unoptimized_output, optimized_output); } -TEST(DIVIDE_THEN_CLAMP, fusion) { - RuntimeTester tester(4); - float output_min = -0.5f; - float output_max = 0.5f; - uint32_t input1_id = 0; - uint32_t input2_id = 1; - uint32_t intermediate_id = 2; - uint32_t output_id = 3; - tester - .AddInputTensorF32({1, 2, 2, 3}, input1_id) - .AddInputTensorF32({1, 2, 2, 3}, input2_id) - .AddDynamicTensorF32({1, 2, 2, 3}, intermediate_id) - .AddOutputTensorF32({1, 2, 2, 3}, output_id) - .AddDivide(input1_id, input2_id, intermediate_id) - .AddClamp(output_min, output_max, intermediate_id, output_id); - - std::vector unoptimized_output = tester.RunWithoutFusion(); - ASSERT_EQ(tester.NumOperators(), 2); - - std::vector optimized_output = tester.RunWithFusion(); - - ASSERT_EQ(tester.NumOperators(), 1); - ASSERT_EQ(tester.Node(0)->activation.output_min, output_min); - 
ASSERT_EQ(tester.Node(0)->activation.output_max, output_max); - ASSERT_EQ(tester.Node(0)->outputs[0], output_id); - ASSERT_EQ(tester.Node(1)->compute_type, xnn_compute_type_invalid); - - ASSERT_EQ(unoptimized_output, optimized_output); -} - TEST(DECONVOLUTION_2D_THEN_CLAMP, fusion) { RuntimeTester tester(5); float output_min = -0.5f; @@ -328,36 +268,6 @@ TEST(FULLY_CONNECTED_2D_THEN_COPY_THEN_FULLY_CONNECTED, fusion) { ASSERT_EQ(unoptimized_output, optimized_output); } -TEST(MULTIPLY_THEN_CLAMP, fusion) { - RuntimeTester tester(4); - float output_min = -0.5f; - float output_max = 0.5f; - uint32_t input1_id = 0; - uint32_t input2_id = 1; - uint32_t intermediate_id = 2; - uint32_t output_id = 3; - tester - .AddInputTensorF32({1, 2, 2, 3}, input1_id) - .AddInputTensorF32({1, 2, 2, 3}, input2_id) - .AddDynamicTensorF32({1, 2, 2, 3}, intermediate_id) - .AddOutputTensorF32({1, 2, 2, 3}, output_id) - .AddMultiply(input1_id, input2_id, intermediate_id) - .AddClamp(output_min, output_max, intermediate_id, output_id); - - std::vector unoptimized_output = tester.RunWithoutFusion(); - ASSERT_EQ(tester.NumOperators(), 2); - - std::vector optimized_output = tester.RunWithFusion(); - - ASSERT_EQ(tester.NumOperators(), 1); - ASSERT_EQ(tester.Node(0)->activation.output_min, output_min); - ASSERT_EQ(tester.Node(0)->activation.output_max, output_max); - ASSERT_EQ(tester.Node(0)->outputs[0], output_id); - ASSERT_EQ(tester.Node(1)->compute_type, xnn_compute_type_invalid); - - ASSERT_EQ(unoptimized_output, optimized_output); -} - TEST(MAX_POOLING_THEN_CLAMP, fusion) { RuntimeTester tester(3); float output_min = -0.5f; @@ -386,36 +296,6 @@ TEST(MAX_POOLING_THEN_CLAMP, fusion) { ASSERT_EQ(unoptimized_output, optimized_output); } -TEST(SUBTRACT_THEN_CLAMP, fusion) { - RuntimeTester tester(4); - float output_min = -0.5f; - float output_max = 0.5f; - uint32_t input1_id = 0; - uint32_t input2_id = 1; - uint32_t intermediate_id = 2; - uint32_t output_id = 3; - tester - .AddInputTensorF32({1, 
2, 2, 3}, input1_id) - .AddInputTensorF32({1, 2, 2, 3}, input2_id) - .AddDynamicTensorF32({1, 2, 2, 3}, intermediate_id) - .AddOutputTensorF32({1, 2, 2, 3}, output_id) - .AddSubtract(input1_id, input2_id, intermediate_id) - .AddClamp(output_min, output_max, intermediate_id, output_id); - - std::vector unoptimized_output = tester.RunWithoutFusion(); - ASSERT_EQ(tester.NumOperators(), 2); - - std::vector optimized_output = tester.RunWithFusion(); - - ASSERT_EQ(tester.NumOperators(), 1); - ASSERT_EQ(tester.Node(0)->activation.output_min, output_min); - ASSERT_EQ(tester.Node(0)->activation.output_max, output_max); - ASSERT_EQ(tester.Node(0)->outputs[0], output_id); - ASSERT_EQ(tester.Node(1)->compute_type, xnn_compute_type_invalid); - - ASSERT_EQ(unoptimized_output, optimized_output); -} - TEST(CONSTANT_PAD_THEN_CONVOLUTION, fusion) { RuntimeTester tester(5); uint32_t input_id = 0; From 886ef4fccf0bfa79b87e7b3d1deaa72783945eb5 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Mon, 23 Sep 2024 08:51:12 -0700 Subject: [PATCH 28/50] f32-raddstoreexpminusmax-avx2-rr2 microkernel - Exact port of SSE2 rr2 microkernel PiperOrigin-RevId: 677812124 --- bench/f32-raddstoreexpminusmax.cc | 118 ++++ cmake/gen/avx2_microkernels.cmake | 15 + gen/avx2_microkernels.bzl | 15 + scripts/generate-f32-raddstoreexpminusmax.sh | 30 +- src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in | 249 ++++++++ ...addstoreexpminusmax-avx2-rr2-p5-u32-acc2.c | 274 +++++++++ ...addstoreexpminusmax-avx2-rr2-p5-u32-acc4.c | 278 +++++++++ ...f32-raddstoreexpminusmax-avx2-rr2-p5-u32.c | 271 +++++++++ ...addstoreexpminusmax-avx2-rr2-p5-u64-acc2.c | 338 +++++++++++ ...addstoreexpminusmax-avx2-rr2-p5-u64-acc4.c | 342 +++++++++++ ...f32-raddstoreexpminusmax-avx2-rr2-p5-u64.c | 335 +++++++++++ ...addstoreexpminusmax-avx2-rr2-p5-u72-acc3.c | 356 +++++++++++ ...f32-raddstoreexpminusmax-avx2-rr2-p5-u72.c | 351 +++++++++++ ...addstoreexpminusmax-avx2-rr2-p5-u80-acc2.c | 370 ++++++++++++ 
...addstoreexpminusmax-avx2-rr2-p5-u80-acc5.c | 376 ++++++++++++ ...f32-raddstoreexpminusmax-avx2-rr2-p5-u80.c | 367 ++++++++++++ ...addstoreexpminusmax-avx2-rr2-p5-u96-acc2.c | 402 +++++++++++++ ...addstoreexpminusmax-avx2-rr2-p5-u96-acc3.c | 404 +++++++++++++ ...addstoreexpminusmax-avx2-rr2-p5-u96-acc6.c | 410 +++++++++++++ ...f32-raddstoreexpminusmax-avx2-rr2-p5-u96.c | 399 +++++++++++++ src/xnnpack/raddstoreexpminusmax.h | 16 + test/f32-raddstoreexpminusmax.cc | 555 ++++++++++++++++++ test/f32-raddstoreexpminusmax.yaml | 16 + 23 files changed, 6273 insertions(+), 14 deletions(-) create mode 100644 src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc2.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc4.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc2.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc4.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72-acc3.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc2.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc5.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc2.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc3.c create mode 100644 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc6.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96.c diff --git a/bench/f32-raddstoreexpminusmax.cc b/bench/f32-raddstoreexpminusmax.cc index 4a3cae36122..87acc4dbc25 100644 --- a/bench/f32-raddstoreexpminusmax.cc +++ b/bench/f32-raddstoreexpminusmax.cc @@ -662,6 +662,124 @@ static void f32_raddstoreexpminusmax( ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); + + + + + + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u32, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32, + nullptr, + benchmark::utils::CheckAVX2) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u32_acc2, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2, + nullptr, + benchmark::utils::CheckAVX2) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u32_acc4, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4, + nullptr, + benchmark::utils::CheckAVX2) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u64, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64, + nullptr, + benchmark::utils::CheckAVX2) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u64_acc2, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc2, + nullptr, + benchmark::utils::CheckAVX2) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u64_acc4, + 
xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc4, + nullptr, + benchmark::utils::CheckAVX2) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u72, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72, + nullptr, + benchmark::utils::CheckAVX2) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u72_acc3, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72_acc3, + nullptr, + benchmark::utils::CheckAVX2) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u80, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80, + nullptr, + benchmark::utils::CheckAVX2) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u80_acc2, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc2, + nullptr, + benchmark::utils::CheckAVX2) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u80_acc5, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc5, + nullptr, + benchmark::utils::CheckAVX2) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u96, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96, + nullptr, + benchmark::utils::CheckAVX2) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u96_acc2, + 
xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc2, + nullptr, + benchmark::utils::CheckAVX2) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u96_acc3, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc3, + nullptr, + benchmark::utils::CheckAVX2) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u96_acc6, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc6, + nullptr, + benchmark::utils::CheckAVX2) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + + + + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u4, xnn_f32_rmax_ukernel__avx_u32_acc4, xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u4, diff --git a/cmake/gen/avx2_microkernels.cmake b/cmake/gen/avx2_microkernels.cmake index 1f71448fece..480c58761d7 100644 --- a/cmake/gen/avx2_microkernels.cmake +++ b/cmake/gen/avx2_microkernels.cmake @@ -248,6 +248,21 @@ SET(NON_PROD_AVX2_MICROKERNEL_SRCS src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc3.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc6.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc2.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc4.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc2.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc4.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64.c + 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72-acc3.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc2.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc5.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc2.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc3.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc6.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96.c src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-u8.c src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-u16.c src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-u24.c diff --git a/gen/avx2_microkernels.bzl b/gen/avx2_microkernels.bzl index d10a691fb09..d35b73e0b58 100644 --- a/gen/avx2_microkernels.bzl +++ b/gen/avx2_microkernels.bzl @@ -245,6 +245,21 @@ NON_PROD_AVX2_MICROKERNEL_SRCS = [ "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc3.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc6.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc2.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc4.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc2.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc4.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72-acc3.c", + 
"src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc2.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc5.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc2.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc3.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc6.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96.c", "src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-u8.c", "src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-u16.c", "src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-u24.c", diff --git a/scripts/generate-f32-raddstoreexpminusmax.sh b/scripts/generate-f32-raddstoreexpminusmax.sh index e789f74146f..b467de77ecb 100755 --- a/scripts/generate-f32-raddstoreexpminusmax.sh +++ b/scripts/generate-f32-raddstoreexpminusmax.sh @@ -75,21 +75,23 @@ tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=20 -D AC tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=2 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc2.c & tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=5 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc5.c & -################################### x86 AVX ################################## -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=4 -D ACCUMULATORS=1 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u4.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=1 -D AVX=1 -o 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=2 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=1 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=2 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=3 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc3.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=1 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=2 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=4 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc4.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=1 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=2 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=5 -D AVX=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc5.c & - ################################### x86 AVX2 
################################## +tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=32 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32.c & +tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=32 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc2.c & +tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=32 -D ACCUMULATORS=4 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc4.c & +tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=64 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64.c & +tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=64 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc2.c & +tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=64 -D ACCUMULATORS=4 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc4.c & +tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=72 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72.c & +tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=72 -D ACCUMULATORS=3 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72-acc3.c & +tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=80 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80.c & +tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=80 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc2.c & +tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=80 -D ACCUMULATORS=5 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc5.c & 
+tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=96 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96.c & +tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=96 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc2.c & +tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=96 -D ACCUMULATORS=3 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc3.c & +tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=96 -D ACCUMULATORS=6 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc6.c & + tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=32 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32.c & tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=32 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc2.c & tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=32 -D ACCUMULATORS=4 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc4.c & diff --git a/src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in b/src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in new file mode 100644 index 00000000000..733e6489813 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in @@ -0,0 +1,249 @@ +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +$assert BATCH_TILE % 8 == 0 +$assert BATCH_TILE >= 8 +$SIMD_TILE = BATCH_TILE // 8 +#include + +#include + +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u${BATCH_TILE}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; + + const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); + const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); + const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f); + const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f); + const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); + const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); + const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); + const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); + const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); + const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m256 vi_max = _mm256_broadcast_ss(max); + + $for K in range(ACCUMULATORS): + __m256 vacc${K} = _mm256_setzero_ps(); + for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { + // Load ${BATCH_TILE} (${SIMD_TILE}x4) inputs at a time. 
+ const __m256 vi0 = _mm256_loadu_ps(input); + $for N in range(1, SIMD_TILE): + const __m256 vi${N} = _mm256_loadu_ps(input + ${N*8}); + input += ${BATCH_TILE}; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + $for N in range(SIMD_TILE): + const __m256 vx${N} = _mm256_sub_ps(vi${N}, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + $for N in range(SIMD_TILE): + __m256 vn${N} = _mm256_add_ps(_mm256_mul_ps(vx${N}, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + $for N in range(SIMD_TILE): + const __m256 vs${N} = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn${N}), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + $for N in range(SIMD_TILE): + vn${N} = _mm256_sub_ps(vn${N}, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + $for N in range(SIMD_TILE): + __m256 vt${N} = _mm256_add_ps(_mm256_mul_ps(vn${N}, vminus_ln2_hi), vx${N}); + + $for N in range(SIMD_TILE): + vt${N} = _mm256_add_ps(_mm256_mul_ps(vn${N}, vminus_ln2_lo), vt${N}); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ $for N in range(SIMD_TILE): + __m256 vp${N} = _mm256_add_ps(_mm256_mul_ps(vc5, vt${N}), vc4); + + $for N in range(SIMD_TILE): + vp${N} = _mm256_add_ps(_mm256_mul_ps(vp${N}, vt${N}), vc3); + + $for N in range(SIMD_TILE): + vp${N} = _mm256_add_ps(_mm256_mul_ps(vp${N}, vt${N}), vc2); + + $for N in range(SIMD_TILE): + vp${N} = _mm256_add_ps(_mm256_mul_ps(vp${N}, vt${N}), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + $for N in range(SIMD_TILE): + vt${N} = _mm256_mul_ps(vt${N}, vs${N}); + + $for N in range(SIMD_TILE): + __m256 vf${N} = _mm256_add_ps(_mm256_mul_ps(vt${N}, vp${N}), vs${N}); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + $for N in range(SIMD_TILE): + vf${N} = _mm256_andnot_ps(_mm256_cmp_ps(vx${N}, vdenorm_cutoff, _CMP_LT_OS), vf${N}); + + // Store ${BATCH_TILE} (${SIMD_TILE}x4) outputs at a time. + _mm256_storeu_ps(output, vf0); + $for N in range(1, SIMD_TILE): + _mm256_storeu_ps(output + ${N*8}, vf${N}); + output += ${BATCH_TILE}; + + // Accumulate computed exponents. + $for N in range(SIMD_TILE): + vacc${N % ACCUMULATORS} = _mm256_add_ps(vacc${N % ACCUMULATORS}, vf${N}); + } + $if ACCUMULATORS > 1: + // Add up all accumulators to vacc0 + $ACC_SLICE = 1 + $while ACC_SLICE < ACCUMULATORS: + $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): + $if A + ACC_SLICE < ACCUMULATORS: + vacc${A} = _mm256_add_ps(vacc${A}, vacc${A + ACC_SLICE}); + $ACC_SLICE *= 2 + + __m256 vacc = vacc0; + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + // Load 8 inputs at a time. + const __m256 vi = _mm256_loadu_ps(input); + input += 8; + + // Subtract maximum input x := i - i_max. This implies x <= 0. 
+ const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + // Store 8 outputs at a time. + _mm256_storeu_ps(output, vf); + output += 8; + + // Accumulate computed exponents. 
+ vacc = _mm256_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 7 * sizeof(float)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); + + const __m256 vi = _mm256_maskload_ps(input, vmask); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
+ vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + __m128 vf_lo = _mm256_castps256_ps128(vf); + if (batch & (4 * sizeof(float))) { + _mm_storeu_ps(output, vf_lo); + vf_lo = _mm256_extractf128_ps(vf, 1); + output += 4; + } + if (batch & (2 * sizeof(float))) { + _mm_storel_pi((__m64*) output, vf_lo); + vf_lo = _mm_movehl_ps(vf_lo, vf_lo); + output += 2; + } + if (batch & (1 * sizeof(float))) { + _mm_store_ss(output, vf_lo); + } + + vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); + } + __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); + vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); + vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); + _mm_store_ss(sum, vacc_lo); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc2.c new file mode 100644 index 00000000000..623755988c5 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc2.c @@ -0,0 +1,274 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
#include <assert.h>

#include <immintrin.h>

#include "xnnpack/raddstoreexpminusmax.h"


// Computes output[i] = exp(input[i] - *max) for all elements and stores the
// scalar sum of the outputs to *sum (one-pass softmax numerator + denominator).
// AVX2 implementation: Cody-Waite two-constant (rr2) range reduction, degree-5
// polynomial (p5), 32-element main tile, 2 partial-sum accumulators (acc2).
//
// batch  - number of input BYTES; non-zero multiple of sizeof(float).
// input  - input vector x.
// max    - pointer to the precomputed maximum of x (for numerical stability).
// output - receives exp(x[i] - *max) per element.
// sum    - receives the sum of all stored outputs.
// params - unused here: all constants are inlined below.
void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2(
    size_t batch,
    const float* input,
    const float* max,
    float* output,
    float* sum,
    const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input != NULL);
  assert(max != NULL);
  assert(output != NULL);
  assert(sum != NULL);

  // Sliding mask window: loading 8 ints ending `batch` bytes before
  // mask_table[8] yields -1 in exactly the first batch/sizeof(float) lanes.
  static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};

  const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f);
  // Adding then subtracting this bias rounds x*log2e to the nearest integer.
  const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f);
  const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f);
  const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f);
  const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f);
  const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f);
  const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f);
  const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f);
  const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f);
  const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f);

  XNN_FORCE_REALIZATION(vlog2e);
  XNN_FORCE_REALIZATION(vmagic_bias);
  XNN_FORCE_REALIZATION(vminus_ln2_hi);
  XNN_FORCE_REALIZATION(vminus_ln2_lo);
  XNN_FORCE_REALIZATION(vc5);
  XNN_FORCE_REALIZATION(vc4);
  XNN_FORCE_REALIZATION(vc3);
  XNN_FORCE_REALIZATION(vc2);
  XNN_FORCE_REALIZATION(vc1);
  XNN_FORCE_REALIZATION(vdenorm_cutoff);

  const __m256 vi_max = _mm256_broadcast_ss(max);

  __m256 vacc0 = _mm256_setzero_ps();
  __m256 vacc1 = _mm256_setzero_ps();
  for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) {
    // Load 32 (4x8) inputs at a time.
    const __m256 vi0 = _mm256_loadu_ps(input);
    const __m256 vi1 = _mm256_loadu_ps(input + 8);
    const __m256 vi2 = _mm256_loadu_ps(input + 16);
    const __m256 vi3 = _mm256_loadu_ps(input + 24);
    input += 32;

    // Subtract maximum input x := i - i_max. This implies x <= 0.
    const __m256 vx0 = _mm256_sub_ps(vi0, vi_max);
    const __m256 vx1 = _mm256_sub_ps(vi1, vi_max);
    const __m256 vx2 = _mm256_sub_ps(vi2, vi_max);
    const __m256 vx3 = _mm256_sub_ps(vi3, vi_max);

    // Compute reduced argument n := round(x / log(2)).
    __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias);
    __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias);
    __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias);
    __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias);

    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
    const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23));
    const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23));
    const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23));
    const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23));

    // Subtract the large number back to get final n := round(x / log(2)).
    vn0 = _mm256_sub_ps(vn0, vmagic_bias);
    vn1 = _mm256_sub_ps(vn1, vmagic_bias);
    vn2 = _mm256_sub_ps(vn2, vmagic_bias);
    vn3 = _mm256_sub_ps(vn3, vmagic_bias);

    // Compute reduced argument t := x - n * log(2).
    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
    __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0);
    __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1);
    __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2);
    __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3);

    vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0);
    vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1);
    vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2);
    vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3);

    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2] (Horner's scheme).
    __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4);
    __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4);
    __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4);
    __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4);

    vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3);
    vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3);
    vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3);
    vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3);

    vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2);
    vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2);
    vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2);
    vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2);

    vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1);
    vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1);
    vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1);
    vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1);

    // Reconstruct the final f value:
    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
    //     = s + (t * s) * p
    vt0 = _mm256_mul_ps(vt0, vs0);
    vt1 = _mm256_mul_ps(vt1, vs1);
    vt2 = _mm256_mul_ps(vt2, vs2);
    vt3 = _mm256_mul_ps(vt3, vs3);

    __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0);
    __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1);
    __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2);
    __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3);

    // For inputs below zero cutoff, replace output with +0.0f.
    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
    vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0);
    vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1);
    vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2);
    vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3);

    // Store 32 (4x8) outputs at a time.
    _mm256_storeu_ps(output, vf0);
    _mm256_storeu_ps(output + 8, vf1);
    _mm256_storeu_ps(output + 16, vf2);
    _mm256_storeu_ps(output + 24, vf3);
    output += 32;

    // Accumulate computed exponents, alternating between the two partial sums.
    vacc0 = _mm256_add_ps(vacc0, vf0);
    vacc1 = _mm256_add_ps(vacc1, vf1);
    vacc0 = _mm256_add_ps(vacc0, vf2);
    vacc1 = _mm256_add_ps(vacc1, vf3);
  }
  // Add up all accumulators to vacc0
  vacc0 = _mm256_add_ps(vacc0, vacc1);

  __m256 vacc = vacc0;
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
    // Load 8 inputs at a time.
    const __m256 vi = _mm256_loadu_ps(input);
    input += 8;

    // Subtract maximum input x := i - i_max. This implies x <= 0.
    const __m256 vx = _mm256_sub_ps(vi, vi_max);

    // Compute reduced argument n := round(x / log(2)).
    __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias);

    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
    const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));

    // Subtract the large number back to get final n := round(x / log(2)).
    vn = _mm256_sub_ps(vn, vmagic_bias);

    // Compute reduced argument t := x - n * log(2).
    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
    __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx);
    vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt);

    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
    __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4);
    vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3);
    vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2);
    vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1);

    // Reconstruct the final f value:
    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
    //     = s + (t * s) * p
    vt = _mm256_mul_ps(vt, vs);
    __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs);

    // For inputs below zero cutoff, replace output with +0.0f.
    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
    vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf);

    // Store 8 outputs at a time.
    _mm256_storeu_ps(output, vf);
    output += 8;

    // Accumulate computed exponents.
    vacc = _mm256_add_ps(vacc, vf);
  }
  if (batch != 0) {
    // Remainder: 1-7 elements, handled with a masked load and piecewise stores.
    assert(batch >= 1 * sizeof(float));
    assert(batch <= 7 * sizeof(float));
    // -1 in the first batch/sizeof(float) lanes, 0 elsewhere (see mask_table).
    const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch));

    const __m256 vi = _mm256_maskload_ps(input, vmask);

    // Subtract maximum input x := i - i_max. This implies x <= 0.
    const __m256 vx = _mm256_sub_ps(vi, vi_max);

    // Compute reduced argument n := round(x / log(2)).
    __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias);

    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
    const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));

    // Subtract the large number back to get final n := round(x / log(2)).
    vn = _mm256_sub_ps(vn, vmagic_bias);

    // Compute reduced argument t := x - n * log(2).
    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
    __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx);
    vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt);

    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
    __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4);
    vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3);
    vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2);
    vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1);

    // Reconstruct the final f value:
    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
    //     = s + (t * s) * p
    vt = _mm256_mul_ps(vt, vs);
    __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs);

    // For inputs below zero cutoff, replace output with +0.0f.
    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
    vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf);

    // Store the remaining 1-7 outputs: 4, then 2, then 1, per the bits of batch.
    __m128 vf_lo = _mm256_castps256_ps128(vf);
    if (batch & (4 * sizeof(float))) {
      _mm_storeu_ps(output, vf_lo);
      vf_lo = _mm256_extractf128_ps(vf, 1);
      output += 4;
    }
    if (batch & (2 * sizeof(float))) {
      _mm_storel_pi((__m64*) output, vf_lo);
      vf_lo = _mm_movehl_ps(vf_lo, vf_lo);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      _mm_store_ss(output, vf_lo);
    }

    // Zero the inactive lanes before accumulating so they don't affect the sum.
    vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask)));
  }
  // Horizontal reduction of the 8-lane accumulator to a scalar: 256->128->64->32.
  __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1));
  vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo));
  vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo));
  _mm_store_ss(sum, vacc_lo);
}
#include <assert.h>

#include <immintrin.h>

#include "xnnpack/raddstoreexpminusmax.h"


// Computes output[i] = exp(input[i] - *max) for all elements and stores the
// scalar sum of the outputs to *sum (one-pass softmax numerator + denominator).
// AVX2 implementation: Cody-Waite two-constant (rr2) range reduction, degree-5
// polynomial (p5), 32-element main tile, 4 partial-sum accumulators (acc4).
//
// batch  - number of input BYTES; non-zero multiple of sizeof(float).
// input  - input vector x.
// max    - pointer to the precomputed maximum of x (for numerical stability).
// output - receives exp(x[i] - *max) per element.
// sum    - receives the sum of all stored outputs.
// params - unused here: all constants are inlined below.
void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4(
    size_t batch,
    const float* input,
    const float* max,
    float* output,
    float* sum,
    const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input != NULL);
  assert(max != NULL);
  assert(output != NULL);
  assert(sum != NULL);

  // Sliding mask window: loading 8 ints ending `batch` bytes before
  // mask_table[8] yields -1 in exactly the first batch/sizeof(float) lanes.
  static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};

  const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f);
  // Adding then subtracting this bias rounds x*log2e to the nearest integer.
  const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f);
  const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f);
  const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f);
  const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f);
  const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f);
  const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f);
  const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f);
  const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f);
  const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f);

  XNN_FORCE_REALIZATION(vlog2e);
  XNN_FORCE_REALIZATION(vmagic_bias);
  XNN_FORCE_REALIZATION(vminus_ln2_hi);
  XNN_FORCE_REALIZATION(vminus_ln2_lo);
  XNN_FORCE_REALIZATION(vc5);
  XNN_FORCE_REALIZATION(vc4);
  XNN_FORCE_REALIZATION(vc3);
  XNN_FORCE_REALIZATION(vc2);
  XNN_FORCE_REALIZATION(vc1);
  XNN_FORCE_REALIZATION(vdenorm_cutoff);

  const __m256 vi_max = _mm256_broadcast_ss(max);

  __m256 vacc0 = _mm256_setzero_ps();
  __m256 vacc1 = _mm256_setzero_ps();
  __m256 vacc2 = _mm256_setzero_ps();
  __m256 vacc3 = _mm256_setzero_ps();
  for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) {
    // Load 32 (4x8) inputs at a time.
    const __m256 vi0 = _mm256_loadu_ps(input);
    const __m256 vi1 = _mm256_loadu_ps(input + 8);
    const __m256 vi2 = _mm256_loadu_ps(input + 16);
    const __m256 vi3 = _mm256_loadu_ps(input + 24);
    input += 32;

    // Subtract maximum input x := i - i_max. This implies x <= 0.
    const __m256 vx0 = _mm256_sub_ps(vi0, vi_max);
    const __m256 vx1 = _mm256_sub_ps(vi1, vi_max);
    const __m256 vx2 = _mm256_sub_ps(vi2, vi_max);
    const __m256 vx3 = _mm256_sub_ps(vi3, vi_max);

    // Compute reduced argument n := round(x / log(2)).
    __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias);
    __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias);
    __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias);
    __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias);

    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
    const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23));
    const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23));
    const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23));
    const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23));

    // Subtract the large number back to get final n := round(x / log(2)).
    vn0 = _mm256_sub_ps(vn0, vmagic_bias);
    vn1 = _mm256_sub_ps(vn1, vmagic_bias);
    vn2 = _mm256_sub_ps(vn2, vmagic_bias);
    vn3 = _mm256_sub_ps(vn3, vmagic_bias);

    // Compute reduced argument t := x - n * log(2).
    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
    __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0);
    __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1);
    __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2);
    __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3);

    vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0);
    vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1);
    vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2);
    vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3);

    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2] (Horner's scheme).
    __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4);
    __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4);
    __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4);
    __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4);

    vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3);
    vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3);
    vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3);
    vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3);

    vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2);
    vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2);
    vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2);
    vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2);

    vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1);
    vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1);
    vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1);
    vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1);

    // Reconstruct the final f value:
    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
    //     = s + (t * s) * p
    vt0 = _mm256_mul_ps(vt0, vs0);
    vt1 = _mm256_mul_ps(vt1, vs1);
    vt2 = _mm256_mul_ps(vt2, vs2);
    vt3 = _mm256_mul_ps(vt3, vs3);

    __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0);
    __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1);
    __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2);
    __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3);

    // For inputs below zero cutoff, replace output with +0.0f.
    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
    vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0);
    vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1);
    vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2);
    vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3);

    // Store 32 (4x8) outputs at a time.
    _mm256_storeu_ps(output, vf0);
    _mm256_storeu_ps(output + 8, vf1);
    _mm256_storeu_ps(output + 16, vf2);
    _mm256_storeu_ps(output + 24, vf3);
    output += 32;

    // Accumulate computed exponents, one register per accumulator.
    vacc0 = _mm256_add_ps(vacc0, vf0);
    vacc1 = _mm256_add_ps(vacc1, vf1);
    vacc2 = _mm256_add_ps(vacc2, vf2);
    vacc3 = _mm256_add_ps(vacc3, vf3);
  }
  // Add up all accumulators to vacc0 (pairwise reduction tree).
  vacc0 = _mm256_add_ps(vacc0, vacc1);
  vacc2 = _mm256_add_ps(vacc2, vacc3);
  vacc0 = _mm256_add_ps(vacc0, vacc2);

  __m256 vacc = vacc0;
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
    // Load 8 inputs at a time.
    const __m256 vi = _mm256_loadu_ps(input);
    input += 8;

    // Subtract maximum input x := i - i_max. This implies x <= 0.
    const __m256 vx = _mm256_sub_ps(vi, vi_max);

    // Compute reduced argument n := round(x / log(2)).
    __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias);

    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
    const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));

    // Subtract the large number back to get final n := round(x / log(2)).
    vn = _mm256_sub_ps(vn, vmagic_bias);

    // Compute reduced argument t := x - n * log(2).
    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
    __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx);
    vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt);

    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
    __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4);
    vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3);
    vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2);
    vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1);

    // Reconstruct the final f value:
    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
    //     = s + (t * s) * p
    vt = _mm256_mul_ps(vt, vs);
    __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs);

    // For inputs below zero cutoff, replace output with +0.0f.
    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
    vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf);

    // Store 8 outputs at a time.
    _mm256_storeu_ps(output, vf);
    output += 8;

    // Accumulate computed exponents.
    vacc = _mm256_add_ps(vacc, vf);
  }
  if (batch != 0) {
    // Remainder: 1-7 elements, handled with a masked load and piecewise stores.
    assert(batch >= 1 * sizeof(float));
    assert(batch <= 7 * sizeof(float));
    // -1 in the first batch/sizeof(float) lanes, 0 elsewhere (see mask_table).
    const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch));

    const __m256 vi = _mm256_maskload_ps(input, vmask);

    // Subtract maximum input x := i - i_max. This implies x <= 0.
    const __m256 vx = _mm256_sub_ps(vi, vi_max);

    // Compute reduced argument n := round(x / log(2)).
    __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias);

    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
    const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));

    // Subtract the large number back to get final n := round(x / log(2)).
    vn = _mm256_sub_ps(vn, vmagic_bias);

    // Compute reduced argument t := x - n * log(2).
    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
    __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx);
    vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt);

    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
    __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4);
    vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3);
    vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2);
    vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1);

    // Reconstruct the final f value:
    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
    //     = s + (t * s) * p
    vt = _mm256_mul_ps(vt, vs);
    __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs);

    // For inputs below zero cutoff, replace output with +0.0f.
    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
    vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf);

    // Store the remaining 1-7 outputs: 4, then 2, then 1, per the bits of batch.
    __m128 vf_lo = _mm256_castps256_ps128(vf);
    if (batch & (4 * sizeof(float))) {
      _mm_storeu_ps(output, vf_lo);
      vf_lo = _mm256_extractf128_ps(vf, 1);
      output += 4;
    }
    if (batch & (2 * sizeof(float))) {
      _mm_storel_pi((__m64*) output, vf_lo);
      vf_lo = _mm_movehl_ps(vf_lo, vf_lo);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      _mm_store_ss(output, vf_lo);
    }

    // Zero the inactive lanes before accumulating so they don't affect the sum.
    vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask)));
  }
  // Horizontal reduction of the 8-lane accumulator to a scalar: 256->128->64->32.
  __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1));
  vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo));
  vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo));
  _mm_store_ss(sum, vacc_lo);
}
#include <assert.h>

#include <immintrin.h>

#include "xnnpack/raddstoreexpminusmax.h"


// Computes output[i] = exp(input[i] - *max) for all elements and stores the
// scalar sum of the outputs to *sum (one-pass softmax numerator + denominator).
// AVX2 implementation: Cody-Waite two-constant (rr2) range reduction, degree-5
// polynomial (p5), 32-element main tile, single sum accumulator.
//
// batch  - number of input BYTES; non-zero multiple of sizeof(float).
// input  - input vector x.
// max    - pointer to the precomputed maximum of x (for numerical stability).
// output - receives exp(x[i] - *max) per element.
// sum    - receives the sum of all stored outputs.
// params - unused here: all constants are inlined below.
void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32(
    size_t batch,
    const float* input,
    const float* max,
    float* output,
    float* sum,
    const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input != NULL);
  assert(max != NULL);
  assert(output != NULL);
  assert(sum != NULL);

  // Sliding mask window: loading 8 ints ending `batch` bytes before
  // mask_table[8] yields -1 in exactly the first batch/sizeof(float) lanes.
  static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};

  const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f);
  // Adding then subtracting this bias rounds x*log2e to the nearest integer.
  const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f);
  const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f);
  const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f);
  const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f);
  const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f);
  const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f);
  const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f);
  const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f);
  const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f);

  XNN_FORCE_REALIZATION(vlog2e);
  XNN_FORCE_REALIZATION(vmagic_bias);
  XNN_FORCE_REALIZATION(vminus_ln2_hi);
  XNN_FORCE_REALIZATION(vminus_ln2_lo);
  XNN_FORCE_REALIZATION(vc5);
  XNN_FORCE_REALIZATION(vc4);
  XNN_FORCE_REALIZATION(vc3);
  XNN_FORCE_REALIZATION(vc2);
  XNN_FORCE_REALIZATION(vc1);
  XNN_FORCE_REALIZATION(vdenorm_cutoff);

  const __m256 vi_max = _mm256_broadcast_ss(max);

  __m256 vacc0 = _mm256_setzero_ps();
  for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) {
    // Load 32 (4x8) inputs at a time.
    const __m256 vi0 = _mm256_loadu_ps(input);
    const __m256 vi1 = _mm256_loadu_ps(input + 8);
    const __m256 vi2 = _mm256_loadu_ps(input + 16);
    const __m256 vi3 = _mm256_loadu_ps(input + 24);
    input += 32;

    // Subtract maximum input x := i - i_max. This implies x <= 0.
    const __m256 vx0 = _mm256_sub_ps(vi0, vi_max);
    const __m256 vx1 = _mm256_sub_ps(vi1, vi_max);
    const __m256 vx2 = _mm256_sub_ps(vi2, vi_max);
    const __m256 vx3 = _mm256_sub_ps(vi3, vi_max);

    // Compute reduced argument n := round(x / log(2)).
    __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias);
    __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias);
    __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias);
    __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias);

    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
    const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23));
    const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23));
    const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23));
    const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23));

    // Subtract the large number back to get final n := round(x / log(2)).
    vn0 = _mm256_sub_ps(vn0, vmagic_bias);
    vn1 = _mm256_sub_ps(vn1, vmagic_bias);
    vn2 = _mm256_sub_ps(vn2, vmagic_bias);
    vn3 = _mm256_sub_ps(vn3, vmagic_bias);

    // Compute reduced argument t := x - n * log(2).
    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
    __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0);
    __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1);
    __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2);
    __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3);

    vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0);
    vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1);
    vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2);
    vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3);

    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2] (Horner's scheme).
    __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4);
    __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4);
    __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4);
    __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4);

    vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3);
    vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3);
    vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3);
    vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3);

    vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2);
    vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2);
    vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2);
    vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2);

    vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1);
    vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1);
    vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1);
    vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1);

    // Reconstruct the final f value:
    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
    //     = s + (t * s) * p
    vt0 = _mm256_mul_ps(vt0, vs0);
    vt1 = _mm256_mul_ps(vt1, vs1);
    vt2 = _mm256_mul_ps(vt2, vs2);
    vt3 = _mm256_mul_ps(vt3, vs3);

    __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0);
    __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1);
    __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2);
    __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3);

    // For inputs below zero cutoff, replace output with +0.0f.
    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
    vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0);
    vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1);
    vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2);
    vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3);

    // Store 32 (4x8) outputs at a time.
    _mm256_storeu_ps(output, vf0);
    _mm256_storeu_ps(output + 8, vf1);
    _mm256_storeu_ps(output + 16, vf2);
    _mm256_storeu_ps(output + 24, vf3);
    output += 32;

    // Accumulate computed exponents into the single accumulator.
    vacc0 = _mm256_add_ps(vacc0, vf0);
    vacc0 = _mm256_add_ps(vacc0, vf1);
    vacc0 = _mm256_add_ps(vacc0, vf2);
    vacc0 = _mm256_add_ps(vacc0, vf3);
  }

  __m256 vacc = vacc0;
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
    // Load 8 inputs at a time.
    const __m256 vi = _mm256_loadu_ps(input);
    input += 8;

    // Subtract maximum input x := i - i_max. This implies x <= 0.
    const __m256 vx = _mm256_sub_ps(vi, vi_max);

    // Compute reduced argument n := round(x / log(2)).
    __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias);

    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
    const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));

    // Subtract the large number back to get final n := round(x / log(2)).
    vn = _mm256_sub_ps(vn, vmagic_bias);

    // Compute reduced argument t := x - n * log(2).
    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
    __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx);
    vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt);

    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
    __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4);
    vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3);
    vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2);
    vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1);

    // Reconstruct the final f value:
    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
    //     = s + (t * s) * p
    vt = _mm256_mul_ps(vt, vs);
    __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs);

    // For inputs below zero cutoff, replace output with +0.0f.
    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
    vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf);

    // Store 8 outputs at a time.
    _mm256_storeu_ps(output, vf);
    output += 8;

    // Accumulate computed exponents.
    vacc = _mm256_add_ps(vacc, vf);
  }
  if (batch != 0) {
    // Remainder: 1-7 elements, handled with a masked load and piecewise stores.
    assert(batch >= 1 * sizeof(float));
    assert(batch <= 7 * sizeof(float));
    // -1 in the first batch/sizeof(float) lanes, 0 elsewhere (see mask_table).
    const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch));

    const __m256 vi = _mm256_maskload_ps(input, vmask);

    // Subtract maximum input x := i - i_max. This implies x <= 0.
    const __m256 vx = _mm256_sub_ps(vi, vi_max);

    // Compute reduced argument n := round(x / log(2)).
    __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias);

    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
    const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));

    // Subtract the large number back to get final n := round(x / log(2)).
    vn = _mm256_sub_ps(vn, vmagic_bias);

    // Compute reduced argument t := x - n * log(2).
    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
    __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx);
    vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt);

    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
    __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4);
    vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3);
    vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2);
    vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1);

    // Reconstruct the final f value:
    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
    //     = s + (t * s) * p
    vt = _mm256_mul_ps(vt, vs);
    __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs);

    // For inputs below zero cutoff, replace output with +0.0f.
    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
    vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf);

    // Store the remaining 1-7 outputs: 4, then 2, then 1, per the bits of batch.
    __m128 vf_lo = _mm256_castps256_ps128(vf);
    if (batch & (4 * sizeof(float))) {
      _mm_storeu_ps(output, vf_lo);
      vf_lo = _mm256_extractf128_ps(vf, 1);
      output += 4;
    }
    if (batch & (2 * sizeof(float))) {
      _mm_storel_pi((__m64*) output, vf_lo);
      vf_lo = _mm_movehl_ps(vf_lo, vf_lo);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      _mm_store_ss(output, vf_lo);
    }

    // Zero the inactive lanes before accumulating so they don't affect the sum.
    vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask)));
  }
  // Horizontal reduction of the 8-lane accumulator to a scalar: 256->128->64->32.
  __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1));
  vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo));
  vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo));
  _mm_store_ss(sum, vacc_lo);
}
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include "xnnpack/raddstoreexpminusmax.h"
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc2(
+    size_t batch,
+    const float* input,
+    const float* max,
+    float* output,
+    float* sum,
+    const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(max != NULL);
+  assert(output != NULL);
+  assert(sum != NULL);
+
+  static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f);
+  const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f);
+  const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f);
+  const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f);
+  const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f);
+  const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f);
+  const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f);
+  const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f);
+  const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f);
+  const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f);
+
+  XNN_FORCE_REALIZATION(vlog2e);
+  XNN_FORCE_REALIZATION(vmagic_bias);
+  XNN_FORCE_REALIZATION(vminus_ln2_hi);
+  XNN_FORCE_REALIZATION(vminus_ln2_lo);
+  XNN_FORCE_REALIZATION(vc5);
+  XNN_FORCE_REALIZATION(vc4);
+  XNN_FORCE_REALIZATION(vc3);
+  XNN_FORCE_REALIZATION(vc2);
+  XNN_FORCE_REALIZATION(vc1);
+  XNN_FORCE_REALIZATION(vdenorm_cutoff);
+
+  const __m256 vi_max = _mm256_broadcast_ss(max);
+
+  __m256 vacc0 = _mm256_setzero_ps();
+  __m256 vacc1 = _mm256_setzero_ps();
+  for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) {
+    // Load 64 (8x4) inputs at a time.
+ const __m256 vi0 = _mm256_loadu_ps(input); + const __m256 vi1 = _mm256_loadu_ps(input + 8); + const __m256 vi2 = _mm256_loadu_ps(input + 16); + const __m256 vi3 = _mm256_loadu_ps(input + 24); + const __m256 vi4 = _mm256_loadu_ps(input + 32); + const __m256 vi5 = _mm256_loadu_ps(input + 40); + const __m256 vi6 = _mm256_loadu_ps(input + 48); + const __m256 vi7 = _mm256_loadu_ps(input + 56); + input += 64; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); + const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); + const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); + const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); + const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); + const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); + const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); + const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); + __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); + __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); + __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); + __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); + __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); + __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); + __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
+ const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); + const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); + const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); + const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); + const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); + const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); + const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); + const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm256_sub_ps(vn0, vmagic_bias); + vn1 = _mm256_sub_ps(vn1, vmagic_bias); + vn2 = _mm256_sub_ps(vn2, vmagic_bias); + vn3 = _mm256_sub_ps(vn3, vmagic_bias); + vn4 = _mm256_sub_ps(vn4, vmagic_bias); + vn5 = _mm256_sub_ps(vn5, vmagic_bias); + vn6 = _mm256_sub_ps(vn6, vmagic_bias); + vn7 = _mm256_sub_ps(vn7, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); + __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); + __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); + __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); + __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); + __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); + __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); + __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); + + vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); + vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); + vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); + vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); + vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); + __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); + __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); + __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); + __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); + __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); + __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); + __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm256_mul_ps(vt0, vs0); + vt1 = 
_mm256_mul_ps(vt1, vs1); + vt2 = _mm256_mul_ps(vt2, vs2); + vt3 = _mm256_mul_ps(vt3, vs3); + vt4 = _mm256_mul_ps(vt4, vs4); + vt5 = _mm256_mul_ps(vt5, vs5); + vt6 = _mm256_mul_ps(vt6, vs6); + vt7 = _mm256_mul_ps(vt7, vs7); + + __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); + __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); + __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); + __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); + __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); + __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); + __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); + __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); + vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); + vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); + vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); + vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); + vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); + vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); + vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); + + // Store 64 (8x4) outputs at a time. + _mm256_storeu_ps(output, vf0); + _mm256_storeu_ps(output + 8, vf1); + _mm256_storeu_ps(output + 16, vf2); + _mm256_storeu_ps(output + 24, vf3); + _mm256_storeu_ps(output + 32, vf4); + _mm256_storeu_ps(output + 40, vf5); + _mm256_storeu_ps(output + 48, vf6); + _mm256_storeu_ps(output + 56, vf7); + output += 64; + + // Accumulate computed exponents. 
+ vacc0 = _mm256_add_ps(vacc0, vf0); + vacc1 = _mm256_add_ps(vacc1, vf1); + vacc0 = _mm256_add_ps(vacc0, vf2); + vacc1 = _mm256_add_ps(vacc1, vf3); + vacc0 = _mm256_add_ps(vacc0, vf4); + vacc1 = _mm256_add_ps(vacc1, vf5); + vacc0 = _mm256_add_ps(vacc0, vf6); + vacc1 = _mm256_add_ps(vacc1, vf7); + } + // Add up all accumulators to vacc0 + vacc0 = _mm256_add_ps(vacc0, vacc1); + + __m256 vacc = vacc0; + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + // Load 8 inputs at a time. + const __m256 vi = _mm256_loadu_ps(input); + input += 8; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + // Store 8 outputs at a time. + _mm256_storeu_ps(output, vf); + output += 8; + + // Accumulate computed exponents. + vacc = _mm256_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 7 * sizeof(float)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); + + const __m256 vi = _mm256_maskload_ps(input, vmask); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + __m128 vf_lo = _mm256_castps256_ps128(vf); + if (batch & (4 * sizeof(float))) { + _mm_storeu_ps(output, vf_lo); + vf_lo = _mm256_extractf128_ps(vf, 1); + output += 4; + } + if (batch & (2 * sizeof(float))) { + _mm_storel_pi((__m64*) output, vf_lo); + vf_lo = _mm_movehl_ps(vf_lo, vf_lo); + output += 2; + } + if (batch & (1 * sizeof(float))) { + _mm_store_ss(output, vf_lo); + } + + vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); + } + __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); + vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); + vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); + _mm_store_ss(sum, vacc_lo); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc4.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc4.c new file mode 100644 index 00000000000..fd50ec677c8 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc4.c @@ -0,0 +1,342 
@@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include "xnnpack/raddstoreexpminusmax.h"
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc4(
+    size_t batch,
+    const float* input,
+    const float* max,
+    float* output,
+    float* sum,
+    const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(max != NULL);
+  assert(output != NULL);
+  assert(sum != NULL);
+
+  static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f);
+  const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f);
+  const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f);
+  const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f);
+  const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f);
+  const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f);
+  const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f);
+  const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f);
+  const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f);
+  const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f);
+
+  XNN_FORCE_REALIZATION(vlog2e);
+  XNN_FORCE_REALIZATION(vmagic_bias);
+  XNN_FORCE_REALIZATION(vminus_ln2_hi);
+  XNN_FORCE_REALIZATION(vminus_ln2_lo);
+  XNN_FORCE_REALIZATION(vc5);
+  XNN_FORCE_REALIZATION(vc4);
+  XNN_FORCE_REALIZATION(vc3);
+  XNN_FORCE_REALIZATION(vc2);
+  XNN_FORCE_REALIZATION(vc1);
+  XNN_FORCE_REALIZATION(vdenorm_cutoff);
+
+  const __m256 vi_max = _mm256_broadcast_ss(max);
+
+  __m256 vacc0 = _mm256_setzero_ps();
+  __m256 vacc1 = _mm256_setzero_ps();
+  __m256 vacc2 = _mm256_setzero_ps();
+  __m256 vacc3 = _mm256_setzero_ps();
+  for
(; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { + // Load 64 (8x4) inputs at a time. + const __m256 vi0 = _mm256_loadu_ps(input); + const __m256 vi1 = _mm256_loadu_ps(input + 8); + const __m256 vi2 = _mm256_loadu_ps(input + 16); + const __m256 vi3 = _mm256_loadu_ps(input + 24); + const __m256 vi4 = _mm256_loadu_ps(input + 32); + const __m256 vi5 = _mm256_loadu_ps(input + 40); + const __m256 vi6 = _mm256_loadu_ps(input + 48); + const __m256 vi7 = _mm256_loadu_ps(input + 56); + input += 64; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); + const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); + const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); + const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); + const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); + const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); + const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); + const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); + __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); + __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); + __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); + __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); + __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); + __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); + __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
+ const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); + const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); + const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); + const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); + const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); + const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); + const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); + const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm256_sub_ps(vn0, vmagic_bias); + vn1 = _mm256_sub_ps(vn1, vmagic_bias); + vn2 = _mm256_sub_ps(vn2, vmagic_bias); + vn3 = _mm256_sub_ps(vn3, vmagic_bias); + vn4 = _mm256_sub_ps(vn4, vmagic_bias); + vn5 = _mm256_sub_ps(vn5, vmagic_bias); + vn6 = _mm256_sub_ps(vn6, vmagic_bias); + vn7 = _mm256_sub_ps(vn7, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); + __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); + __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); + __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); + __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); + __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); + __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); + __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); + + vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); + vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); + vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); + vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); + vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); + __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); + __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); + __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); + __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); + __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); + __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); + __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm256_mul_ps(vt0, vs0); + vt1 = 
_mm256_mul_ps(vt1, vs1); + vt2 = _mm256_mul_ps(vt2, vs2); + vt3 = _mm256_mul_ps(vt3, vs3); + vt4 = _mm256_mul_ps(vt4, vs4); + vt5 = _mm256_mul_ps(vt5, vs5); + vt6 = _mm256_mul_ps(vt6, vs6); + vt7 = _mm256_mul_ps(vt7, vs7); + + __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); + __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); + __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); + __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); + __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); + __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); + __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); + __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); + vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); + vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); + vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); + vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); + vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); + vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); + vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); + + // Store 64 (8x4) outputs at a time. + _mm256_storeu_ps(output, vf0); + _mm256_storeu_ps(output + 8, vf1); + _mm256_storeu_ps(output + 16, vf2); + _mm256_storeu_ps(output + 24, vf3); + _mm256_storeu_ps(output + 32, vf4); + _mm256_storeu_ps(output + 40, vf5); + _mm256_storeu_ps(output + 48, vf6); + _mm256_storeu_ps(output + 56, vf7); + output += 64; + + // Accumulate computed exponents. 
+ vacc0 = _mm256_add_ps(vacc0, vf0); + vacc1 = _mm256_add_ps(vacc1, vf1); + vacc2 = _mm256_add_ps(vacc2, vf2); + vacc3 = _mm256_add_ps(vacc3, vf3); + vacc0 = _mm256_add_ps(vacc0, vf4); + vacc1 = _mm256_add_ps(vacc1, vf5); + vacc2 = _mm256_add_ps(vacc2, vf6); + vacc3 = _mm256_add_ps(vacc3, vf7); + } + // Add up all accumulators to vacc0 + vacc0 = _mm256_add_ps(vacc0, vacc1); + vacc2 = _mm256_add_ps(vacc2, vacc3); + vacc0 = _mm256_add_ps(vacc0, vacc2); + + __m256 vacc = vacc0; + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + // Load 8 inputs at a time. + const __m256 vi = _mm256_loadu_ps(input); + input += 8; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + // Store 8 outputs at a time. + _mm256_storeu_ps(output, vf); + output += 8; + + // Accumulate computed exponents. + vacc = _mm256_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 7 * sizeof(float)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); + + const __m256 vi = _mm256_maskload_ps(input, vmask); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + __m128 vf_lo = _mm256_castps256_ps128(vf); + if (batch & (4 * sizeof(float))) { + _mm_storeu_ps(output, vf_lo); + vf_lo = _mm256_extractf128_ps(vf, 1); + output += 4; + } + if (batch & (2 * sizeof(float))) { + _mm_storel_pi((__m64*) output, vf_lo); + vf_lo = _mm_movehl_ps(vf_lo, vf_lo); + output += 2; + } + if (batch & (1 * sizeof(float))) { + _mm_store_ss(output, vf_lo); + } + + vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); + } + __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); + vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); + vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); + _mm_store_ss(sum, vacc_lo); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64.c new file mode 100644 index 00000000000..d79849dc12a --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64.c @@ -0,0 +1,335 @@ +// 
Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; + + const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); + const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); + const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f); + const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f); + const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); + const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); + const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); + const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); + const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); + const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m256 vi_max = _mm256_broadcast_ss(max); + + __m256 vacc0 = _mm256_setzero_ps(); + for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { + // Load 64 (8x4) inputs at a time. 
+ const __m256 vi0 = _mm256_loadu_ps(input); + const __m256 vi1 = _mm256_loadu_ps(input + 8); + const __m256 vi2 = _mm256_loadu_ps(input + 16); + const __m256 vi3 = _mm256_loadu_ps(input + 24); + const __m256 vi4 = _mm256_loadu_ps(input + 32); + const __m256 vi5 = _mm256_loadu_ps(input + 40); + const __m256 vi6 = _mm256_loadu_ps(input + 48); + const __m256 vi7 = _mm256_loadu_ps(input + 56); + input += 64; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); + const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); + const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); + const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); + const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); + const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); + const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); + const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); + __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); + __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); + __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); + __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); + __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); + __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); + __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
+ const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); + const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); + const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); + const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); + const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); + const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); + const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); + const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm256_sub_ps(vn0, vmagic_bias); + vn1 = _mm256_sub_ps(vn1, vmagic_bias); + vn2 = _mm256_sub_ps(vn2, vmagic_bias); + vn3 = _mm256_sub_ps(vn3, vmagic_bias); + vn4 = _mm256_sub_ps(vn4, vmagic_bias); + vn5 = _mm256_sub_ps(vn5, vmagic_bias); + vn6 = _mm256_sub_ps(vn6, vmagic_bias); + vn7 = _mm256_sub_ps(vn7, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); + __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); + __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); + __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); + __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); + __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); + __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); + __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); + + vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); + vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); + vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); + vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); + vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); + __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); + __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); + __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); + __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); + __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); + __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); + __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm256_mul_ps(vt0, vs0); + vt1 = 
_mm256_mul_ps(vt1, vs1); + vt2 = _mm256_mul_ps(vt2, vs2); + vt3 = _mm256_mul_ps(vt3, vs3); + vt4 = _mm256_mul_ps(vt4, vs4); + vt5 = _mm256_mul_ps(vt5, vs5); + vt6 = _mm256_mul_ps(vt6, vs6); + vt7 = _mm256_mul_ps(vt7, vs7); + + __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); + __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); + __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); + __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); + __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); + __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); + __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); + __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); + vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); + vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); + vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); + vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); + vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); + vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); + vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); + + // Store 64 (8x4) outputs at a time. + _mm256_storeu_ps(output, vf0); + _mm256_storeu_ps(output + 8, vf1); + _mm256_storeu_ps(output + 16, vf2); + _mm256_storeu_ps(output + 24, vf3); + _mm256_storeu_ps(output + 32, vf4); + _mm256_storeu_ps(output + 40, vf5); + _mm256_storeu_ps(output + 48, vf6); + _mm256_storeu_ps(output + 56, vf7); + output += 64; + + // Accumulate computed exponents. 
+ vacc0 = _mm256_add_ps(vacc0, vf0); + vacc0 = _mm256_add_ps(vacc0, vf1); + vacc0 = _mm256_add_ps(vacc0, vf2); + vacc0 = _mm256_add_ps(vacc0, vf3); + vacc0 = _mm256_add_ps(vacc0, vf4); + vacc0 = _mm256_add_ps(vacc0, vf5); + vacc0 = _mm256_add_ps(vacc0, vf6); + vacc0 = _mm256_add_ps(vacc0, vf7); + } + + __m256 vacc = vacc0; + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + // Load 8 inputs at a time. + const __m256 vi = _mm256_loadu_ps(input); + input += 8; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + // Store 8 outputs at a time. + _mm256_storeu_ps(output, vf); + output += 8; + + // Accumulate computed exponents. + vacc = _mm256_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 7 * sizeof(float)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); + + const __m256 vi = _mm256_maskload_ps(input, vmask); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + __m128 vf_lo = _mm256_castps256_ps128(vf); + if (batch & (4 * sizeof(float))) { + _mm_storeu_ps(output, vf_lo); + vf_lo = _mm256_extractf128_ps(vf, 1); + output += 4; + } + if (batch & (2 * sizeof(float))) { + _mm_storel_pi((__m64*) output, vf_lo); + vf_lo = _mm_movehl_ps(vf_lo, vf_lo); + output += 2; + } + if (batch & (1 * sizeof(float))) { + _mm_store_ss(output, vf_lo); + } + + vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); + } + __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); + vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); + vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); + _mm_store_ss(sum, vacc_lo); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72-acc3.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72-acc3.c new file mode 100644 index 00000000000..83632b4397b --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72-acc3.c @@ -0,0 +1,356 
@@ +// Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72_acc3( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; + + const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); + const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); + const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f); + const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f); + const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); + const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); + const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); + const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); + const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); + const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m256 vi_max = _mm256_broadcast_ss(max); + + __m256 vacc0 = _mm256_setzero_ps(); + __m256 vacc1 = _mm256_setzero_ps(); + __m256 vacc2 = _mm256_setzero_ps(); + for (; batch >= 72 * sizeof(float); batch 
-= 72 * sizeof(float)) { + // Load 72 (9x4) inputs at a time. + const __m256 vi0 = _mm256_loadu_ps(input); + const __m256 vi1 = _mm256_loadu_ps(input + 8); + const __m256 vi2 = _mm256_loadu_ps(input + 16); + const __m256 vi3 = _mm256_loadu_ps(input + 24); + const __m256 vi4 = _mm256_loadu_ps(input + 32); + const __m256 vi5 = _mm256_loadu_ps(input + 40); + const __m256 vi6 = _mm256_loadu_ps(input + 48); + const __m256 vi7 = _mm256_loadu_ps(input + 56); + const __m256 vi8 = _mm256_loadu_ps(input + 64); + input += 72; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); + const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); + const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); + const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); + const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); + const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); + const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); + const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); + const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); + __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); + __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); + __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); + __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); + __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); + __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); + __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); + __m256 vn8 = _mm256_add_ps(_mm256_mul_ps(vx8, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
+ const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); + const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); + const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); + const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); + const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); + const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); + const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); + const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); + const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm256_sub_ps(vn0, vmagic_bias); + vn1 = _mm256_sub_ps(vn1, vmagic_bias); + vn2 = _mm256_sub_ps(vn2, vmagic_bias); + vn3 = _mm256_sub_ps(vn3, vmagic_bias); + vn4 = _mm256_sub_ps(vn4, vmagic_bias); + vn5 = _mm256_sub_ps(vn5, vmagic_bias); + vn6 = _mm256_sub_ps(vn6, vmagic_bias); + vn7 = _mm256_sub_ps(vn7, vmagic_bias); + vn8 = _mm256_sub_ps(vn8, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); + __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); + __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); + __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); + __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); + __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); + __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); + __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); + __m256 vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_hi), vx8); + + vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); + vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); + vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); + vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); + vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); + vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_lo), vt8); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); + __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); + __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); + __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); + __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); + __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); + __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); + __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); + __m256 vp8 = _mm256_add_ps(_mm256_mul_ps(vc5, vt8), vc4); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc3); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc2); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc1); + + // Reconstruct the 
final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm256_mul_ps(vt0, vs0); + vt1 = _mm256_mul_ps(vt1, vs1); + vt2 = _mm256_mul_ps(vt2, vs2); + vt3 = _mm256_mul_ps(vt3, vs3); + vt4 = _mm256_mul_ps(vt4, vs4); + vt5 = _mm256_mul_ps(vt5, vs5); + vt6 = _mm256_mul_ps(vt6, vs6); + vt7 = _mm256_mul_ps(vt7, vs7); + vt8 = _mm256_mul_ps(vt8, vs8); + + __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); + __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); + __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); + __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); + __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); + __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); + __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); + __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); + __m256 vf8 = _mm256_add_ps(_mm256_mul_ps(vt8, vp8), vs8); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); + vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); + vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); + vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); + vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); + vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); + vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); + vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); + vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); + + // Store 72 (9x4) outputs at a time. 
+ _mm256_storeu_ps(output, vf0); + _mm256_storeu_ps(output + 8, vf1); + _mm256_storeu_ps(output + 16, vf2); + _mm256_storeu_ps(output + 24, vf3); + _mm256_storeu_ps(output + 32, vf4); + _mm256_storeu_ps(output + 40, vf5); + _mm256_storeu_ps(output + 48, vf6); + _mm256_storeu_ps(output + 56, vf7); + _mm256_storeu_ps(output + 64, vf8); + output += 72; + + // Accumulate computed exponents. + vacc0 = _mm256_add_ps(vacc0, vf0); + vacc1 = _mm256_add_ps(vacc1, vf1); + vacc2 = _mm256_add_ps(vacc2, vf2); + vacc0 = _mm256_add_ps(vacc0, vf3); + vacc1 = _mm256_add_ps(vacc1, vf4); + vacc2 = _mm256_add_ps(vacc2, vf5); + vacc0 = _mm256_add_ps(vacc0, vf6); + vacc1 = _mm256_add_ps(vacc1, vf7); + vacc2 = _mm256_add_ps(vacc2, vf8); + } + // Add up all accumulators to vacc0 + vacc0 = _mm256_add_ps(vacc0, vacc1); + vacc0 = _mm256_add_ps(vacc0, vacc2); + + __m256 vacc = vacc0; + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + // Load 8 inputs at a time. + const __m256 vi = _mm256_loadu_ps(input); + input += 8; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + // Store 8 outputs at a time. + _mm256_storeu_ps(output, vf); + output += 8; + + // Accumulate computed exponents. + vacc = _mm256_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 7 * sizeof(float)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); + + const __m256 vi = _mm256_maskload_ps(input, vmask); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). 
+ vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
+ vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + __m128 vf_lo = _mm256_castps256_ps128(vf); + if (batch & (4 * sizeof(float))) { + _mm_storeu_ps(output, vf_lo); + vf_lo = _mm256_extractf128_ps(vf, 1); + output += 4; + } + if (batch & (2 * sizeof(float))) { + _mm_storel_pi((__m64*) output, vf_lo); + vf_lo = _mm_movehl_ps(vf_lo, vf_lo); + output += 2; + } + if (batch & (1 * sizeof(float))) { + _mm_store_ss(output, vf_lo); + } + + vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); + } + __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); + vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); + vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); + _mm_store_ss(sum, vacc_lo); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72.c new file mode 100644 index 00000000000..640e31a234e --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72.c @@ -0,0 +1,351 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include "xnnpack/raddstoreexpminusmax.h"
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72(
+    size_t batch,
+    const float* input,
+    const float* max,
+    float* output,
+    float* sum,
+    const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(max != NULL);
+  assert(output != NULL);
+  assert(sum != NULL);
+
+  static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f);
+  const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f);
+  const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f);
+  const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f);
+  const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f);
+  const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f);
+  const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f);
+  const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f);
+  const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f);
+  const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f);
+
+  XNN_FORCE_REALIZATION(vlog2e);
+  XNN_FORCE_REALIZATION(vmagic_bias);
+  XNN_FORCE_REALIZATION(vminus_ln2_hi);
+  XNN_FORCE_REALIZATION(vminus_ln2_lo);
+  XNN_FORCE_REALIZATION(vc5);
+  XNN_FORCE_REALIZATION(vc4);
+  XNN_FORCE_REALIZATION(vc3);
+  XNN_FORCE_REALIZATION(vc2);
+  XNN_FORCE_REALIZATION(vc1);
+  XNN_FORCE_REALIZATION(vdenorm_cutoff);
+
+  const __m256 vi_max = _mm256_broadcast_ss(max);
+
+  __m256 vacc0 = _mm256_setzero_ps();
+  for (; batch >= 72 * sizeof(float); batch -= 72 * sizeof(float)) {
+    // Load 72 (9x4) inputs at a time.
+ const __m256 vi0 = _mm256_loadu_ps(input); + const __m256 vi1 = _mm256_loadu_ps(input + 8); + const __m256 vi2 = _mm256_loadu_ps(input + 16); + const __m256 vi3 = _mm256_loadu_ps(input + 24); + const __m256 vi4 = _mm256_loadu_ps(input + 32); + const __m256 vi5 = _mm256_loadu_ps(input + 40); + const __m256 vi6 = _mm256_loadu_ps(input + 48); + const __m256 vi7 = _mm256_loadu_ps(input + 56); + const __m256 vi8 = _mm256_loadu_ps(input + 64); + input += 72; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); + const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); + const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); + const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); + const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); + const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); + const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); + const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); + const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); + __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); + __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); + __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); + __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); + __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); + __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); + __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); + __m256 vn8 = _mm256_add_ps(_mm256_mul_ps(vx8, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
+ const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); + const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); + const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); + const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); + const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); + const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); + const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); + const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); + const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm256_sub_ps(vn0, vmagic_bias); + vn1 = _mm256_sub_ps(vn1, vmagic_bias); + vn2 = _mm256_sub_ps(vn2, vmagic_bias); + vn3 = _mm256_sub_ps(vn3, vmagic_bias); + vn4 = _mm256_sub_ps(vn4, vmagic_bias); + vn5 = _mm256_sub_ps(vn5, vmagic_bias); + vn6 = _mm256_sub_ps(vn6, vmagic_bias); + vn7 = _mm256_sub_ps(vn7, vmagic_bias); + vn8 = _mm256_sub_ps(vn8, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); + __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); + __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); + __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); + __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); + __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); + __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); + __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); + __m256 vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_hi), vx8); + + vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); + vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); + vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); + vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); + vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); + vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_lo), vt8); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); + __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); + __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); + __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); + __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); + __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); + __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); + __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); + __m256 vp8 = _mm256_add_ps(_mm256_mul_ps(vc5, vt8), vc4); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc3); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc2); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc1); + + // Reconstruct the 
final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm256_mul_ps(vt0, vs0); + vt1 = _mm256_mul_ps(vt1, vs1); + vt2 = _mm256_mul_ps(vt2, vs2); + vt3 = _mm256_mul_ps(vt3, vs3); + vt4 = _mm256_mul_ps(vt4, vs4); + vt5 = _mm256_mul_ps(vt5, vs5); + vt6 = _mm256_mul_ps(vt6, vs6); + vt7 = _mm256_mul_ps(vt7, vs7); + vt8 = _mm256_mul_ps(vt8, vs8); + + __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); + __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); + __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); + __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); + __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); + __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); + __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); + __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); + __m256 vf8 = _mm256_add_ps(_mm256_mul_ps(vt8, vp8), vs8); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); + vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); + vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); + vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); + vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); + vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); + vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); + vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); + vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); + + // Store 72 (9x4) outputs at a time. 
+ _mm256_storeu_ps(output, vf0); + _mm256_storeu_ps(output + 8, vf1); + _mm256_storeu_ps(output + 16, vf2); + _mm256_storeu_ps(output + 24, vf3); + _mm256_storeu_ps(output + 32, vf4); + _mm256_storeu_ps(output + 40, vf5); + _mm256_storeu_ps(output + 48, vf6); + _mm256_storeu_ps(output + 56, vf7); + _mm256_storeu_ps(output + 64, vf8); + output += 72; + + // Accumulate computed exponents. + vacc0 = _mm256_add_ps(vacc0, vf0); + vacc0 = _mm256_add_ps(vacc0, vf1); + vacc0 = _mm256_add_ps(vacc0, vf2); + vacc0 = _mm256_add_ps(vacc0, vf3); + vacc0 = _mm256_add_ps(vacc0, vf4); + vacc0 = _mm256_add_ps(vacc0, vf5); + vacc0 = _mm256_add_ps(vacc0, vf6); + vacc0 = _mm256_add_ps(vacc0, vf7); + vacc0 = _mm256_add_ps(vacc0, vf8); + } + + __m256 vacc = vacc0; + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + // Load 8 inputs at a time. + const __m256 vi = _mm256_loadu_ps(input); + input += 8; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + // Store 8 outputs at a time. + _mm256_storeu_ps(output, vf); + output += 8; + + // Accumulate computed exponents. + vacc = _mm256_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 7 * sizeof(float)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); + + const __m256 vi = _mm256_maskload_ps(input, vmask); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + __m128 vf_lo = _mm256_castps256_ps128(vf); + if (batch & (4 * sizeof(float))) { + _mm_storeu_ps(output, vf_lo); + vf_lo = _mm256_extractf128_ps(vf, 1); + output += 4; + } + if (batch & (2 * sizeof(float))) { + _mm_storel_pi((__m64*) output, vf_lo); + vf_lo = _mm_movehl_ps(vf_lo, vf_lo); + output += 2; + } + if (batch & (1 * sizeof(float))) { + _mm_store_ss(output, vf_lo); + } + + vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); + } + __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); + vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); + vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); + _mm_store_ss(sum, vacc_lo); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc2.c new file mode 100644 index 00000000000..c674de9489d --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc2.c @@ -0,0 +1,370 
@@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include "xnnpack/raddstoreexpminusmax.h"
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc2(
+    size_t batch,
+    const float* input,
+    const float* max,
+    float* output,
+    float* sum,
+    const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(max != NULL);
+  assert(output != NULL);
+  assert(sum != NULL);
+
+  static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f);
+  const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f);
+  const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f);
+  const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f);
+  const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f);
+  const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f);
+  const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f);
+  const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f);
+  const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f);
+  const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f);
+
+  XNN_FORCE_REALIZATION(vlog2e);
+  XNN_FORCE_REALIZATION(vmagic_bias);
+  XNN_FORCE_REALIZATION(vminus_ln2_hi);
+  XNN_FORCE_REALIZATION(vminus_ln2_lo);
+  XNN_FORCE_REALIZATION(vc5);
+  XNN_FORCE_REALIZATION(vc4);
+  XNN_FORCE_REALIZATION(vc3);
+  XNN_FORCE_REALIZATION(vc2);
+  XNN_FORCE_REALIZATION(vc1);
+  XNN_FORCE_REALIZATION(vdenorm_cutoff);
+
+  const __m256 vi_max = _mm256_broadcast_ss(max);
+
+  __m256 vacc0 = _mm256_setzero_ps();
+  __m256 vacc1 = _mm256_setzero_ps();
+  for (; batch >= 80 * sizeof(float); batch -= 80 * sizeof(float)) {
+    // Load 80
(10x4) inputs at a time. + const __m256 vi0 = _mm256_loadu_ps(input); + const __m256 vi1 = _mm256_loadu_ps(input + 8); + const __m256 vi2 = _mm256_loadu_ps(input + 16); + const __m256 vi3 = _mm256_loadu_ps(input + 24); + const __m256 vi4 = _mm256_loadu_ps(input + 32); + const __m256 vi5 = _mm256_loadu_ps(input + 40); + const __m256 vi6 = _mm256_loadu_ps(input + 48); + const __m256 vi7 = _mm256_loadu_ps(input + 56); + const __m256 vi8 = _mm256_loadu_ps(input + 64); + const __m256 vi9 = _mm256_loadu_ps(input + 72); + input += 80; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); + const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); + const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); + const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); + const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); + const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); + const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); + const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); + const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); + const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); + __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); + __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); + __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); + __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); + __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); + __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); + __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); + __m256 vn8 = _mm256_add_ps(_mm256_mul_ps(vx8, vlog2e), vmagic_bias); + __m256 vn9 = _mm256_add_ps(_mm256_mul_ps(vx9, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. 
+ // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); + const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); + const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); + const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); + const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); + const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); + const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); + const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); + const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); + const __m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm256_sub_ps(vn0, vmagic_bias); + vn1 = _mm256_sub_ps(vn1, vmagic_bias); + vn2 = _mm256_sub_ps(vn2, vmagic_bias); + vn3 = _mm256_sub_ps(vn3, vmagic_bias); + vn4 = _mm256_sub_ps(vn4, vmagic_bias); + vn5 = _mm256_sub_ps(vn5, vmagic_bias); + vn6 = _mm256_sub_ps(vn6, vmagic_bias); + vn7 = _mm256_sub_ps(vn7, vmagic_bias); + vn8 = _mm256_sub_ps(vn8, vmagic_bias); + vn9 = _mm256_sub_ps(vn9, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); + __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); + __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); + __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); + __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); + __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); + __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); + __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); + __m256 vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_hi), vx8); + __m256 vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_hi), vx9); + + vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); + vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); + vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); + vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); + vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); + vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_lo), vt8); + vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_lo), vt9); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); + __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); + __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); + __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); + __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); + __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); + __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); + __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); + __m256 vp8 = _mm256_add_ps(_mm256_mul_ps(vc5, vt8), vc4); + __m256 vp9 = _mm256_add_ps(_mm256_mul_ps(vc5, vt9), vc4); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc3); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc3); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc2); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc2); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); + vp6 = 
_mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc1); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm256_mul_ps(vt0, vs0); + vt1 = _mm256_mul_ps(vt1, vs1); + vt2 = _mm256_mul_ps(vt2, vs2); + vt3 = _mm256_mul_ps(vt3, vs3); + vt4 = _mm256_mul_ps(vt4, vs4); + vt5 = _mm256_mul_ps(vt5, vs5); + vt6 = _mm256_mul_ps(vt6, vs6); + vt7 = _mm256_mul_ps(vt7, vs7); + vt8 = _mm256_mul_ps(vt8, vs8); + vt9 = _mm256_mul_ps(vt9, vs9); + + __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); + __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); + __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); + __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); + __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); + __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); + __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); + __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); + __m256 vf8 = _mm256_add_ps(_mm256_mul_ps(vt8, vp8), vs8); + __m256 vf9 = _mm256_add_ps(_mm256_mul_ps(vt9, vp9), vs9); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
+ vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); + vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); + vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); + vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); + vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); + vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); + vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); + vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); + vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); + vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); + + // Store 80 (10x4) outputs at a time. + _mm256_storeu_ps(output, vf0); + _mm256_storeu_ps(output + 8, vf1); + _mm256_storeu_ps(output + 16, vf2); + _mm256_storeu_ps(output + 24, vf3); + _mm256_storeu_ps(output + 32, vf4); + _mm256_storeu_ps(output + 40, vf5); + _mm256_storeu_ps(output + 48, vf6); + _mm256_storeu_ps(output + 56, vf7); + _mm256_storeu_ps(output + 64, vf8); + _mm256_storeu_ps(output + 72, vf9); + output += 80; + + // Accumulate computed exponents. + vacc0 = _mm256_add_ps(vacc0, vf0); + vacc1 = _mm256_add_ps(vacc1, vf1); + vacc0 = _mm256_add_ps(vacc0, vf2); + vacc1 = _mm256_add_ps(vacc1, vf3); + vacc0 = _mm256_add_ps(vacc0, vf4); + vacc1 = _mm256_add_ps(vacc1, vf5); + vacc0 = _mm256_add_ps(vacc0, vf6); + vacc1 = _mm256_add_ps(vacc1, vf7); + vacc0 = _mm256_add_ps(vacc0, vf8); + vacc1 = _mm256_add_ps(vacc1, vf9); + } + // Add up all accumulators to vacc0 + vacc0 = _mm256_add_ps(vacc0, vacc1); + + __m256 vacc = vacc0; + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + // Load 8 inputs at a time. + const __m256 vi = _mm256_loadu_ps(input); + input += 8; + + // Subtract maximum input x := i - i_max. This implies x <= 0. 
+ const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + // Store 8 outputs at a time. + _mm256_storeu_ps(output, vf); + output += 8; + + // Accumulate computed exponents. 
+ vacc = _mm256_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 7 * sizeof(float)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); + + const __m256 vi = _mm256_maskload_ps(input, vmask); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
+ vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + __m128 vf_lo = _mm256_castps256_ps128(vf); + if (batch & (4 * sizeof(float))) { + _mm_storeu_ps(output, vf_lo); + vf_lo = _mm256_extractf128_ps(vf, 1); + output += 4; + } + if (batch & (2 * sizeof(float))) { + _mm_storel_pi((__m64*) output, vf_lo); + vf_lo = _mm_movehl_ps(vf_lo, vf_lo); + output += 2; + } + if (batch & (1 * sizeof(float))) { + _mm_store_ss(output, vf_lo); + } + + vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); + } + __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); + vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); + vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); + _mm_store_ss(sum, vacc_lo); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc5.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc5.c new file mode 100644 index 00000000000..afc39c1c9da --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc5.c @@ -0,0 +1,376 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include "xnnpack/raddstoreexpminusmax.h"
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc5(
+    size_t batch,
+    const float* input,
+    const float* max,
+    float* output,
+    float* sum,
+    const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(max != NULL);
+  assert(output != NULL);
+  assert(sum != NULL);
+
+  static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f);
+  const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f);
+  const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f);
+  const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f);
+  const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f);
+  const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f);
+  const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f);
+  const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f);
+  const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f);
+  const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f);
+
+  XNN_FORCE_REALIZATION(vlog2e);
+  XNN_FORCE_REALIZATION(vmagic_bias);
+  XNN_FORCE_REALIZATION(vminus_ln2_hi);
+  XNN_FORCE_REALIZATION(vminus_ln2_lo);
+  XNN_FORCE_REALIZATION(vc5);
+  XNN_FORCE_REALIZATION(vc4);
+  XNN_FORCE_REALIZATION(vc3);
+  XNN_FORCE_REALIZATION(vc2);
+  XNN_FORCE_REALIZATION(vc1);
+  XNN_FORCE_REALIZATION(vdenorm_cutoff);
+
+  const __m256 vi_max = _mm256_broadcast_ss(max);
+
+  __m256 vacc0 = _mm256_setzero_ps();
+  __m256 vacc1 = _mm256_setzero_ps();
+  __m256 vacc2 = _mm256_setzero_ps();
+  __m256 vacc3 = _mm256_setzero_ps();
+  __m256 vacc4 = _mm256_setzero_ps();
+  for (; batch >= 80 * sizeof(float); batch -= 80 * sizeof(float)) {
+    // Load 80 (10x4) inputs at a time.
+ const __m256 vi0 = _mm256_loadu_ps(input); + const __m256 vi1 = _mm256_loadu_ps(input + 8); + const __m256 vi2 = _mm256_loadu_ps(input + 16); + const __m256 vi3 = _mm256_loadu_ps(input + 24); + const __m256 vi4 = _mm256_loadu_ps(input + 32); + const __m256 vi5 = _mm256_loadu_ps(input + 40); + const __m256 vi6 = _mm256_loadu_ps(input + 48); + const __m256 vi7 = _mm256_loadu_ps(input + 56); + const __m256 vi8 = _mm256_loadu_ps(input + 64); + const __m256 vi9 = _mm256_loadu_ps(input + 72); + input += 80; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); + const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); + const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); + const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); + const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); + const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); + const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); + const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); + const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); + const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); + __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); + __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); + __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); + __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); + __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); + __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); + __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); + __m256 vn8 = _mm256_add_ps(_mm256_mul_ps(vx8, vlog2e), vmagic_bias); + __m256 vn9 = _mm256_add_ps(_mm256_mul_ps(vx9, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. 
+ // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); + const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); + const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); + const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); + const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); + const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); + const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); + const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); + const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); + const __m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm256_sub_ps(vn0, vmagic_bias); + vn1 = _mm256_sub_ps(vn1, vmagic_bias); + vn2 = _mm256_sub_ps(vn2, vmagic_bias); + vn3 = _mm256_sub_ps(vn3, vmagic_bias); + vn4 = _mm256_sub_ps(vn4, vmagic_bias); + vn5 = _mm256_sub_ps(vn5, vmagic_bias); + vn6 = _mm256_sub_ps(vn6, vmagic_bias); + vn7 = _mm256_sub_ps(vn7, vmagic_bias); + vn8 = _mm256_sub_ps(vn8, vmagic_bias); + vn9 = _mm256_sub_ps(vn9, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); + __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); + __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); + __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); + __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); + __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); + __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); + __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); + __m256 vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_hi), vx8); + __m256 vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_hi), vx9); + + vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); + vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); + vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); + vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); + vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); + vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_lo), vt8); + vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_lo), vt9); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); + __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); + __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); + __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); + __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); + __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); + __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); + __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); + __m256 vp8 = _mm256_add_ps(_mm256_mul_ps(vc5, vt8), vc4); + __m256 vp9 = _mm256_add_ps(_mm256_mul_ps(vc5, vt9), vc4); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc3); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc3); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc2); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc2); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); + vp6 = 
_mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc1); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm256_mul_ps(vt0, vs0); + vt1 = _mm256_mul_ps(vt1, vs1); + vt2 = _mm256_mul_ps(vt2, vs2); + vt3 = _mm256_mul_ps(vt3, vs3); + vt4 = _mm256_mul_ps(vt4, vs4); + vt5 = _mm256_mul_ps(vt5, vs5); + vt6 = _mm256_mul_ps(vt6, vs6); + vt7 = _mm256_mul_ps(vt7, vs7); + vt8 = _mm256_mul_ps(vt8, vs8); + vt9 = _mm256_mul_ps(vt9, vs9); + + __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); + __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); + __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); + __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); + __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); + __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); + __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); + __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); + __m256 vf8 = _mm256_add_ps(_mm256_mul_ps(vt8, vp8), vs8); + __m256 vf9 = _mm256_add_ps(_mm256_mul_ps(vt9, vp9), vs9); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
+ vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); + vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); + vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); + vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); + vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); + vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); + vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); + vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); + vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); + vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); + + // Store 80 (10x4) outputs at a time. + _mm256_storeu_ps(output, vf0); + _mm256_storeu_ps(output + 8, vf1); + _mm256_storeu_ps(output + 16, vf2); + _mm256_storeu_ps(output + 24, vf3); + _mm256_storeu_ps(output + 32, vf4); + _mm256_storeu_ps(output + 40, vf5); + _mm256_storeu_ps(output + 48, vf6); + _mm256_storeu_ps(output + 56, vf7); + _mm256_storeu_ps(output + 64, vf8); + _mm256_storeu_ps(output + 72, vf9); + output += 80; + + // Accumulate computed exponents. + vacc0 = _mm256_add_ps(vacc0, vf0); + vacc1 = _mm256_add_ps(vacc1, vf1); + vacc2 = _mm256_add_ps(vacc2, vf2); + vacc3 = _mm256_add_ps(vacc3, vf3); + vacc4 = _mm256_add_ps(vacc4, vf4); + vacc0 = _mm256_add_ps(vacc0, vf5); + vacc1 = _mm256_add_ps(vacc1, vf6); + vacc2 = _mm256_add_ps(vacc2, vf7); + vacc3 = _mm256_add_ps(vacc3, vf8); + vacc4 = _mm256_add_ps(vacc4, vf9); + } + // Add up all accumulators to vacc0 + vacc0 = _mm256_add_ps(vacc0, vacc1); + vacc2 = _mm256_add_ps(vacc2, vacc3); + vacc0 = _mm256_add_ps(vacc0, vacc2); + vacc0 = _mm256_add_ps(vacc0, vacc4); + + __m256 vacc = vacc0; + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + // Load 8 inputs at a time. 
+ const __m256 vi = _mm256_loadu_ps(input); + input += 8; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + // Store 8 outputs at a time. + _mm256_storeu_ps(output, vf); + output += 8; + + // Accumulate computed exponents. 
+ vacc = _mm256_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 7 * sizeof(float)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); + + const __m256 vi = _mm256_maskload_ps(input, vmask); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
+ vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + __m128 vf_lo = _mm256_castps256_ps128(vf); + if (batch & (4 * sizeof(float))) { + _mm_storeu_ps(output, vf_lo); + vf_lo = _mm256_extractf128_ps(vf, 1); + output += 4; + } + if (batch & (2 * sizeof(float))) { + _mm_storel_pi((__m64*) output, vf_lo); + vf_lo = _mm_movehl_ps(vf_lo, vf_lo); + output += 2; + } + if (batch & (1 * sizeof(float))) { + _mm_store_ss(output, vf_lo); + } + + vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); + } + __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); + vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); + vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); + _mm_store_ss(sum, vacc_lo); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80.c new file mode 100644 index 00000000000..8642c6df381 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80.c @@ -0,0 +1,367 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include "xnnpack/raddstoreexpminusmax.h"
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80(
+    size_t batch,
+    const float* input,
+    const float* max,
+    float* output,
+    float* sum,
+    const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(max != NULL);
+  assert(output != NULL);
+  assert(sum != NULL);
+
+  static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f);
+  const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f);
+  const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f);
+  const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f);
+  const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f);
+  const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f);
+  const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f);
+  const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f);
+  const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f);
+  const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f);
+
+  XNN_FORCE_REALIZATION(vlog2e);
+  XNN_FORCE_REALIZATION(vmagic_bias);
+  XNN_FORCE_REALIZATION(vminus_ln2_hi);
+  XNN_FORCE_REALIZATION(vminus_ln2_lo);
+  XNN_FORCE_REALIZATION(vc5);
+  XNN_FORCE_REALIZATION(vc4);
+  XNN_FORCE_REALIZATION(vc3);
+  XNN_FORCE_REALIZATION(vc2);
+  XNN_FORCE_REALIZATION(vc1);
+  XNN_FORCE_REALIZATION(vdenorm_cutoff);
+
+  const __m256 vi_max = _mm256_broadcast_ss(max);
+
+  __m256 vacc0 = _mm256_setzero_ps();
+  for (; batch >= 80 * sizeof(float); batch -= 80 * sizeof(float)) {
+    // Load 80 (10x4) inputs at a time.
+ const __m256 vi0 = _mm256_loadu_ps(input); + const __m256 vi1 = _mm256_loadu_ps(input + 8); + const __m256 vi2 = _mm256_loadu_ps(input + 16); + const __m256 vi3 = _mm256_loadu_ps(input + 24); + const __m256 vi4 = _mm256_loadu_ps(input + 32); + const __m256 vi5 = _mm256_loadu_ps(input + 40); + const __m256 vi6 = _mm256_loadu_ps(input + 48); + const __m256 vi7 = _mm256_loadu_ps(input + 56); + const __m256 vi8 = _mm256_loadu_ps(input + 64); + const __m256 vi9 = _mm256_loadu_ps(input + 72); + input += 80; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); + const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); + const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); + const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); + const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); + const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); + const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); + const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); + const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); + const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); + __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); + __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); + __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); + __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); + __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); + __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); + __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); + __m256 vn8 = _mm256_add_ps(_mm256_mul_ps(vx8, vlog2e), vmagic_bias); + __m256 vn9 = _mm256_add_ps(_mm256_mul_ps(vx9, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. 
+ // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); + const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); + const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); + const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); + const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); + const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); + const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); + const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); + const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); + const __m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm256_sub_ps(vn0, vmagic_bias); + vn1 = _mm256_sub_ps(vn1, vmagic_bias); + vn2 = _mm256_sub_ps(vn2, vmagic_bias); + vn3 = _mm256_sub_ps(vn3, vmagic_bias); + vn4 = _mm256_sub_ps(vn4, vmagic_bias); + vn5 = _mm256_sub_ps(vn5, vmagic_bias); + vn6 = _mm256_sub_ps(vn6, vmagic_bias); + vn7 = _mm256_sub_ps(vn7, vmagic_bias); + vn8 = _mm256_sub_ps(vn8, vmagic_bias); + vn9 = _mm256_sub_ps(vn9, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); + __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); + __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); + __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); + __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); + __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); + __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); + __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); + __m256 vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_hi), vx8); + __m256 vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_hi), vx9); + + vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); + vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); + vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); + vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); + vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); + vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_lo), vt8); + vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_lo), vt9); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); + __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); + __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); + __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); + __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); + __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); + __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); + __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); + __m256 vp8 = _mm256_add_ps(_mm256_mul_ps(vc5, vt8), vc4); + __m256 vp9 = _mm256_add_ps(_mm256_mul_ps(vc5, vt9), vc4); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc3); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc3); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc2); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc2); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); + vp6 = 
_mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc1); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm256_mul_ps(vt0, vs0); + vt1 = _mm256_mul_ps(vt1, vs1); + vt2 = _mm256_mul_ps(vt2, vs2); + vt3 = _mm256_mul_ps(vt3, vs3); + vt4 = _mm256_mul_ps(vt4, vs4); + vt5 = _mm256_mul_ps(vt5, vs5); + vt6 = _mm256_mul_ps(vt6, vs6); + vt7 = _mm256_mul_ps(vt7, vs7); + vt8 = _mm256_mul_ps(vt8, vs8); + vt9 = _mm256_mul_ps(vt9, vs9); + + __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); + __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); + __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); + __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); + __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); + __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); + __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); + __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); + __m256 vf8 = _mm256_add_ps(_mm256_mul_ps(vt8, vp8), vs8); + __m256 vf9 = _mm256_add_ps(_mm256_mul_ps(vt9, vp9), vs9); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
+ vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); + vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); + vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); + vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); + vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); + vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); + vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); + vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); + vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); + vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); + + // Store 80 (10x4) outputs at a time. + _mm256_storeu_ps(output, vf0); + _mm256_storeu_ps(output + 8, vf1); + _mm256_storeu_ps(output + 16, vf2); + _mm256_storeu_ps(output + 24, vf3); + _mm256_storeu_ps(output + 32, vf4); + _mm256_storeu_ps(output + 40, vf5); + _mm256_storeu_ps(output + 48, vf6); + _mm256_storeu_ps(output + 56, vf7); + _mm256_storeu_ps(output + 64, vf8); + _mm256_storeu_ps(output + 72, vf9); + output += 80; + + // Accumulate computed exponents. + vacc0 = _mm256_add_ps(vacc0, vf0); + vacc0 = _mm256_add_ps(vacc0, vf1); + vacc0 = _mm256_add_ps(vacc0, vf2); + vacc0 = _mm256_add_ps(vacc0, vf3); + vacc0 = _mm256_add_ps(vacc0, vf4); + vacc0 = _mm256_add_ps(vacc0, vf5); + vacc0 = _mm256_add_ps(vacc0, vf6); + vacc0 = _mm256_add_ps(vacc0, vf7); + vacc0 = _mm256_add_ps(vacc0, vf8); + vacc0 = _mm256_add_ps(vacc0, vf9); + } + + __m256 vacc = vacc0; + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + // Load 8 inputs at a time. + const __m256 vi = _mm256_loadu_ps(input); + input += 8; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). 
+ __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + // Store 8 outputs at a time. + _mm256_storeu_ps(output, vf); + output += 8; + + // Accumulate computed exponents. 
+ vacc = _mm256_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 7 * sizeof(float)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); + + const __m256 vi = _mm256_maskload_ps(input, vmask); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
+ vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + __m128 vf_lo = _mm256_castps256_ps128(vf); + if (batch & (4 * sizeof(float))) { + _mm_storeu_ps(output, vf_lo); + vf_lo = _mm256_extractf128_ps(vf, 1); + output += 4; + } + if (batch & (2 * sizeof(float))) { + _mm_storel_pi((__m64*) output, vf_lo); + vf_lo = _mm_movehl_ps(vf_lo, vf_lo); + output += 2; + } + if (batch & (1 * sizeof(float))) { + _mm_store_ss(output, vf_lo); + } + + vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); + } + __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); + vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); + vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); + _mm_store_ss(sum, vacc_lo); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc2.c new file mode 100644 index 00000000000..590e5f5ccbf --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc2.c @@ -0,0 +1,402 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include <assert.h> + +#include <immintrin.h> + +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc2( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; + + const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); + const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); + const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f); + const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f); + const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); + const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); + const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); + const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); + const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); + const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m256 vi_max = _mm256_broadcast_ss(max); + + __m256 vacc0 = _mm256_setzero_ps(); + __m256 vacc1 = _mm256_setzero_ps(); + for (; batch >= 96 * sizeof(float); batch -= 96 * sizeof(float)) { + // Load 96 (12x4) inputs at a time.
+ const __m256 vi0 = _mm256_loadu_ps(input); + const __m256 vi1 = _mm256_loadu_ps(input + 8); + const __m256 vi2 = _mm256_loadu_ps(input + 16); + const __m256 vi3 = _mm256_loadu_ps(input + 24); + const __m256 vi4 = _mm256_loadu_ps(input + 32); + const __m256 vi5 = _mm256_loadu_ps(input + 40); + const __m256 vi6 = _mm256_loadu_ps(input + 48); + const __m256 vi7 = _mm256_loadu_ps(input + 56); + const __m256 vi8 = _mm256_loadu_ps(input + 64); + const __m256 vi9 = _mm256_loadu_ps(input + 72); + const __m256 vi10 = _mm256_loadu_ps(input + 80); + const __m256 vi11 = _mm256_loadu_ps(input + 88); + input += 96; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); + const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); + const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); + const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); + const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); + const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); + const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); + const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); + const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); + const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); + const __m256 vx10 = _mm256_sub_ps(vi10, vi_max); + const __m256 vx11 = _mm256_sub_ps(vi11, vi_max); + + // Compute reduced argument batch := round(x / log(2)). 
+ __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); + __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); + __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); + __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); + __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); + __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); + __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); + __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); + __m256 vn8 = _mm256_add_ps(_mm256_mul_ps(vx8, vlog2e), vmagic_bias); + __m256 vn9 = _mm256_add_ps(_mm256_mul_ps(vx9, vlog2e), vmagic_bias); + __m256 vn10 = _mm256_add_ps(_mm256_mul_ps(vx10, vlog2e), vmagic_bias); + __m256 vn11 = _mm256_add_ps(_mm256_mul_ps(vx11, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
+ const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); + const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); + const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); + const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); + const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); + const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); + const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); + const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); + const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); + const __m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); + const __m256 vs10 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn10), 23)); + const __m256 vs11 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn11), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm256_sub_ps(vn0, vmagic_bias); + vn1 = _mm256_sub_ps(vn1, vmagic_bias); + vn2 = _mm256_sub_ps(vn2, vmagic_bias); + vn3 = _mm256_sub_ps(vn3, vmagic_bias); + vn4 = _mm256_sub_ps(vn4, vmagic_bias); + vn5 = _mm256_sub_ps(vn5, vmagic_bias); + vn6 = _mm256_sub_ps(vn6, vmagic_bias); + vn7 = _mm256_sub_ps(vn7, vmagic_bias); + vn8 = _mm256_sub_ps(vn8, vmagic_bias); + vn9 = _mm256_sub_ps(vn9, vmagic_bias); + vn10 = _mm256_sub_ps(vn10, vmagic_bias); + vn11 = _mm256_sub_ps(vn11, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); + __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); + __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); + __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); + __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); + __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); + __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); + __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); + __m256 vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_hi), vx8); + __m256 vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_hi), vx9); + __m256 vt10 = _mm256_add_ps(_mm256_mul_ps(vn10, vminus_ln2_hi), vx10); + __m256 vt11 = _mm256_add_ps(_mm256_mul_ps(vn11, vminus_ln2_hi), vx11); + + vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); + vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); + vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); + vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); + vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); + vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_lo), vt8); + vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_lo), vt9); + vt10 = _mm256_add_ps(_mm256_mul_ps(vn10, vminus_ln2_lo), vt10); + vt11 = _mm256_add_ps(_mm256_mul_ps(vn11, vminus_ln2_lo), vt11); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); + __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); + __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); + __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); + __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); + __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); + __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); + __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); + __m256 vp8 = _mm256_add_ps(_mm256_mul_ps(vc5, vt8), vc4); + __m256 vp9 = _mm256_add_ps(_mm256_mul_ps(vc5, vt9), vc4); + __m256 vp10 = _mm256_add_ps(_mm256_mul_ps(vc5, vt10), vc4); + __m256 vp11 = _mm256_add_ps(_mm256_mul_ps(vc5, vt11), vc4); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc3); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc3); + vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc3); + vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, vt11), vc3); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc2); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc2); + vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc2); + vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, 
vt11), vc2); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc1); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc1); + vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc1); + vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, vt11), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm256_mul_ps(vt0, vs0); + vt1 = _mm256_mul_ps(vt1, vs1); + vt2 = _mm256_mul_ps(vt2, vs2); + vt3 = _mm256_mul_ps(vt3, vs3); + vt4 = _mm256_mul_ps(vt4, vs4); + vt5 = _mm256_mul_ps(vt5, vs5); + vt6 = _mm256_mul_ps(vt6, vs6); + vt7 = _mm256_mul_ps(vt7, vs7); + vt8 = _mm256_mul_ps(vt8, vs8); + vt9 = _mm256_mul_ps(vt9, vs9); + vt10 = _mm256_mul_ps(vt10, vs10); + vt11 = _mm256_mul_ps(vt11, vs11); + + __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); + __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); + __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); + __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); + __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); + __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); + __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); + __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); + __m256 vf8 = _mm256_add_ps(_mm256_mul_ps(vt8, vp8), vs8); + __m256 vf9 = _mm256_add_ps(_mm256_mul_ps(vt9, vp9), vs9); + __m256 vf10 = _mm256_add_ps(_mm256_mul_ps(vt10, vp10), vs10); + __m256 vf11 = _mm256_add_ps(_mm256_mul_ps(vt11, vp11), vs11); + + // For 
inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); + vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); + vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); + vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); + vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); + vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); + vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); + vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); + vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); + vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); + vf10 = _mm256_andnot_ps(_mm256_cmp_ps(vx10, vdenorm_cutoff, _CMP_LT_OS), vf10); + vf11 = _mm256_andnot_ps(_mm256_cmp_ps(vx11, vdenorm_cutoff, _CMP_LT_OS), vf11); + + // Store 96 (12x4) outputs at a time. + _mm256_storeu_ps(output, vf0); + _mm256_storeu_ps(output + 8, vf1); + _mm256_storeu_ps(output + 16, vf2); + _mm256_storeu_ps(output + 24, vf3); + _mm256_storeu_ps(output + 32, vf4); + _mm256_storeu_ps(output + 40, vf5); + _mm256_storeu_ps(output + 48, vf6); + _mm256_storeu_ps(output + 56, vf7); + _mm256_storeu_ps(output + 64, vf8); + _mm256_storeu_ps(output + 72, vf9); + _mm256_storeu_ps(output + 80, vf10); + _mm256_storeu_ps(output + 88, vf11); + output += 96; + + // Accumulate computed exponents. 
+ vacc0 = _mm256_add_ps(vacc0, vf0); + vacc1 = _mm256_add_ps(vacc1, vf1); + vacc0 = _mm256_add_ps(vacc0, vf2); + vacc1 = _mm256_add_ps(vacc1, vf3); + vacc0 = _mm256_add_ps(vacc0, vf4); + vacc1 = _mm256_add_ps(vacc1, vf5); + vacc0 = _mm256_add_ps(vacc0, vf6); + vacc1 = _mm256_add_ps(vacc1, vf7); + vacc0 = _mm256_add_ps(vacc0, vf8); + vacc1 = _mm256_add_ps(vacc1, vf9); + vacc0 = _mm256_add_ps(vacc0, vf10); + vacc1 = _mm256_add_ps(vacc1, vf11); + } + // Add up all accumulators to vacc0 + vacc0 = _mm256_add_ps(vacc0, vacc1); + + __m256 vacc = vacc0; + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + // Load 8 inputs at a time. + const __m256 vi = _mm256_loadu_ps(input); + input += 8; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + // Store 8 outputs at a time. + _mm256_storeu_ps(output, vf); + output += 8; + + // Accumulate computed exponents. + vacc = _mm256_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 7 * sizeof(float)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); + + const __m256 vi = _mm256_maskload_ps(input, vmask); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + __m128 vf_lo = _mm256_castps256_ps128(vf); + if (batch & (4 * sizeof(float))) { + _mm_storeu_ps(output, vf_lo); + vf_lo = _mm256_extractf128_ps(vf, 1); + output += 4; + } + if (batch & (2 * sizeof(float))) { + _mm_storel_pi((__m64*) output, vf_lo); + vf_lo = _mm_movehl_ps(vf_lo, vf_lo); + output += 2; + } + if (batch & (1 * sizeof(float))) { + _mm_store_ss(output, vf_lo); + } + + vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); + } + __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); + vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); + vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); + _mm_store_ss(sum, vacc_lo); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc3.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc3.c new file mode 100644 index 00000000000..d5650025e2a --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc3.c @@ -0,0 +1,404 
@@ +// Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <assert.h> + +#include <immintrin.h> + +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc3( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; + + const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); + const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); + const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f); + const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f); + const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); + const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); + const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); + const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); + const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); + const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m256 vi_max = _mm256_broadcast_ss(max); + + __m256 vacc0 = _mm256_setzero_ps(); + __m256 vacc1 = _mm256_setzero_ps(); + __m256 vacc2 = _mm256_setzero_ps(); + for (; batch >= 96 * sizeof(float); batch
-= 96 * sizeof(float)) { + // Load 96 (12x4) inputs at a time. + const __m256 vi0 = _mm256_loadu_ps(input); + const __m256 vi1 = _mm256_loadu_ps(input + 8); + const __m256 vi2 = _mm256_loadu_ps(input + 16); + const __m256 vi3 = _mm256_loadu_ps(input + 24); + const __m256 vi4 = _mm256_loadu_ps(input + 32); + const __m256 vi5 = _mm256_loadu_ps(input + 40); + const __m256 vi6 = _mm256_loadu_ps(input + 48); + const __m256 vi7 = _mm256_loadu_ps(input + 56); + const __m256 vi8 = _mm256_loadu_ps(input + 64); + const __m256 vi9 = _mm256_loadu_ps(input + 72); + const __m256 vi10 = _mm256_loadu_ps(input + 80); + const __m256 vi11 = _mm256_loadu_ps(input + 88); + input += 96; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); + const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); + const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); + const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); + const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); + const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); + const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); + const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); + const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); + const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); + const __m256 vx10 = _mm256_sub_ps(vi10, vi_max); + const __m256 vx11 = _mm256_sub_ps(vi11, vi_max); + + // Compute reduced argument batch := round(x / log(2)). 
+ __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); + __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); + __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); + __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); + __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); + __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); + __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); + __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); + __m256 vn8 = _mm256_add_ps(_mm256_mul_ps(vx8, vlog2e), vmagic_bias); + __m256 vn9 = _mm256_add_ps(_mm256_mul_ps(vx9, vlog2e), vmagic_bias); + __m256 vn10 = _mm256_add_ps(_mm256_mul_ps(vx10, vlog2e), vmagic_bias); + __m256 vn11 = _mm256_add_ps(_mm256_mul_ps(vx11, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
+ const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); + const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); + const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); + const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); + const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); + const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); + const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); + const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); + const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); + const __m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); + const __m256 vs10 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn10), 23)); + const __m256 vs11 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn11), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm256_sub_ps(vn0, vmagic_bias); + vn1 = _mm256_sub_ps(vn1, vmagic_bias); + vn2 = _mm256_sub_ps(vn2, vmagic_bias); + vn3 = _mm256_sub_ps(vn3, vmagic_bias); + vn4 = _mm256_sub_ps(vn4, vmagic_bias); + vn5 = _mm256_sub_ps(vn5, vmagic_bias); + vn6 = _mm256_sub_ps(vn6, vmagic_bias); + vn7 = _mm256_sub_ps(vn7, vmagic_bias); + vn8 = _mm256_sub_ps(vn8, vmagic_bias); + vn9 = _mm256_sub_ps(vn9, vmagic_bias); + vn10 = _mm256_sub_ps(vn10, vmagic_bias); + vn11 = _mm256_sub_ps(vn11, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); + __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); + __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); + __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); + __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); + __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); + __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); + __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); + __m256 vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_hi), vx8); + __m256 vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_hi), vx9); + __m256 vt10 = _mm256_add_ps(_mm256_mul_ps(vn10, vminus_ln2_hi), vx10); + __m256 vt11 = _mm256_add_ps(_mm256_mul_ps(vn11, vminus_ln2_hi), vx11); + + vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); + vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); + vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); + vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); + vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); + vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_lo), vt8); + vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_lo), vt9); + vt10 = _mm256_add_ps(_mm256_mul_ps(vn10, vminus_ln2_lo), vt10); + vt11 = _mm256_add_ps(_mm256_mul_ps(vn11, vminus_ln2_lo), vt11); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); + __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); + __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); + __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); + __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); + __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); + __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); + __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); + __m256 vp8 = _mm256_add_ps(_mm256_mul_ps(vc5, vt8), vc4); + __m256 vp9 = _mm256_add_ps(_mm256_mul_ps(vc5, vt9), vc4); + __m256 vp10 = _mm256_add_ps(_mm256_mul_ps(vc5, vt10), vc4); + __m256 vp11 = _mm256_add_ps(_mm256_mul_ps(vc5, vt11), vc4); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc3); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc3); + vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc3); + vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, vt11), vc3); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc2); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc2); + vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc2); + vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, 
vt11), vc2); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc1); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc1); + vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc1); + vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, vt11), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm256_mul_ps(vt0, vs0); + vt1 = _mm256_mul_ps(vt1, vs1); + vt2 = _mm256_mul_ps(vt2, vs2); + vt3 = _mm256_mul_ps(vt3, vs3); + vt4 = _mm256_mul_ps(vt4, vs4); + vt5 = _mm256_mul_ps(vt5, vs5); + vt6 = _mm256_mul_ps(vt6, vs6); + vt7 = _mm256_mul_ps(vt7, vs7); + vt8 = _mm256_mul_ps(vt8, vs8); + vt9 = _mm256_mul_ps(vt9, vs9); + vt10 = _mm256_mul_ps(vt10, vs10); + vt11 = _mm256_mul_ps(vt11, vs11); + + __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); + __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); + __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); + __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); + __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); + __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); + __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); + __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); + __m256 vf8 = _mm256_add_ps(_mm256_mul_ps(vt8, vp8), vs8); + __m256 vf9 = _mm256_add_ps(_mm256_mul_ps(vt9, vp9), vs9); + __m256 vf10 = _mm256_add_ps(_mm256_mul_ps(vt10, vp10), vs10); + __m256 vf11 = _mm256_add_ps(_mm256_mul_ps(vt11, vp11), vs11); + + // For 
inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); + vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); + vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); + vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); + vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); + vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); + vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); + vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); + vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); + vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); + vf10 = _mm256_andnot_ps(_mm256_cmp_ps(vx10, vdenorm_cutoff, _CMP_LT_OS), vf10); + vf11 = _mm256_andnot_ps(_mm256_cmp_ps(vx11, vdenorm_cutoff, _CMP_LT_OS), vf11); + + // Store 96 (12x4) outputs at a time. + _mm256_storeu_ps(output, vf0); + _mm256_storeu_ps(output + 8, vf1); + _mm256_storeu_ps(output + 16, vf2); + _mm256_storeu_ps(output + 24, vf3); + _mm256_storeu_ps(output + 32, vf4); + _mm256_storeu_ps(output + 40, vf5); + _mm256_storeu_ps(output + 48, vf6); + _mm256_storeu_ps(output + 56, vf7); + _mm256_storeu_ps(output + 64, vf8); + _mm256_storeu_ps(output + 72, vf9); + _mm256_storeu_ps(output + 80, vf10); + _mm256_storeu_ps(output + 88, vf11); + output += 96; + + // Accumulate computed exponents. 
+ vacc0 = _mm256_add_ps(vacc0, vf0); + vacc1 = _mm256_add_ps(vacc1, vf1); + vacc2 = _mm256_add_ps(vacc2, vf2); + vacc0 = _mm256_add_ps(vacc0, vf3); + vacc1 = _mm256_add_ps(vacc1, vf4); + vacc2 = _mm256_add_ps(vacc2, vf5); + vacc0 = _mm256_add_ps(vacc0, vf6); + vacc1 = _mm256_add_ps(vacc1, vf7); + vacc2 = _mm256_add_ps(vacc2, vf8); + vacc0 = _mm256_add_ps(vacc0, vf9); + vacc1 = _mm256_add_ps(vacc1, vf10); + vacc2 = _mm256_add_ps(vacc2, vf11); + } + // Add up all accumulators to vacc0 + vacc0 = _mm256_add_ps(vacc0, vacc1); + vacc0 = _mm256_add_ps(vacc0, vacc2); + + __m256 vacc = vacc0; + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + // Load 8 inputs at a time. + const __m256 vi = _mm256_loadu_ps(input); + input += 8; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + // Store 8 outputs at a time. + _mm256_storeu_ps(output, vf); + output += 8; + + // Accumulate computed exponents. + vacc = _mm256_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 7 * sizeof(float)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); + + const __m256 vi = _mm256_maskload_ps(input, vmask); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + __m128 vf_lo = _mm256_castps256_ps128(vf); + if (batch & (4 * sizeof(float))) { + _mm_storeu_ps(output, vf_lo); + vf_lo = _mm256_extractf128_ps(vf, 1); + output += 4; + } + if (batch & (2 * sizeof(float))) { + _mm_storel_pi((__m64*) output, vf_lo); + vf_lo = _mm_movehl_ps(vf_lo, vf_lo); + output += 2; + } + if (batch & (1 * sizeof(float))) { + _mm_store_ss(output, vf_lo); + } + + vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); + } + __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); + vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); + vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); + _mm_store_ss(sum, vacc_lo); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc6.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc6.c new file mode 100644 index 00000000000..3b0891d92a7 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc6.c @@ -0,0 +1,410 
@@ +// Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc6( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; + + const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); + const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); + const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f); + const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f); + const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); + const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); + const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); + const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); + const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); + const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m256 vi_max = _mm256_broadcast_ss(max); + + __m256 vacc0 = _mm256_setzero_ps(); + __m256 vacc1 = _mm256_setzero_ps(); + __m256 vacc2 = _mm256_setzero_ps(); + __m256 vacc3 = _mm256_setzero_ps(); + 
__m256 vacc4 = _mm256_setzero_ps(); + __m256 vacc5 = _mm256_setzero_ps(); + for (; batch >= 96 * sizeof(float); batch -= 96 * sizeof(float)) { + // Load 96 (12x4) inputs at a time. + const __m256 vi0 = _mm256_loadu_ps(input); + const __m256 vi1 = _mm256_loadu_ps(input + 8); + const __m256 vi2 = _mm256_loadu_ps(input + 16); + const __m256 vi3 = _mm256_loadu_ps(input + 24); + const __m256 vi4 = _mm256_loadu_ps(input + 32); + const __m256 vi5 = _mm256_loadu_ps(input + 40); + const __m256 vi6 = _mm256_loadu_ps(input + 48); + const __m256 vi7 = _mm256_loadu_ps(input + 56); + const __m256 vi8 = _mm256_loadu_ps(input + 64); + const __m256 vi9 = _mm256_loadu_ps(input + 72); + const __m256 vi10 = _mm256_loadu_ps(input + 80); + const __m256 vi11 = _mm256_loadu_ps(input + 88); + input += 96; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); + const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); + const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); + const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); + const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); + const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); + const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); + const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); + const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); + const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); + const __m256 vx10 = _mm256_sub_ps(vi10, vi_max); + const __m256 vx11 = _mm256_sub_ps(vi11, vi_max); + + // Compute reduced argument batch := round(x / log(2)). 
+ __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); + __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); + __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); + __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); + __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); + __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); + __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); + __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); + __m256 vn8 = _mm256_add_ps(_mm256_mul_ps(vx8, vlog2e), vmagic_bias); + __m256 vn9 = _mm256_add_ps(_mm256_mul_ps(vx9, vlog2e), vmagic_bias); + __m256 vn10 = _mm256_add_ps(_mm256_mul_ps(vx10, vlog2e), vmagic_bias); + __m256 vn11 = _mm256_add_ps(_mm256_mul_ps(vx11, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
+ const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); + const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); + const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); + const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); + const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); + const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); + const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); + const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); + const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); + const __m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); + const __m256 vs10 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn10), 23)); + const __m256 vs11 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn11), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm256_sub_ps(vn0, vmagic_bias); + vn1 = _mm256_sub_ps(vn1, vmagic_bias); + vn2 = _mm256_sub_ps(vn2, vmagic_bias); + vn3 = _mm256_sub_ps(vn3, vmagic_bias); + vn4 = _mm256_sub_ps(vn4, vmagic_bias); + vn5 = _mm256_sub_ps(vn5, vmagic_bias); + vn6 = _mm256_sub_ps(vn6, vmagic_bias); + vn7 = _mm256_sub_ps(vn7, vmagic_bias); + vn8 = _mm256_sub_ps(vn8, vmagic_bias); + vn9 = _mm256_sub_ps(vn9, vmagic_bias); + vn10 = _mm256_sub_ps(vn10, vmagic_bias); + vn11 = _mm256_sub_ps(vn11, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); + __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); + __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); + __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); + __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); + __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); + __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); + __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); + __m256 vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_hi), vx8); + __m256 vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_hi), vx9); + __m256 vt10 = _mm256_add_ps(_mm256_mul_ps(vn10, vminus_ln2_hi), vx10); + __m256 vt11 = _mm256_add_ps(_mm256_mul_ps(vn11, vminus_ln2_hi), vx11); + + vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); + vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); + vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); + vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); + vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); + vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_lo), vt8); + vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_lo), vt9); + vt10 = _mm256_add_ps(_mm256_mul_ps(vn10, vminus_ln2_lo), vt10); + vt11 = _mm256_add_ps(_mm256_mul_ps(vn11, vminus_ln2_lo), vt11); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); + __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); + __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); + __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); + __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); + __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); + __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); + __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); + __m256 vp8 = _mm256_add_ps(_mm256_mul_ps(vc5, vt8), vc4); + __m256 vp9 = _mm256_add_ps(_mm256_mul_ps(vc5, vt9), vc4); + __m256 vp10 = _mm256_add_ps(_mm256_mul_ps(vc5, vt10), vc4); + __m256 vp11 = _mm256_add_ps(_mm256_mul_ps(vc5, vt11), vc4); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc3); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc3); + vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc3); + vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, vt11), vc3); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc2); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc2); + vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc2); + vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, 
vt11), vc2); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc1); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc1); + vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc1); + vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, vt11), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm256_mul_ps(vt0, vs0); + vt1 = _mm256_mul_ps(vt1, vs1); + vt2 = _mm256_mul_ps(vt2, vs2); + vt3 = _mm256_mul_ps(vt3, vs3); + vt4 = _mm256_mul_ps(vt4, vs4); + vt5 = _mm256_mul_ps(vt5, vs5); + vt6 = _mm256_mul_ps(vt6, vs6); + vt7 = _mm256_mul_ps(vt7, vs7); + vt8 = _mm256_mul_ps(vt8, vs8); + vt9 = _mm256_mul_ps(vt9, vs9); + vt10 = _mm256_mul_ps(vt10, vs10); + vt11 = _mm256_mul_ps(vt11, vs11); + + __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); + __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); + __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); + __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); + __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); + __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); + __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); + __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); + __m256 vf8 = _mm256_add_ps(_mm256_mul_ps(vt8, vp8), vs8); + __m256 vf9 = _mm256_add_ps(_mm256_mul_ps(vt9, vp9), vs9); + __m256 vf10 = _mm256_add_ps(_mm256_mul_ps(vt10, vp10), vs10); + __m256 vf11 = _mm256_add_ps(_mm256_mul_ps(vt11, vp11), vs11); + + // For 
inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); + vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); + vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); + vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); + vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); + vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); + vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); + vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); + vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); + vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); + vf10 = _mm256_andnot_ps(_mm256_cmp_ps(vx10, vdenorm_cutoff, _CMP_LT_OS), vf10); + vf11 = _mm256_andnot_ps(_mm256_cmp_ps(vx11, vdenorm_cutoff, _CMP_LT_OS), vf11); + + // Store 96 (12x4) outputs at a time. + _mm256_storeu_ps(output, vf0); + _mm256_storeu_ps(output + 8, vf1); + _mm256_storeu_ps(output + 16, vf2); + _mm256_storeu_ps(output + 24, vf3); + _mm256_storeu_ps(output + 32, vf4); + _mm256_storeu_ps(output + 40, vf5); + _mm256_storeu_ps(output + 48, vf6); + _mm256_storeu_ps(output + 56, vf7); + _mm256_storeu_ps(output + 64, vf8); + _mm256_storeu_ps(output + 72, vf9); + _mm256_storeu_ps(output + 80, vf10); + _mm256_storeu_ps(output + 88, vf11); + output += 96; + + // Accumulate computed exponents. 
+ vacc0 = _mm256_add_ps(vacc0, vf0); + vacc1 = _mm256_add_ps(vacc1, vf1); + vacc2 = _mm256_add_ps(vacc2, vf2); + vacc3 = _mm256_add_ps(vacc3, vf3); + vacc4 = _mm256_add_ps(vacc4, vf4); + vacc5 = _mm256_add_ps(vacc5, vf5); + vacc0 = _mm256_add_ps(vacc0, vf6); + vacc1 = _mm256_add_ps(vacc1, vf7); + vacc2 = _mm256_add_ps(vacc2, vf8); + vacc3 = _mm256_add_ps(vacc3, vf9); + vacc4 = _mm256_add_ps(vacc4, vf10); + vacc5 = _mm256_add_ps(vacc5, vf11); + } + // Add up all accumulators to vacc0 + vacc0 = _mm256_add_ps(vacc0, vacc1); + vacc2 = _mm256_add_ps(vacc2, vacc3); + vacc4 = _mm256_add_ps(vacc4, vacc5); + vacc0 = _mm256_add_ps(vacc0, vacc2); + vacc0 = _mm256_add_ps(vacc0, vacc4); + + __m256 vacc = vacc0; + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + // Load 8 inputs at a time. + const __m256 vi = _mm256_loadu_ps(input); + input += 8; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + // Store 8 outputs at a time. + _mm256_storeu_ps(output, vf); + output += 8; + + // Accumulate computed exponents. + vacc = _mm256_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 7 * sizeof(float)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); + + const __m256 vi = _mm256_maskload_ps(input, vmask); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + __m128 vf_lo = _mm256_castps256_ps128(vf); + if (batch & (4 * sizeof(float))) { + _mm_storeu_ps(output, vf_lo); + vf_lo = _mm256_extractf128_ps(vf, 1); + output += 4; + } + if (batch & (2 * sizeof(float))) { + _mm_storel_pi((__m64*) output, vf_lo); + vf_lo = _mm_movehl_ps(vf_lo, vf_lo); + output += 2; + } + if (batch & (1 * sizeof(float))) { + _mm_store_ss(output, vf_lo); + } + + vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); + } + __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); + vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); + vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); + _mm_store_ss(sum, vacc_lo); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96.c new file mode 100644 index 00000000000..af744681554 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96.c @@ -0,0 +1,399 @@ +// 
Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; + + const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); + const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); + const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f); + const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f); + const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); + const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); + const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); + const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); + const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); + const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m256 vi_max = _mm256_broadcast_ss(max); + + __m256 vacc0 = _mm256_setzero_ps(); + for (; batch >= 96 * sizeof(float); batch -= 96 * sizeof(float)) { + // Load 96 (12x4) inputs at a time. 
+ const __m256 vi0 = _mm256_loadu_ps(input); + const __m256 vi1 = _mm256_loadu_ps(input + 8); + const __m256 vi2 = _mm256_loadu_ps(input + 16); + const __m256 vi3 = _mm256_loadu_ps(input + 24); + const __m256 vi4 = _mm256_loadu_ps(input + 32); + const __m256 vi5 = _mm256_loadu_ps(input + 40); + const __m256 vi6 = _mm256_loadu_ps(input + 48); + const __m256 vi7 = _mm256_loadu_ps(input + 56); + const __m256 vi8 = _mm256_loadu_ps(input + 64); + const __m256 vi9 = _mm256_loadu_ps(input + 72); + const __m256 vi10 = _mm256_loadu_ps(input + 80); + const __m256 vi11 = _mm256_loadu_ps(input + 88); + input += 96; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); + const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); + const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); + const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); + const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); + const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); + const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); + const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); + const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); + const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); + const __m256 vx10 = _mm256_sub_ps(vi10, vi_max); + const __m256 vx11 = _mm256_sub_ps(vi11, vi_max); + + // Compute reduced argument batch := round(x / log(2)). 
+ __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); + __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); + __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); + __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); + __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); + __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); + __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); + __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); + __m256 vn8 = _mm256_add_ps(_mm256_mul_ps(vx8, vlog2e), vmagic_bias); + __m256 vn9 = _mm256_add_ps(_mm256_mul_ps(vx9, vlog2e), vmagic_bias); + __m256 vn10 = _mm256_add_ps(_mm256_mul_ps(vx10, vlog2e), vmagic_bias); + __m256 vn11 = _mm256_add_ps(_mm256_mul_ps(vx11, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
+ const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); + const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); + const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); + const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); + const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); + const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); + const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); + const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); + const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); + const __m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); + const __m256 vs10 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn10), 23)); + const __m256 vs11 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn11), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm256_sub_ps(vn0, vmagic_bias); + vn1 = _mm256_sub_ps(vn1, vmagic_bias); + vn2 = _mm256_sub_ps(vn2, vmagic_bias); + vn3 = _mm256_sub_ps(vn3, vmagic_bias); + vn4 = _mm256_sub_ps(vn4, vmagic_bias); + vn5 = _mm256_sub_ps(vn5, vmagic_bias); + vn6 = _mm256_sub_ps(vn6, vmagic_bias); + vn7 = _mm256_sub_ps(vn7, vmagic_bias); + vn8 = _mm256_sub_ps(vn8, vmagic_bias); + vn9 = _mm256_sub_ps(vn9, vmagic_bias); + vn10 = _mm256_sub_ps(vn10, vmagic_bias); + vn11 = _mm256_sub_ps(vn11, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); + __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); + __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); + __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); + __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); + __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); + __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); + __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); + __m256 vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_hi), vx8); + __m256 vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_hi), vx9); + __m256 vt10 = _mm256_add_ps(_mm256_mul_ps(vn10, vminus_ln2_hi), vx10); + __m256 vt11 = _mm256_add_ps(_mm256_mul_ps(vn11, vminus_ln2_hi), vx11); + + vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); + vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); + vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); + vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); + vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); + vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_lo), vt8); + vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_lo), vt9); + vt10 = _mm256_add_ps(_mm256_mul_ps(vn10, vminus_ln2_lo), vt10); + vt11 = _mm256_add_ps(_mm256_mul_ps(vn11, vminus_ln2_lo), vt11); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); + __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); + __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); + __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); + __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); + __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); + __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); + __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); + __m256 vp8 = _mm256_add_ps(_mm256_mul_ps(vc5, vt8), vc4); + __m256 vp9 = _mm256_add_ps(_mm256_mul_ps(vc5, vt9), vc4); + __m256 vp10 = _mm256_add_ps(_mm256_mul_ps(vc5, vt10), vc4); + __m256 vp11 = _mm256_add_ps(_mm256_mul_ps(vc5, vt11), vc4); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc3); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc3); + vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc3); + vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, vt11), vc3); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc2); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc2); + vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc2); + vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, 
vt11), vc2); + + vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); + vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); + vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); + vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); + vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); + vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); + vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); + vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); + vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc1); + vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc1); + vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc1); + vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, vt11), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm256_mul_ps(vt0, vs0); + vt1 = _mm256_mul_ps(vt1, vs1); + vt2 = _mm256_mul_ps(vt2, vs2); + vt3 = _mm256_mul_ps(vt3, vs3); + vt4 = _mm256_mul_ps(vt4, vs4); + vt5 = _mm256_mul_ps(vt5, vs5); + vt6 = _mm256_mul_ps(vt6, vs6); + vt7 = _mm256_mul_ps(vt7, vs7); + vt8 = _mm256_mul_ps(vt8, vs8); + vt9 = _mm256_mul_ps(vt9, vs9); + vt10 = _mm256_mul_ps(vt10, vs10); + vt11 = _mm256_mul_ps(vt11, vs11); + + __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); + __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); + __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); + __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); + __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); + __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); + __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); + __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); + __m256 vf8 = _mm256_add_ps(_mm256_mul_ps(vt8, vp8), vs8); + __m256 vf9 = _mm256_add_ps(_mm256_mul_ps(vt9, vp9), vs9); + __m256 vf10 = _mm256_add_ps(_mm256_mul_ps(vt10, vp10), vs10); + __m256 vf11 = _mm256_add_ps(_mm256_mul_ps(vt11, vp11), vs11); + + // For 
inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); + vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); + vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); + vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); + vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); + vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); + vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); + vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); + vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); + vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); + vf10 = _mm256_andnot_ps(_mm256_cmp_ps(vx10, vdenorm_cutoff, _CMP_LT_OS), vf10); + vf11 = _mm256_andnot_ps(_mm256_cmp_ps(vx11, vdenorm_cutoff, _CMP_LT_OS), vf11); + + // Store 96 (12x4) outputs at a time. + _mm256_storeu_ps(output, vf0); + _mm256_storeu_ps(output + 8, vf1); + _mm256_storeu_ps(output + 16, vf2); + _mm256_storeu_ps(output + 24, vf3); + _mm256_storeu_ps(output + 32, vf4); + _mm256_storeu_ps(output + 40, vf5); + _mm256_storeu_ps(output + 48, vf6); + _mm256_storeu_ps(output + 56, vf7); + _mm256_storeu_ps(output + 64, vf8); + _mm256_storeu_ps(output + 72, vf9); + _mm256_storeu_ps(output + 80, vf10); + _mm256_storeu_ps(output + 88, vf11); + output += 96; + + // Accumulate computed exponents. 
+ vacc0 = _mm256_add_ps(vacc0, vf0); + vacc0 = _mm256_add_ps(vacc0, vf1); + vacc0 = _mm256_add_ps(vacc0, vf2); + vacc0 = _mm256_add_ps(vacc0, vf3); + vacc0 = _mm256_add_ps(vacc0, vf4); + vacc0 = _mm256_add_ps(vacc0, vf5); + vacc0 = _mm256_add_ps(vacc0, vf6); + vacc0 = _mm256_add_ps(vacc0, vf7); + vacc0 = _mm256_add_ps(vacc0, vf8); + vacc0 = _mm256_add_ps(vacc0, vf9); + vacc0 = _mm256_add_ps(vacc0, vf10); + vacc0 = _mm256_add_ps(vacc0, vf11); + } + + __m256 vacc = vacc0; + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + // Load 8 inputs at a time. + const __m256 vi = _mm256_loadu_ps(input); + input += 8; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + // Store 8 outputs at a time. + _mm256_storeu_ps(output, vf); + output += 8; + + // Accumulate computed exponents. + vacc = _mm256_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 7 * sizeof(float)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); + + const __m256 vi = _mm256_maskload_ps(input, vmask); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm256_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); + vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + __m128 vf_lo = _mm256_castps256_ps128(vf); + if (batch & (4 * sizeof(float))) { + _mm_storeu_ps(output, vf_lo); + vf_lo = _mm256_extractf128_ps(vf, 1); + output += 4; + } + if (batch & (2 * sizeof(float))) { + _mm_storel_pi((__m64*) output, vf_lo); + vf_lo = _mm_movehl_ps(vf_lo, vf_lo); + output += 2; + } + if (batch & (1 * sizeof(float))) { + _mm_store_ss(output, vf_lo); + } + + vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); + } + __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); + vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); + vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); + _mm_store_ss(sum, vacc_lo); +} diff --git a/src/xnnpack/raddstoreexpminusmax.h b/src/xnnpack/raddstoreexpminusmax.h index 4a645c57232..ca1f5c90a75 100644 --- a/src/xnnpack/raddstoreexpminusmax.h +++ b/src/xnnpack/raddstoreexpminusmax.h @@ -177,6 +177,22 @@ DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_u 
DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc3) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc6) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc2) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc4) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72_acc3) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc2) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc5) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc2) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc3) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc6) + DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64) 
DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc4) diff --git a/test/f32-raddstoreexpminusmax.cc b/test/f32-raddstoreexpminusmax.cc index ebf89e24b97..94cc2261f1c 100644 --- a/test/f32-raddstoreexpminusmax.cc +++ b/test/f32-raddstoreexpminusmax.cc @@ -3321,6 +3321,561 @@ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32, elements_eq_32) { + TEST_REQUIRES_X86_AVX2; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(32) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32, elements_div_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 64; elements < 320; elements += 32) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32, elements_lt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 32; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32, elements_gt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 33; elements < 64; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC2, elements_eq_32) { + TEST_REQUIRES_X86_AVX2; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(32) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2, nullptr); + } + + 
TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC2, elements_div_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 64; elements < 320; elements += 32) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC2, elements_lt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 32; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC2, elements_gt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 33; elements < 64; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC4, elements_eq_32) { + TEST_REQUIRES_X86_AVX2; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(32) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC4, elements_div_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 64; elements < 320; elements += 32) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC4, elements_lt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 32; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC4, elements_gt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 
33; elements < 64; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64, elements_eq_64) { + TEST_REQUIRES_X86_AVX2; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(64) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64, elements_div_64) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 128; elements < 640; elements += 64) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64, elements_lt_64) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 64; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64, elements_gt_64) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 65; elements < 128; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64_ACC2, elements_eq_64) { + TEST_REQUIRES_X86_AVX2; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(64) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc2, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64_ACC2, elements_div_64) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 128; elements < 640; elements += 64) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc2, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64_ACC2, elements_lt_64) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 64; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc2, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64_ACC2, elements_gt_64) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 65; elements < 128; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc2, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64_ACC4, elements_eq_64) { + TEST_REQUIRES_X86_AVX2; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(64) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc4, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64_ACC4, elements_div_64) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 128; elements < 640; elements += 64) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc4, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64_ACC4, elements_lt_64) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 64; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc4, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64_ACC4, elements_gt_64) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 65; elements < 128; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc4, nullptr); + } + } +#endif // XNN_ARCH_X86 || 
XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U72, elements_eq_72) { + TEST_REQUIRES_X86_AVX2; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(72) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U72, elements_div_72) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 144; elements < 720; elements += 72) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U72, elements_lt_72) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 72; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U72, elements_gt_72) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 73; elements < 144; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U72_ACC3, elements_eq_72) { + TEST_REQUIRES_X86_AVX2; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(72) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72_acc3, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U72_ACC3, elements_div_72) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 144; elements < 720; elements += 72) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72_acc3, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U72_ACC3, elements_lt_72) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 72; elements++) { + 
RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72_acc3, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U72_ACC3, elements_gt_72) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 73; elements < 144; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72_acc3, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80, elements_eq_80) { + TEST_REQUIRES_X86_AVX2; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(80) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80, elements_div_80) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 160; elements < 800; elements += 80) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80, elements_lt_80) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 80; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80, elements_gt_80) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 81; elements < 160; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80_ACC2, elements_eq_80) { + TEST_REQUIRES_X86_AVX2; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(80) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc2, nullptr); + } + 
+ TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80_ACC2, elements_div_80) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 160; elements < 800; elements += 80) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc2, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80_ACC2, elements_lt_80) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 80; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc2, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80_ACC2, elements_gt_80) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 81; elements < 160; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc2, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80_ACC5, elements_eq_80) { + TEST_REQUIRES_X86_AVX2; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(80) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc5, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80_ACC5, elements_div_80) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 160; elements < 800; elements += 80) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc5, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80_ACC5, elements_lt_80) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 80; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc5, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80_ACC5, elements_gt_80) { + TEST_REQUIRES_X86_AVX2; + for (size_t 
elements = 81; elements < 160; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc5, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96, elements_eq_96) { + TEST_REQUIRES_X86_AVX2; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(96) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96, elements_div_96) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 192; elements < 960; elements += 96) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96, elements_lt_96) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 96; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96, elements_gt_96) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 97; elements < 192; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC2, elements_eq_96) { + TEST_REQUIRES_X86_AVX2; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(96) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc2, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC2, elements_div_96) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 192; elements < 960; elements += 96) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc2, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC2, elements_lt_96) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 96; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc2, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC2, elements_gt_96) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 97; elements < 192; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc2, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC3, elements_eq_96) { + TEST_REQUIRES_X86_AVX2; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(96) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc3, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC3, elements_div_96) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 192; elements < 960; elements += 96) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc3, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC3, elements_lt_96) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 96; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc3, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC3, elements_gt_96) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 97; elements < 192; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc3, nullptr); + } + } +#endif // XNN_ARCH_X86 || 
XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC6, elements_eq_96) { + TEST_REQUIRES_X86_AVX2; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(96) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc6, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC6, elements_div_96) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 192; elements < 960; elements += 96) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc6, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC6, elements_lt_96) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 96; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc6, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC6, elements_gt_96) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 97; elements < 192; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc6, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64, elements_eq_64) { TEST_REQUIRES_X86_AVX512F; diff --git a/test/f32-raddstoreexpminusmax.yaml b/test/f32-raddstoreexpminusmax.yaml index 26cb25c310d..1a2d0ba8c9d 100644 --- a/test/f32-raddstoreexpminusmax.yaml +++ b/test/f32-raddstoreexpminusmax.yaml @@ -102,6 +102,22 @@ - name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc3 - name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc6 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4 +- name: 
xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc2 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc4 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72_acc3 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc2 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc5 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc2 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc3 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc6 + # x86 AVX512 - name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64 - name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2 From a913c25dcf37e3108da0da50600696d01e31622f Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Mon, 23 Sep 2024 10:10:49 -0700 Subject: [PATCH 29/50] Bump the version of KleidiAI. PiperOrigin-RevId: 677841927 --- WORKSPACE | 6 +++--- cmake/DownloadKleidiAI.cmake | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/WORKSPACE b/WORKSPACE index faf26cc33a3..458f154fecb 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -112,10 +112,10 @@ http_archive( # KleidiAI library, used for ARM microkernels. 
http_archive( name = "KleidiAI", - sha256 = "88233e427be6579560073267575f00f3b5fc370a31a43bbdd87a1810bd4bf1b6", - strip_prefix = "kleidiai-cddf991af5de49fd34949fa39690e4e906e04074", + sha256 = "d8f2b5bf6eba7ab8fe3cedd97c4adc967c1befa69a6f4c4f6cbb3c102a7dd3c9", + strip_prefix = "kleidiai-32384cde728f444afdb92eecbb65e293fc6a6315", urls = [ - "https://gitlab.arm.com/kleidi/kleidiai/-/archive/cddf991af5de49fd34949fa39690e4e906e04074/kleidiai-cddf991af5de49fd34949fa39690e4e906e04074.zip", + "https://gitlab.arm.com/kleidi/kleidiai/-/archive/32384cde728f444afdb92eecbb65e293fc6a6315/kleidiai-32384cde728f444afdb92eecbb65e293fc6a6315.zip", ], ) # LINT.ThenChange(cmake/DownloadKleidiAI.cmake) diff --git a/cmake/DownloadKleidiAI.cmake b/cmake/DownloadKleidiAI.cmake index bbc1e4f0c52..49a0e04346d 100644 --- a/cmake/DownloadKleidiAI.cmake +++ b/cmake/DownloadKleidiAI.cmake @@ -17,8 +17,8 @@ ENDIF() INCLUDE(ExternalProject) ExternalProject_Add(kleidiai - URL https://gitlab.arm.com/kleidi/kleidiai/-/archive/cddf991af5de49fd34949fa39690e4e906e04074/kleidiai-cddf991af5de49fd34949fa39690e4e906e04074.zip - URL_HASH SHA256=88233e427be6579560073267575f00f3b5fc370a31a43bbdd87a1810bd4bf1b6 + URL https://gitlab.arm.com/kleidi/kleidiai/-/archive/32384cde728f444afdb92eecbb65e293fc6a6315/kleidiai-32384cde728f444afdb92eecbb65e293fc6a6315.zip + URL_HASH SHA256=d8f2b5bf6eba7ab8fe3cedd97c4adc967c1befa69a6f4c4f6cbb3c102a7dd3c9 SOURCE_DIR "${CMAKE_BINARY_DIR}/kleidiai-source" BINARY_DIR "${CMAKE_BINARY_DIR}/kleidiai" CONFIGURE_COMMAND "" From 7ae442764ef9c3f650e55c247ff4617b0b1fa00e Mon Sep 17 00:00:00 2001 From: Misha Gutman Date: Mon, 23 Sep 2024 10:19:30 -0700 Subject: [PATCH 30/50] Added mean for qu8. 
PiperOrigin-RevId: 677845512 --- include/xnnpack.h | 23 ++++ src/configs/reduce-config.c | 2 +- src/enums/operator-type.c | 11 +- src/enums/operator-type.yaml | 2 + src/microparams-init.c | 14 +++ src/operator-run.c | 40 ++++-- src/operators/reduce-nd.c | 88 +++++++++++++- src/xnnpack/compute.h | 2 + src/xnnpack/config-types.h | 1 + src/xnnpack/config.h | 2 + src/xnnpack/microfnptr.h | 7 ++ src/xnnpack/microparams-init.h | 20 ++- src/xnnpack/microparams.h | 9 ++ src/xnnpack/operator-type.h | 1 + src/xnnpack/operator.h | 2 + test/mean-nd.cc | 215 +++++++++++++++++++++++++++++++++ test/mean-operator-tester.h | 146 ++++++++++++++++++++++ 17 files changed, 559 insertions(+), 26 deletions(-) diff --git a/include/xnnpack.h b/include/xnnpack.h index c4df36b1be8..772b6513425 100644 --- a/include/xnnpack.h +++ b/include/xnnpack.h @@ -5220,6 +5220,13 @@ enum xnn_status xnn_create_mean_nd_qs8( uint32_t flags, xnn_operator_t* mean_op_out); +enum xnn_status xnn_create_mean_nd_qu8( + float scale, + uint8_t input_zero_point, + uint8_t output_zero_point, + uint32_t flags, + xnn_operator_t* mean_op_out); + enum xnn_status xnn_reshape_mean_nd_f32( xnn_operator_t mean_op, size_t num_reduction_axes, @@ -5238,6 +5245,16 @@ enum xnn_status xnn_reshape_mean_nd_qs8( size_t* workspace_alignment, pthreadpool_t threadpool); +enum xnn_status xnn_reshape_mean_nd_qu8( + xnn_operator_t mean_op, + size_t num_reduction_axes, + const size_t* reduction_axes, + size_t num_input_dims, + const size_t* input_shape, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + enum xnn_status xnn_setup_mean_nd_f32( xnn_operator_t mean_op, const float* input, @@ -5261,6 +5278,12 @@ enum xnn_status xnn_setup_mean_nd_qs8( const void* input, void* output); +enum xnn_status xnn_setup_mean_nd_qu8( + xnn_operator_t mean_op, + void* workspace, + const void* input, + void* output); + enum xnn_status xnn_setup_minimum_nd_f16( xnn_operator_t minimum_op, const void* input1, diff --git 
a/src/configs/reduce-config.c b/src/configs/reduce-config.c index 4bc4e0b9c17..d86975a0a87 100644 --- a/src/configs/reduce-config.c +++ b/src/configs/reduce-config.c @@ -265,7 +265,7 @@ static void init_qu8_rsum_config(void) { }; #endif - qu8_rsum_config.init.qs8_mean = xnn_init_qs8_mean_minmax_scalar_params; + qu8_rsum_config.init.qu8_mean = xnn_init_qu8_mean_minmax_scalar_params; } static void init_f16_f32acc_rsum_config(void) { diff --git a/src/enums/operator-type.c b/src/enums/operator-type.c index 52cb5051b1d..a1d9acaf394 100644 --- a/src/enums/operator-type.c +++ b/src/enums/operator-type.c @@ -12,16 +12,16 @@ #include "xnnpack/operator-type.h" -static const uint16_t offset[170] = { +static const uint16_t offset[171] = { 0, 8, 22, 36, 50, 64, 78, 92, 119, 147, 175, 203, 230, 257, 289, 321, 364, 382, 400, 425, 451, 467, 483, 498, 513, 535, 558, 581, 604, 627, 650, 673, 696, 719, 742, 760, 783, 806, 830, 848, 871, 895, 919, 943, 967, 1002, 1037, 1061, 1085, 1109, 1123, 1138, 1153, 1173, 1199, 1225, 1262, 1288, 1318, 1344, 1376, 1408, 1434, 1461, 1488, 1505, 1522, 1556, 1590, 1604, 1618, 1632, 1646, 1662, 1678, 1704, 1730, 1762, 1794, 1831, 1868, 1905, 1942, 1979, 2016, 2053, 2079, 2111, 2137, 2152, 2186, 2220, 2254, 2288, 2322, 2356, 2386, 2416, 2436, 2456, 2477, 2498, 2519, 2540, 2554, - 2578, 2602, 2625, 2648, 2666, 2684, 2699, 2714, 2729, 2747, 2765, 2784, 2803, 2822, 2841, 2860, 2877, 2894, 2910, - 2926, 2959, 2992, 3020, 3048, 3076, 3104, 3131, 3158, 3175, 3192, 3233, 3274, 3292, 3310, 3328, 3346, 3361, 3377, - 3393, 3411, 3429, 3447, 3473, 3500, 3527, 3544, 3561, 3583, 3605, 3634, 3663, 3682, 3701, 3720, 3739, 3754, 3769, - 3784, 3799, 3818, 3838, 3858, 3878, 3899, 3920 + 2578, 2602, 2625, 2648, 2666, 2684, 2699, 2714, 2729, 2744, 2762, 2780, 2799, 2818, 2837, 2856, 2875, 2892, 2909, + 2925, 2941, 2974, 3007, 3035, 3063, 3091, 3119, 3146, 3173, 3190, 3207, 3248, 3289, 3307, 3325, 3343, 3361, 3376, + 3392, 3408, 3426, 3444, 3462, 3488, 3515, 3542, 3559, 
3576, 3598, 3620, 3649, 3678, 3697, 3716, 3735, 3754, 3769, + 3784, 3799, 3814, 3833, 3853, 3873, 3893, 3914, 3935 }; static const char data[] = @@ -138,6 +138,7 @@ static const char data[] = "Mean (ND, F16)\0" "Mean (ND, F32)\0" "Mean (ND, QS8)\0" + "Mean (ND, QU8)\0" "Minimum (ND, F16)\0" "Minimum (ND, F32)\0" "Multiply (ND, F16)\0" diff --git a/src/enums/operator-type.yaml b/src/enums/operator-type.yaml index 073ed9b120e..8a2741526ea 100644 --- a/src/enums/operator-type.yaml +++ b/src/enums/operator-type.yaml @@ -231,6 +231,8 @@ string: "Mean (ND, F32)" - name: xnn_operator_type_mean_nd_qs8 string: "Mean (ND, QS8)" +- name: xnn_operator_type_mean_nd_qu8 + string: "Mean (ND, QU8)" - name: xnn_operator_type_minimum_nd_f16 string: "Minimum (ND, F16)" - name: xnn_operator_type_minimum_nd_f32 diff --git a/src/microparams-init.c b/src/microparams-init.c index ee4c06b47f5..a3fa0b7c019 100644 --- a/src/microparams-init.c +++ b/src/microparams-init.c @@ -2074,6 +2074,20 @@ size_t xnn_init_qs8_mean_minmax_scalar_params( return sizeof(params->scalar); } +size_t xnn_init_qu8_mean_minmax_scalar_params( + struct xnn_qu8_mean_minmax_params params[XNN_MIN_ELEMENTS(1)], + float scale, + int32_t num_elements, + uint8_t input_zero_point, + uint8_t output_zero_point) +{ + params->scalar.scale = scale; + params->scalar.num_elements = num_elements; + params->scalar.input_zero_point = input_zero_point; + params->scalar.output_zero_point = output_zero_point; + return sizeof(params->scalar); +} + size_t xnn_init_f32_qu8_cvt_scalar_params( struct xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)], float scale, diff --git a/src/operator-run.c b/src/operator-run.c index f33f0e096fc..24aaafcd1bb 100644 --- a/src/operator-run.c +++ b/src/operator-run.c @@ -2231,12 +2231,22 @@ void xnn_compute_contiguous_reduce( xnn_init_s32_f32_cvt_scalar_params(&s32_f32_cvt_params, context->params.qs8_mean.scalar.num_elements * (int32_t) context->params.qs8_mean.scalar.input_zero_point); 
context->s32_f32_cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, workspace_ptr, /*params=*/&s32_f32_cvt_params); + struct xnn_f32_qs8_cvt_params cvt_params; + xnn_init_f32_qs8_cvt_scalar_params(&cvt_params, context->params.qs8_mean.scalar.scale, context->params.qs8_mean.scalar.output_zero_point, INT8_MIN, INT8_MAX); + context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, + output_ptr, /*params=*/&cvt_params); + } else if (context->u32_f32_cvt_ukernel) { + struct xnn_u32_f32_cvt_params u32_f32_cvt_params; + xnn_init_u32_f32_cvt_scalar_params(&u32_f32_cvt_params, context->params.qu8_mean.scalar.num_elements * (int32_t) context->params.qu8_mean.scalar.input_zero_point); + context->u32_f32_cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, + workspace_ptr, /*params=*/&u32_f32_cvt_params); + struct xnn_f32_qu8_cvt_params cvt_params; + xnn_init_f32_qu8_cvt_scalar_params(&cvt_params, context->params.qu8_mean.scalar.scale, context->params.qu8_mean.scalar.output_zero_point, 0, UINT8_MAX); + context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, + output_ptr, /*params=*/&cvt_params); + } else { + context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, output_ptr, /*params=*/NULL); } - - struct xnn_f32_qs8_cvt_params cvt_params; - xnn_init_f32_qs8_cvt_scalar_params(&cvt_params, context->params.qs8_mean.scalar.scale, context->params.qs8_mean.scalar.output_zero_point, INT8_MIN, INT8_MAX); - context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, - output_ptr, /*params=*/&cvt_params); } } @@ -2302,12 +2312,22 @@ void xnn_compute_discontiguous_reduce( xnn_init_s32_f32_cvt_scalar_params(&s32_f32_cvt_params, context->params.qs8_mean.scalar.num_elements * (int32_t) context->params.qs8_mean.scalar.input_zero_point); 
context->s32_f32_cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, workspace_ptr, /*params=*/&s32_f32_cvt_params); + struct xnn_f32_qs8_cvt_params cvt_params; + xnn_init_f32_qs8_cvt_scalar_params(&cvt_params, context->params.qs8_mean.scalar.scale, context->params.qs8_mean.scalar.output_zero_point, INT8_MIN, INT8_MAX); + context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, + output_ptr, /*params=*/&cvt_params); + } else if (context->u32_f32_cvt_ukernel) { + struct xnn_u32_f32_cvt_params u32_f32_cvt_params; + xnn_init_u32_f32_cvt_scalar_params(&u32_f32_cvt_params, context->params.qu8_mean.scalar.num_elements * (int32_t) context->params.qu8_mean.scalar.input_zero_point); + context->u32_f32_cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, + workspace_ptr, /*params=*/&u32_f32_cvt_params); + struct xnn_f32_qu8_cvt_params cvt_params; + xnn_init_f32_qu8_cvt_scalar_params(&cvt_params, context->params.qu8_mean.scalar.scale, context->params.qu8_mean.scalar.output_zero_point, 0, UINT8_MAX); + context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, + output_ptr, /*params=*/&cvt_params); + } else { + context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, output_ptr, /*params=*/NULL); } - - struct xnn_f32_qs8_cvt_params cvt_params; - xnn_init_f32_qs8_cvt_scalar_params(&cvt_params, context->params.qs8_mean.scalar.scale, context->params.qs8_mean.scalar.output_zero_point, INT8_MIN, INT8_MAX); - context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, - output_ptr, /*params=*/&cvt_params); } } diff --git a/src/operators/reduce-nd.c b/src/operators/reduce-nd.c index 7ea11a30893..a808a0dd3c1 100644 --- a/src/operators/reduce-nd.c +++ b/src/operators/reduce-nd.c @@ -33,6 +33,7 @@ static enum xnn_status create_mean_nd( const struct xnn_reduce_config* rsum_config, const 
struct xnn_unary_elementwise_config* cvt_config, const struct xnn_unary_elementwise_config* s32_f32_cvt_config, + const struct xnn_unary_elementwise_config* u32_f32_cvt_config, const void* params, size_t params_size, xnn_operator_t* mean_op_out) @@ -62,6 +63,7 @@ static enum xnn_status create_mean_nd( mean_op->rsum_config = rsum_config; mean_op->cvt_config = cvt_config; mean_op->s32_f32_cvt_config = s32_f32_cvt_config; + mean_op->u32_f32_cvt_config = u32_f32_cvt_config; if (params_size != 0) { memcpy(&mean_op->params, params, params_size); } @@ -100,7 +102,36 @@ enum xnn_status xnn_create_mean_nd_qs8( flags, /*log2_element_size=*/XNN_LOG2_SIZEOF_INT8_T, xnn_operator_type_mean_nd_qs8, - rdsum_config, rsum_config, f32_qs8_cvt_config, s32_f32_cvt_config, + rdsum_config, rsum_config, f32_qs8_cvt_config, s32_f32_cvt_config, /*u32_f32_cvt_config=*/NULL, + ¶ms, sizeof(params), + mean_op_out); +} + +enum xnn_status xnn_create_mean_nd_qu8( + float scale, + uint8_t input_zero_point, + uint8_t output_zero_point, + uint32_t flags, + xnn_operator_t* mean_op_out) +{ + const struct xnn_reduce_config* rsum_config = xnn_init_qu8_rsum_config(); + const struct xnn_reduce_config* rdsum_config = xnn_init_qu8_rdsum_config(); + const struct xnn_unary_elementwise_config* f32_qu8_cvt_config = xnn_init_f32_to_qu8_cvt_config(); + const struct xnn_unary_elementwise_config* u32_f32_cvt_config = xnn_init_u32_to_f32_cvt_config(); + if (rdsum_config == NULL || rsum_config == NULL || f32_qu8_cvt_config == NULL || u32_f32_cvt_config == NULL) { + xnn_log_error("failed to create %s operator: unsupported hardware configuration", + xnn_operator_type_to_string(xnn_operator_type_mean_nd_qu8)); + return xnn_status_unsupported_hardware; + } + + struct xnn_qu8_mean_minmax_params params; + rsum_config->init.qu8_mean(¶ms, scale, -1, input_zero_point, output_zero_point); + + return create_mean_nd( + flags, + /*log2_element_size=*/XNN_LOG2_SIZEOF_UINT8_T, + xnn_operator_type_mean_nd_qu8, + rdsum_config, 
rsum_config, f32_qu8_cvt_config, /*s32_f32_cvt_config=*/NULL, u32_f32_cvt_config, ¶ms, sizeof(params), mean_op_out); } @@ -124,8 +155,7 @@ enum xnn_status xnn_create_mean_nd_f16( /*log2_element_size=*/XNN_LOG2_SIZEOF_HALF, xnn_operator_type_mean_nd_f16, rdsum_config, rsum_config, f32_to_f16_cvt_config, /*s32_f32_cvt_config=*/NULL, - ¶ms, sizeof(params), - mean_op_out); + /*u32_f32_cvt_config=*/NULL, ¶ms, sizeof(params), mean_op_out); } enum xnn_status xnn_create_mean_nd_f32( @@ -147,8 +177,7 @@ enum xnn_status xnn_create_mean_nd_f32( /*log2_element_size=*/XNN_LOG2_SIZEOF_FLOAT, xnn_operator_type_mean_nd_f32, rdsum_config, rsum_config, /*cvt_config=*/NULL, /*s32_f32_cvt_config=*/NULL, - ¶ms, sizeof(params), - mean_op_out); + /*u32_f32_cvt_config=*/NULL, ¶ms, sizeof(params), mean_op_out); } static enum xnn_status reshape_mean_nd( @@ -291,6 +320,9 @@ static enum xnn_status reshape_mean_nd( if (mean_op->s32_f32_cvt_config) { mean_op->context.reduce.s32_f32_cvt_ukernel = mean_op->s32_f32_cvt_config->ukernel; } + if (mean_op->u32_f32_cvt_config) { + mean_op->context.reduce.u32_f32_cvt_ukernel = mean_op->u32_f32_cvt_config->ukernel; + } } else { // Reduction along the non-innermost dimension const size_t channel_like_dim = normalized_input_shape[XNN_MAX_TENSOR_DIMS - 1]; @@ -344,6 +376,9 @@ static enum xnn_status reshape_mean_nd( if (mean_op->s32_f32_cvt_config) { mean_op->context.reduce.s32_f32_cvt_ukernel = mean_op->s32_f32_cvt_config->ukernel; } + if (mean_op->u32_f32_cvt_config) { + mean_op->context.reduce.u32_f32_cvt_ukernel = mean_op->u32_f32_cvt_config->ukernel; + } for (int i = XNN_MAX_TENSOR_DIMS - 2; i >= 0; --i) { mean_op->context.reduce.input_stride[i] = (mean_op->context.reduce.input_stride[i + 1] * normalized_input_shape[i + 1]); } @@ -423,6 +458,13 @@ static void update_params_mean_qs8( mean_op->params.qs8_mean.scalar.num_elements = num_elements; } +static void update_params_mean_qu8( + xnn_operator_t mean_op, + size_t num_elements) { + 
mean_op->params.qu8_mean.scalar.scale *= 1.0f / (float) (double) num_elements; + mean_op->params.qu8_mean.scalar.num_elements = num_elements; +} + enum xnn_status xnn_reshape_mean_nd_qs8( xnn_operator_t mean_op, size_t num_reduction_axes, @@ -447,6 +489,30 @@ enum xnn_status xnn_reshape_mean_nd_qs8( threadpool); } +enum xnn_status xnn_reshape_mean_nd_qu8( + xnn_operator_t mean_op, + size_t num_reduction_axes, + const size_t* reduction_axes, + size_t num_input_dims, + const size_t* input_shape, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool) +{ + return reshape_mean_nd( + mean_op, + num_reduction_axes, reduction_axes, + num_input_dims, input_shape, + workspace_size, workspace_alignment, + /*log2_data_element_size=*/XNN_LOG2_SIZEOF_UINT8_T, + /*log2_accumulator_element_size=*/XNN_LOG2_SIZEOF_FLOAT, + xnn_operator_type_mean_nd_qu8, + /*scale_params=*/&mean_op->params.qu8_mean, + /*scale_params_size=*/sizeof(mean_op->params.qu8_mean), + update_params_mean_qu8, + threadpool); +} + static enum xnn_status setup_mean_nd( xnn_operator_t mean_op, void* workspace, @@ -518,3 +584,15 @@ enum xnn_status xnn_setup_mean_nd_qs8( workspace, input, output, xnn_operator_type_mean_nd_qs8); } + +enum xnn_status xnn_setup_mean_nd_qu8( + xnn_operator_t mean_op, + void* workspace, + const void* input, + void* output) +{ + return setup_mean_nd( + mean_op, + workspace, input, output, + xnn_operator_type_mean_nd_qu8); +} diff --git a/src/xnnpack/compute.h b/src/xnnpack/compute.h index 43cbda3398f..a15512a8011 100644 --- a/src/xnnpack/compute.h +++ b/src/xnnpack/compute.h @@ -1446,8 +1446,10 @@ struct reduce_context { } ukernel; xnn_vunary_ukernel_fn cvt_ukernel; xnn_vunary_ukernel_fn s32_f32_cvt_ukernel; + xnn_vunary_ukernel_fn u32_f32_cvt_ukernel; union { struct xnn_qs8_mean_minmax_params qs8_mean; + struct xnn_qu8_mean_minmax_params qu8_mean; struct xnn_f32_default_params f32_default; struct xnn_f16_f32acc_scale_params scale_params; struct 
xnn_f32_scale_params f32_scale; diff --git a/src/xnnpack/config-types.h b/src/xnnpack/config-types.h index 14df4347545..0bb8428a819 100644 --- a/src/xnnpack/config-types.h +++ b/src/xnnpack/config-types.h @@ -130,6 +130,7 @@ struct xnn_reduce_config { xnn_rdsum_ukernel_fn rd_ukernel; union { xnn_init_qs8_mean_minmax_params_fn qs8_mean; + xnn_init_qu8_mean_minmax_params_fn qu8_mean; xnn_init_f32_qs8_cvt_params_fn f32_qs8_cvt; xnn_init_f16_f32acc_scale_params_fn f16_f32acc_scale; xnn_init_f16_default_params_fn f16_default; diff --git a/src/xnnpack/config.h b/src/xnnpack/config.h index 21efac46990..340c60b6cc2 100644 --- a/src/xnnpack/config.h +++ b/src/xnnpack/config.h @@ -111,6 +111,8 @@ XNN_INTERNAL const struct xnn_reduce_config* xnn_init_f32_rsum_config(); XNN_INTERNAL const struct xnn_reduce_config* xnn_init_f32_rdsum_config(); XNN_INTERNAL const struct xnn_reduce_config* xnn_init_qs8_rsum_config(); XNN_INTERNAL const struct xnn_reduce_config* xnn_init_qs8_rdsum_config(); +XNN_INTERNAL const struct xnn_reduce_config* xnn_init_qu8_rsum_config(); +XNN_INTERNAL const struct xnn_reduce_config* xnn_init_qu8_rdsum_config(); XNN_INTERNAL const struct xnn_xx_fill_config* xnn_init_xx_fill_config(); diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h index 9b300dc3920..a6d41063a59 100644 --- a/src/xnnpack/microfnptr.h +++ b/src/xnnpack/microfnptr.h @@ -2527,6 +2527,13 @@ typedef size_t (*xnn_init_qs8_mean_minmax_params_fn)( int8_t input_zero_point, int8_t output_zero_point); +typedef size_t (*xnn_init_qu8_mean_minmax_params_fn)( + struct xnn_qu8_mean_minmax_params params[XNN_MIN_ELEMENTS(1)], + float scale, + int32_t num_elements, + uint8_t input_zero_point, + uint8_t output_zero_point); + typedef size_t (*xnn_init_f32_qu8_cvt_params_fn)( struct xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)], float scale, diff --git a/src/xnnpack/microparams-init.h b/src/xnnpack/microparams-init.h index 68629199382..a1c7bb96edf 100644 --- a/src/xnnpack/microparams-init.h 
+++ b/src/xnnpack/microparams-init.h @@ -237,16 +237,26 @@ XNN_INTERNAL size_t xnn_init_f32_scale_scalar_params( struct xnn_f32_scale_params params[XNN_MIN_ELEMENTS(1)], float scale); -#define DECLARE_INIT_QS8_MEAN_MINMAX_PARAMS_FUNCTION(fn_name) \ - XNN_INTERNAL size_t fn_name( \ +#define DECLARE_INIT_QS8_MEAN_MINMAX_PARAMS_FUNCTION(fn_name) \ + XNN_INTERNAL size_t fn_name( \ struct xnn_qs8_mean_minmax_params params[XNN_MIN_ELEMENTS(1)], \ - float scale, \ - int32_t num_elements, \ - int8_t input_zero_point, \ + float scale, \ + int32_t num_elements, \ + int8_t input_zero_point, \ int8_t output_zero_point); DECLARE_INIT_QS8_MEAN_MINMAX_PARAMS_FUNCTION(xnn_init_qs8_mean_minmax_scalar_params) +#define DECLARE_INIT_QU8_MEAN_MINMAX_PARAMS_FUNCTION(fn_name) \ + XNN_INTERNAL size_t fn_name( \ + struct xnn_qu8_mean_minmax_params params[XNN_MIN_ELEMENTS(1)], \ + float scale, \ + int32_t num_elements, \ + uint8_t input_zero_point, \ + uint8_t output_zero_point); + +DECLARE_INIT_QU8_MEAN_MINMAX_PARAMS_FUNCTION(xnn_init_qu8_mean_minmax_scalar_params) + XNN_INTERNAL size_t xnn_init_f16_scaleminmax_scalar_params( struct xnn_f16_scaleminmax_params params[XNN_MIN_ELEMENTS(1)], xnn_float16 scale, diff --git a/src/xnnpack/microparams.h b/src/xnnpack/microparams.h index 8fe1a992034..0819502a99c 100644 --- a/src/xnnpack/microparams.h +++ b/src/xnnpack/microparams.h @@ -380,6 +380,15 @@ struct xnn_qs8_mean_minmax_params { } scalar; }; +struct xnn_qu8_mean_minmax_params { + struct { + float scale; + int32_t num_elements; + uint8_t input_zero_point; + uint8_t output_zero_point; + } scalar; +}; + // AvgPool w. Min+Max: used by quantized GAVGPOOL microkernels with MINMAX activation. 
union xnn_qs8_avgpool_minmax_params { diff --git a/src/xnnpack/operator-type.h b/src/xnnpack/operator-type.h index cdbe1fb7e79..0a6749d1667 100644 --- a/src/xnnpack/operator-type.h +++ b/src/xnnpack/operator-type.h @@ -130,6 +130,7 @@ enum xnn_operator_type { xnn_operator_type_mean_nd_f16, xnn_operator_type_mean_nd_f32, xnn_operator_type_mean_nd_qs8, + xnn_operator_type_mean_nd_qu8, xnn_operator_type_minimum_nd_f16, xnn_operator_type_minimum_nd_f32, xnn_operator_type_multiply_nd_f16, diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h index 7a53ae4d79a..4e7c00acfe3 100644 --- a/src/xnnpack/operator.h +++ b/src/xnnpack/operator.h @@ -260,6 +260,7 @@ struct xnn_operator { union xnn_qs8_avgpool_minmax_params qs8_gavgpool; }; struct xnn_qs8_mean_minmax_params qs8_mean; + struct xnn_qu8_mean_minmax_params qu8_mean; struct xnn_qs8_add_minmax_params qs8_add; union xnn_qs8_mul_minmax_params qs8_mul; struct xnn_qu8_add_minmax_params qu8_add; @@ -318,6 +319,7 @@ struct xnn_operator { const struct xnn_reduce_config* rsum_config; const struct xnn_unary_elementwise_config* cvt_config; const struct xnn_unary_elementwise_config* s32_f32_cvt_config; + const struct xnn_unary_elementwise_config* u32_f32_cvt_config; }; const struct xnn_gavgpool_cw_config* gavgpool_cw_config; const struct xnn_ibilinear_chw_config* ibilinear_chw_config; diff --git a/test/mean-nd.cc b/test/mean-nd.cc index da9be54d2c5..1d908515af8 100644 --- a/test/mean-nd.cc +++ b/test/mean-nd.cc @@ -658,3 +658,218 @@ TEST(MEAN_ND_QS8, reduce_6d_multithreaded) { .TestQS8(); } } + +TEST(MEAN_ND_QU8, reduce_all) { + MeanOperatorTester() + .input_shape({kDim1}) + .reduction_axes({0}) + .TestQU8(); +} + +TEST(MEAN_ND_QU8, reduce_first_axis) { + MeanOperatorTester() + .input_shape({kDim1, kDim2}) + .reduction_axes({0}) + .TestQU8(); +} + +TEST(MEAN_ND_QU8, reduce_last_axis) { + MeanOperatorTester() + .input_shape({kDim1, kDim2, kDim3}) + .reduction_axes({0,2}) + .TestQU8(); +} + +TEST(MEAN_ND_QU8, 
reduce_last_axis2) { + MeanOperatorTester() + .input_shape({kDim1, kDim2, kDim3}) + .reduction_axes({0,2}) + .TestQU8(); +} + +TEST(MEAN_ND_QU8, reduce_2d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 2); bm1++) { + const bool reduce_dim1 = (bm1 & UINT32_C(1)) != 0; + const bool reduce_dim2 = (bm1 & UINT32_C(2)) != 0; + + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + MeanOperatorTester() + .input_shape({kDim1, kDim2}) + .reduction_axes(reduction_axes) + .TestQU8(); + } +} + +TEST(MEAN_ND_QU8, reduce_3d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 3); bm1++) { + const bool reduce_dim1 = (bm1 & UINT32_C(1)) != 0; + const bool reduce_dim2 = (bm1 & UINT32_C(2)) != 0; + const bool reduce_dim3 = (bm1 & UINT32_C(4)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + + MeanOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .TestQU8(); + } +} + +TEST(MEAN_ND_QU8, reduce_4d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 4); bm1++) { + const bool reduce_dim1 = (bm1 & UINT32_C(1)) != 0; + const bool reduce_dim2 = (bm1 & UINT32_C(2)) != 0; + const bool reduce_dim3 = (bm1 & UINT32_C(4)) != 0; + const bool reduce_dim4 = (bm1 & UINT32_C(8)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3, kDim4}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + if (reduce_dim4) { + reduction_axes.push_back(3); + } + + MeanOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .TestQU8(); + 
} +} + +TEST(MEAN_ND_QU8, reduce_5d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 5); bm1++) { + const bool reduce_dim1 = (bm1 & UINT32_C(1)) != 0; + const bool reduce_dim2 = (bm1 & UINT32_C(2)) != 0; + const bool reduce_dim3 = (bm1 & UINT32_C(4)) != 0; + const bool reduce_dim4 = (bm1 & UINT32_C(8)) != 0; + const bool reduce_dim5 = (bm1 & UINT32_C(16)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3, kDim4, kDim5}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + if (reduce_dim4) { + reduction_axes.push_back(3); + } + if (reduce_dim5) { + reduction_axes.push_back(4); + } + + MeanOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .TestQU8(); + } +} + +TEST(MEAN_ND_QU8, reduce_6d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 6); bm1++) { + const bool reduce_dim1 = (bm1 & UINT32_C(1)) != 0; + const bool reduce_dim2 = (bm1 & UINT32_C(2)) != 0; + const bool reduce_dim3 = (bm1 & UINT32_C(4)) != 0; + const bool reduce_dim4 = (bm1 & UINT32_C(8)) != 0; + const bool reduce_dim5 = (bm1 & UINT32_C(16)) != 0; + const bool reduce_dim6 = (bm1 & UINT32_C(32)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3, kDim4, kDim5, kDim6}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + if (reduce_dim4) { + reduction_axes.push_back(3); + } + if (reduce_dim5) { + reduction_axes.push_back(4); + } + if (reduce_dim6) { + reduction_axes.push_back(5); + } + + MeanOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .TestQU8(); + } +} + +TEST(MEAN_ND_QU8, reduce_6d_multithreaded) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 
6); bm1++) { + const bool reduce_dim1 = (bm1 & UINT32_C(1)) != 0; + const bool reduce_dim2 = (bm1 & UINT32_C(2)) != 0; + const bool reduce_dim3 = (bm1 & UINT32_C(4)) != 0; + const bool reduce_dim4 = (bm1 & UINT32_C(8)) != 0; + const bool reduce_dim5 = (bm1 & UINT32_C(16)) != 0; + const bool reduce_dim6 = (bm1 & UINT32_C(32)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3, kDim4, kDim5, kDim6}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + if (reduce_dim4) { + reduction_axes.push_back(3); + } + if (reduce_dim5) { + reduction_axes.push_back(4); + } + if (reduce_dim6) { + reduction_axes.push_back(5); + } + + MeanOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .multithreaded(true) + .TestQU8(); + } +} diff --git a/test/mean-operator-tester.h b/test/mean-operator-tester.h index e735e09a971..fb87f40dd11 100644 --- a/test/mean-operator-tester.h +++ b/test/mean-operator-tester.h @@ -489,6 +489,152 @@ class MeanOperatorTester { } } + void TestQU8() const { + xnnpack::ReplicableRandomDevice rng; + std::uniform_int_distribution u8dist( + std::numeric_limits::min(), std::numeric_limits::max()); + + // Compute generalized shapes. + std::array input_dims; + std::array output_dims; + std::fill(input_dims.begin(), input_dims.end(), 1); + std::fill(output_dims.begin(), output_dims.end(), 1); + std::copy(input_shape().cbegin(), input_shape().cend(), input_dims.end() - num_input_dims()); + std::copy(input_dims.cbegin(), input_dims.cend(), output_dims.begin()); + for (size_t axis : reduction_axes()) { + (output_dims.end() - num_input_dims())[axis] = 1; + } + const size_t num_output_elements = + std::accumulate(output_dims.begin(), output_dims.end(), size_t(1), std::multiplies()); + + // Compute generalized strides. 
+ std::array input_strides; + std::array output_strides; + size_t input_stride = 1, output_stride = 1; + for (size_t i = XNN_MAX_TENSOR_DIMS; i != 0; i--) { + input_strides[i - 1] = input_stride; + output_strides[i - 1] = output_dims[i - 1] == 1 ? 0 : output_stride; + input_stride *= input_dims[i - 1]; + output_stride *= output_dims[i - 1]; + } + + std::vector input(XNN_EXTRA_BYTES / sizeof(int8_t) + num_input_elements()); + std::vector output(num_output_elements); + std::vector output_ref(num_output_elements); + std::vector output_ref_qu8(num_output_elements); + std::vector accumulator(num_output_elements); + for (size_t iteration = 0; iteration < iterations(); iteration++) { + std::fill(accumulator.begin(), accumulator.end(), 0); + + std::unique_ptr auto_threadpool{nullptr, pthreadpool_destroy}; + if (multithreaded()) { + const pthreadpool_t threadpool = pthreadpool_create(num_threads()); + if (pthreadpool_get_threads_count(threadpool) <= 1) { + GTEST_SKIP(); + } else { + auto_threadpool.reset(threadpool); + } + } + + std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); }); + std::fill(output.begin(), output.end(), INT8_C(0xA5)); + + const int32_t num_reduced_elements = num_input_elements() / num_output_elements; + const float mean_scale = static_cast(1.0f) / num_reduced_elements; + const float input_scale = 0.5f; + const float output_scale = 0.75f; + const uint8_t input_zero_point = u8dist(rng); + const uint8_t output_zero_point = u8dist(rng); + const uint8_t quantized_output_min = xnn_qu8_quantize(-INFINITY, output_scale, output_zero_point); + const uint8_t quantized_output_max = xnn_qu8_quantize(INFINITY, output_scale, output_zero_point); + + // Compute reference results. 
+ std::fill(output_ref.begin(), output_ref.end(), 0); + for (size_t i = 0; i < input_dims[0]; i++) { + for (size_t j = 0; j < input_dims[1]; j++) { + for (size_t k = 0; k < input_dims[2]; k++) { + for (size_t l = 0; l < input_dims[3]; l++) { + for (size_t m = 0; m < input_dims[4]; m++) { + for (size_t n = 0; n < input_dims[5]; n++) { + size_t input_idx = i * input_strides[0] + j * input_strides[1] + k * input_strides[2] + l * input_strides[3] + m * input_strides[4] + n * input_strides[5]; + size_t output_idx = i * output_strides[0] + j * output_strides[1] + k * output_strides[2] + l * output_strides[3] + m * output_strides[4] + n * output_strides[5]; + accumulator[output_idx] += static_cast(input[input_idx]); + } + } + } + } + } + } + + for (size_t idx = 0; idx < output_ref.size(); ++idx) { + output_ref[idx] = static_cast(accumulator[idx] - static_cast(input_zero_point) * num_reduced_elements); + output_ref[idx] *= input_scale * mean_scale * output_scale; + output_ref[idx] = std::min(output_ref[idx], static_cast(static_cast(quantized_output_max) - static_cast(output_zero_point))); + output_ref[idx] = std::max(output_ref[idx], static_cast(static_cast(quantized_output_min) - static_cast(output_zero_point))); + output_ref_qu8[idx] = static_cast(std::lrintf(output_ref[idx]) + static_cast(output_zero_point)); + } + + // Create, setup, run, and destroy a mean operator. + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + xnn_operator_t mean_op = nullptr; + + const xnn_status status = xnn_create_mean_nd_qu8( + input_scale * output_scale, input_zero_point, output_zero_point, + /*flags=*/0, &mean_op); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, mean_op); + + // Smart pointer to automatically delete mean_op. 
+ std::unique_ptr auto_mean_op(mean_op, xnn_delete_operator); + + size_t workspace_size = SIZE_MAX; + size_t workspace_alignment = SIZE_MAX; + ASSERT_EQ(xnn_status_success, + xnn_reshape_mean_nd_qu8( + mean_op, + num_reduction_axes(), + reduction_axes().data(), + num_input_dims(), + input_shape().data(), + &workspace_size, &workspace_alignment, + auto_threadpool.get())); + + ASSERT_NE(workspace_size, SIZE_MAX); + ASSERT_LE(workspace_alignment, XNN_ALLOCATION_ALIGNMENT); + std::vector> workspace(workspace_size); + + ASSERT_EQ(xnn_status_success, + xnn_setup_mean_nd_qu8( + mean_op, + workspace.data(), + input.data(), output.data())); + + ASSERT_EQ(xnn_status_success, + xnn_run_operator(mean_op, auto_threadpool.get())); + + // Verify results. + for (size_t i = 0; i < output_dims[0]; i++) { + for (size_t j = 0; j < output_dims[1]; j++) { + for (size_t k = 0; k < output_dims[2]; k++) { + for (size_t l = 0; l < output_dims[3]; l++) { + for (size_t m = 0; m < output_dims[4]; m++) { + for (size_t n = 0; n < output_dims[5]; n++) { + const size_t index = + i * output_strides[0] + j * output_strides[1] + k * output_strides[2] + l * output_strides[3] + m * output_strides[4] + n * output_strides[5]; + ASSERT_EQ(output[index], output_ref_qu8[index]) + << "(i, j, k, l, m, n) = (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ")"; + } + } + } + } + } + } + } + } + private: std::vector input_shape_; std::vector reduction_axes_; From 1f9e45698eeb644557dfe400203e7591d48491db Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Mon, 23 Sep 2024 10:45:12 -0700 Subject: [PATCH 31/50] Use a command line parameter instead of generating N benchmarks per core - Using the benchmark parameters for threads means benchmarks can't make other parameters easily. - It's pretty annoying to have each benchmark run dozens of times, with likely not very useful number of threads. 
PiperOrigin-RevId: 677855629 --- bench/models/benchmark.cc | 50 +++++++++++++++++++++++++++------------ bench/utils.cc | 30 ----------------------- bench/utils.h | 3 --- 3 files changed, 35 insertions(+), 48 deletions(-) diff --git a/bench/models/benchmark.cc b/bench/models/benchmark.cc index de46af8b8f0..2e7409fd4d2 100644 --- a/bench/models/benchmark.cc +++ b/bench/models/benchmark.cc @@ -5,10 +5,12 @@ #include +#include #include #include #include #include +#include #include #include @@ -19,6 +21,8 @@ #include "xnnpack/subgraph.h" #include "pthreadpool.h" +int FLAGS_num_threads = 1; + struct ModelRuntime { std::unique_ptr model; pthreadpool_t threadpool = nullptr; @@ -87,7 +91,7 @@ static void BenchmarkInvoke(benchmark::State& state, return; } - ModelRuntime model_runtime(state.range(0)); + ModelRuntime model_runtime(FLAGS_num_threads); if (!model_runtime.CreateModel(model_factory)) { state.SkipWithError("failed to create model"); return; @@ -162,18 +166,34 @@ static void QS8MobileNetV2(benchmark::State& state) { BenchmarkInvoke(state, models::QS8MobileNetV2); } -BENCHMARK(FP32MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(FP32MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(FP32MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(FP32MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); - -BENCHMARK(FP16MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(FP16MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); -BENCHMARK(FP16MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); 
-BENCHMARK(FP16MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); - -BENCHMARK(QS8MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime(); +BENCHMARK(FP32MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime(); +BENCHMARK(FP32MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime(); +BENCHMARK(FP32MobileNetV3Large)->Unit(benchmark::kMicrosecond)->UseRealTime(); +BENCHMARK(FP32MobileNetV3Small)->Unit(benchmark::kMicrosecond)->UseRealTime(); + +BENCHMARK(FP16MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime(); +BENCHMARK(FP16MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime(); +BENCHMARK(FP16MobileNetV3Large)->Unit(benchmark::kMicrosecond)->UseRealTime(); +BENCHMARK(FP16MobileNetV3Small)->Unit(benchmark::kMicrosecond)->UseRealTime(); + +BENCHMARK(QS8MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime(); + +int main(int argc, char** argv) { + ::benchmark::Initialize(&argc, argv); + for (int i = 1; i < argc;) { + if (strncmp(argv[i], "--num_threads=", 14) == 0) { + FLAGS_num_threads = atoi(argv[i] + 14); + if (FLAGS_num_threads <= 0) { + std::cerr << "Invalid --num_threads: " << FLAGS_num_threads << "\n"; + return 1; + } + std::copy(argv + i + 1, argv + argc, argv + i); + argc -= 1; + } else { + ++i; + } + } + if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; + ::benchmark::RunSpecifiedBenchmarks(); +} -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/utils.cc b/bench/utils.cc index 71b763a749c..d612d964cd1 100644 --- a/bench/utils.cc +++ b/bench/utils.cc @@ -152,36 +152,6 @@ size_t GetMaxCacheSize() { return max_cache_size; } -void MultiThreadingParameters(benchmark::internal::Benchmark* benchmark) { - benchmark->ArgName("T"); - - // Disabled thread pool (execution on the caller thread only). 
- benchmark->Arg(1); - - #if XNN_ENABLE_CPUINFO - if (cpuinfo_initialize()) { - // All cores except the little ones. - uint32_t max_cores = cpuinfo_get_cores_count(); - if (cpuinfo_get_clusters_count() > 1) { - max_cores -= cpuinfo_get_cluster(cpuinfo_get_clusters_count() - 1)->core_count; - } - for (uint32_t t = 2; t <= max_cores; t++) { - benchmark->Arg(t); - } - - // All cores (if more than one cluster). - if (cpuinfo_get_cores_count() > max_cores) { - benchmark->Arg(cpuinfo_get_cores_count()); - } - - // All cores + hyperthreads (only if hyperthreading supported). - if (cpuinfo_get_processors_count() > cpuinfo_get_cores_count()) { - benchmark->Arg(cpuinfo_get_processors_count()); - } - } - #endif // XNN_ENABLE_CPUINFO -} - bool CheckArchFlags(benchmark::State& state, uint64_t arch_flags) { const xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == nullptr) { diff --git a/bench/utils.h b/bench/utils.h index 00156c02522..a6c7a250a92 100644 --- a/bench/utils.h +++ b/bench/utils.h @@ -93,9 +93,6 @@ void BinaryElementwiseParameters(benchmark::internal::Benchmark* benchmark) { benchmark->Arg(characteristic_l2 / elementwise_size / 960 * 960); } -// Set multi-threading parameters appropriate for the processor. -void MultiThreadingParameters(benchmark::internal::Benchmark* benchmark); - using IsaCheckFunction = std::function; // Check if the architecture flags are supported. 
From 0dfa0d88817a1774f4fc6fd425cf84a3bd60686e Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Mon, 23 Sep 2024 12:54:30 -0700 Subject: [PATCH 32/50] F32_RADDSTOREEXPMINUSMAX AVX2/AVX512 RR1 remove ABC - AVX use 16 entry mask table (2 vectors) PiperOrigin-RevId: 677905657 --- src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in | 5 ++--- src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in | 4 ++-- src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in | 1 - .../gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc2.c | 4 ++-- .../gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc4.c | 4 ++-- .../gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32.c | 4 ++-- .../gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc2.c | 4 ++-- .../gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc4.c | 4 ++-- .../gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64.c | 4 ++-- .../gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72-acc3.c | 4 ++-- .../gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72.c | 4 ++-- .../gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc2.c | 4 ++-- .../gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc5.c | 4 ++-- .../gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80.c | 4 ++-- .../gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc2.c | 4 ++-- .../gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc3.c | 4 ++-- .../gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc6.c | 4 ++-- .../gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96.c | 4 ++-- 18 files changed, 34 insertions(+), 36 deletions(-) diff --git a/src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in b/src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in index 1a09edb3f1f..0bb62d22b12 100644 --- a/src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in +++ b/src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in @@ -6,7 +6,6 @@ $assert BATCH_TILE % 8 == 0 $assert BATCH_TILE >= 8 $SIMD_TILE = BATCH_TILE // 8 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include @@ -29,7 +28,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u${BATCH_TILE}${"" if ACC assert(output != NULL); assert(sum 
!= NULL); - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); @@ -146,7 +145,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u${BATCH_TILE}${"" if ACC if (batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); const __m256 vi = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in b/src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in index 733e6489813..3ba462b9bed 100644 --- a/src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in +++ b/src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in @@ -60,7 +60,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u${BATCH_TILE}${"" if ACC // Load ${BATCH_TILE} (${SIMD_TILE}x4) inputs at a time. const __m256 vi0 = _mm256_loadu_ps(input); $for N in range(1, SIMD_TILE): - const __m256 vi${N} = _mm256_loadu_ps(input + ${N*8}); + const __m256 vi${N} = _mm256_loadu_ps(input + ${N * 8}); input += ${BATCH_TILE}; // Subtract maximum input x := i - i_max. This implies x <= 0. @@ -119,7 +119,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u${BATCH_TILE}${"" if ACC // Store ${BATCH_TILE} (${SIMD_TILE}x4) outputs at a time. _mm256_storeu_ps(output, vf0); $for N in range(1, SIMD_TILE): - _mm256_storeu_ps(output + ${N*8}, vf${N}); + _mm256_storeu_ps(output + ${N * 8}, vf${N}); output += ${BATCH_TILE}; // Accumulate computed exponents. 
diff --git a/src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in b/src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in index bc955ca3256..f5481f2bb67 100644 --- a/src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in +++ b/src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in @@ -6,7 +6,6 @@ $assert BATCH_TILE % 16 == 0 $assert BATCH_TILE >= 16 $SIMD_TILE = BATCH_TILE // 16 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc2.c index c943293f5a1..21ddfdc3e04 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc2.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc2.c @@ -29,7 +29,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc2( assert(output != NULL); assert(sum != NULL); - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); @@ -168,7 +168,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc2( if (batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); const __m256 vi = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc4.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc4.c index 7a6a1468e15..08d69215e9a 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc4.c 
+++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc4.c @@ -29,7 +29,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc4( assert(output != NULL); assert(sum != NULL); - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); @@ -172,7 +172,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc4( if (batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); const __m256 vi = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32.c index de5b0d96306..0abda8dc5ba 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32.c @@ -29,7 +29,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32( assert(output != NULL); assert(sum != NULL); - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); @@ -166,7 +166,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32( if (batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - 
batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); const __m256 vi = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc2.c index 3a1fc3d5f5a..1d9e4b79e14 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc2.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc2.c @@ -29,7 +29,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc2( assert(output != NULL); assert(sum != NULL); - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); @@ -228,7 +228,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc2( if (batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); const __m256 vi = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc4.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc4.c index 7da88606433..6be24aa0635 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc4.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc4.c @@ -29,7 +29,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc4( assert(output != NULL); assert(sum != NULL); - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 
0, 0, 0, 0}; + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); @@ -232,7 +232,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc4( if (batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); const __m256 vi = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64.c index 5b0cea20ba2..d163e68f9cc 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64.c @@ -29,7 +29,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64( assert(output != NULL); assert(sum != NULL); - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); @@ -226,7 +226,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64( if (batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); const __m256 vi = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72-acc3.c 
b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72-acc3.c index 42df44f687f..dc41eb737b7 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72-acc3.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72-acc3.c @@ -29,7 +29,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u72_acc3( assert(output != NULL); assert(sum != NULL); - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); @@ -245,7 +245,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u72_acc3( if (batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); const __m256 vi = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72.c index 830e1df08d3..ad00f6f05db 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72.c @@ -29,7 +29,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u72( assert(output != NULL); assert(sum != NULL); - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); @@ -241,7 +241,7 @@ void 
xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u72( if (batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); const __m256 vi = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc2.c index 46ef1fa0dc0..3e63283c08d 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc2.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc2.c @@ -29,7 +29,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80_acc2( assert(output != NULL); assert(sum != NULL); - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); @@ -258,7 +258,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80_acc2( if (batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); const __m256 vi = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc5.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc5.c index e0b44714182..cab22fc7376 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc5.c +++ 
b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc5.c @@ -29,7 +29,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80_acc5( assert(output != NULL); assert(sum != NULL); - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); @@ -264,7 +264,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80_acc5( if (batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); const __m256 vi = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80.c index 8ab59dc2882..82abac46a59 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80.c @@ -29,7 +29,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80( assert(output != NULL); assert(sum != NULL); - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); @@ -256,7 +256,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80( if (batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - 
batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); const __m256 vi = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc2.c index 9a48b35224c..5bc716258d9 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc2.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc2.c @@ -29,7 +29,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc2( assert(output != NULL); assert(sum != NULL); - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); @@ -288,7 +288,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc2( if (batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); const __m256 vi = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc3.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc3.c index 184ad63004b..f9340744efd 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc3.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc3.c @@ -29,7 +29,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc3( assert(output != NULL); assert(sum != NULL); - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 
0, 0, 0, 0}; + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); @@ -290,7 +290,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc3( if (batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); const __m256 vi = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc6.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc6.c index 4ea0faac5ed..89c609fe60e 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc6.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc6.c @@ -29,7 +29,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc6( assert(output != NULL); assert(sum != NULL); - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); @@ -296,7 +296,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc6( if (batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); const __m256 vi = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96.c 
b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96.c index 0abf7a1a196..8060155fe95 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96.c @@ -29,7 +29,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96( assert(output != NULL); assert(sum != NULL); - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); @@ -286,7 +286,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96( if (batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); const __m256 vi = _mm256_maskload_ps(input, vmask); From 5db85d127bfd1e4f75135a1221990ec2449857e9 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Mon, 23 Sep 2024 13:10:35 -0700 Subject: [PATCH 33/50] Enable xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2 for softmax PiperOrigin-RevId: 677911402 --- cmake/gen/avx2_microkernels.cmake | 2 +- gen/avx2_microkernels.bzl | 2 +- src/configs/raddstoreexpminusmax-config.c | 27 ++++++++++++----------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/cmake/gen/avx2_microkernels.cmake b/cmake/gen/avx2_microkernels.cmake index 480c58761d7..d7906685e70 100644 --- a/cmake/gen/avx2_microkernels.cmake +++ b/cmake/gen/avx2_microkernels.cmake @@ -25,6 +25,7 @@ SET(PROD_AVX2_MICROKERNEL_SRCS src/f32-qc8w-gemm/gen/f32-qc8w-gemm-5x16-minmax-avx2-broadcast.c src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u64.c src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u64.c + 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc2.c src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-u56.c src/f32-vlog/gen/f32-vlog-avx2-rational-3-3-div.c src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-u40.c @@ -248,7 +249,6 @@ SET(NON_PROD_AVX2_MICROKERNEL_SRCS src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc3.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc6.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc2.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc4.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc2.c diff --git a/gen/avx2_microkernels.bzl b/gen/avx2_microkernels.bzl index d35b73e0b58..4aa456a5565 100644 --- a/gen/avx2_microkernels.bzl +++ b/gen/avx2_microkernels.bzl @@ -21,6 +21,7 @@ PROD_AVX2_MICROKERNEL_SRCS = [ "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-5x16-minmax-avx2-broadcast.c", "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u64.c", "src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u64.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc2.c", "src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-u56.c", "src/f32-vlog/gen/f32-vlog-avx2-rational-3-3-div.c", "src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-u40.c", @@ -245,7 +246,6 @@ NON_PROD_AVX2_MICROKERNEL_SRCS = [ "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc3.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc6.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc2.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc4.c", 
"src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc2.c", diff --git a/src/configs/raddstoreexpminusmax-config.c b/src/configs/raddstoreexpminusmax-config.c index 1efd36693bf..6fd1c5d4f07 100644 --- a/src/configs/raddstoreexpminusmax-config.c +++ b/src/configs/raddstoreexpminusmax-config.c @@ -24,24 +24,21 @@ static void init_f16_raddstoreexpminusmax_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_raddstoreexpminusmax_config.ukernel = - (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u32; + f16_raddstoreexpminusmax_config.ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u32; f16_raddstoreexpminusmax_config.element_tile = 32; } #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_raddstoreexpminusmax_config.ukernel = - (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u40; + f16_raddstoreexpminusmax_config.ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u40; f16_raddstoreexpminusmax_config.element_tile = 40; } #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_x86_avx2) { - f16_raddstoreexpminusmax_config.ukernel = - (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_u40; + f16_raddstoreexpminusmax_config.ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) 
xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_u40; f16_raddstoreexpminusmax_config.element_tile = 40; } #endif @@ -65,9 +62,15 @@ static void init_f32_raddstoreexpminusmax_config(void) { (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16; f32_raddstoreexpminusmax_config.element_tile = 16; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 - f32_raddstoreexpminusmax_config.ukernel = - (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc2; - f32_raddstoreexpminusmax_config.element_tile = 20; + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->use_x86_avx2) { + f32_raddstoreexpminusmax_config.ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2; + f32_raddstoreexpminusmax_config.element_tile = 32; + } else { + f32_raddstoreexpminusmax_config.ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc2; + f32_raddstoreexpminusmax_config.element_tile = 20; + } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_WASMRELAXEDSIMD f32_raddstoreexpminusmax_config.ukernel = @@ -80,12 +83,10 @@ static void init_f32_raddstoreexpminusmax_config(void) { #endif #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - f32_raddstoreexpminusmax_config.ukernel = - (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u4v; + f32_raddstoreexpminusmax_config.ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u4v; f32_raddstoreexpminusmax_config.element_tile = hardware_config->vlenb; // VLENB * (4 / sizeof(float)) #else - f32_raddstoreexpminusmax_config.ukernel = - (xnn_raddstoreexpminusmax_ukernel_fn) 
xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u4_acc2; + f32_raddstoreexpminusmax_config.ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u4_acc2; f32_raddstoreexpminusmax_config.element_tile = 4; #endif } From 9081a13436df39eda9d04c82fc5ea9eff7ed6f12 Mon Sep 17 00:00:00 2001 From: XNNPACK Team Date: Mon, 23 Sep 2024 21:19:30 -0700 Subject: [PATCH 34/50] Keep stride for pre-packed weights. PiperOrigin-RevId: 678060967 --- src/operators/batch-matrix-multiply-nc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/operators/batch-matrix-multiply-nc.c b/src/operators/batch-matrix-multiply-nc.c index 9e102ab34a8..b95676b341e 100644 --- a/src/operators/batch-matrix-multiply-nc.c +++ b/src/operators/batch-matrix-multiply-nc.c @@ -164,6 +164,7 @@ enum xnn_status xnn_create_batch_matrix_multiply_nc_f32_const_weights( // Allocate the packed weights. void* packed_data = xnn_get_pointer_to_write_weights( batch_matrix_multiply_op, aligned_size, /*padding_byte=*/0); + batch_matrix_multiply_op->weights_stride = input_b_batch_stride / n_stride; if (packed_data == NULL) { xnn_log_error( "failed to allocate %zu bytes for %s operator packed weights", From c2f0eead56ee1726533486a3c7e437cf8d90418c Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Tue, 24 Sep 2024 01:53:43 -0700 Subject: [PATCH 35/50] Rename xnn_dynamic_quantization_params -> xnn_quantization_params PiperOrigin-RevId: 678139667 --- include/xnnpack.h | 30 ++++++++--------- src/operators/batch-matrix-multiply-nc.c | 4 +-- src/operators/convolution-nhwc.c | 4 +-- src/operators/deconvolution-nhwc.c | 2 +- src/operators/fully-connected-nc.c | 12 +++---- src/operators/unary-elementwise-nc.c | 4 +-- src/tensor.c | 2 +- src/xnnpack/subgraph.h | 6 ++-- test/batch-matrix-multiply-operator-tester.h | 2 +- test/batch-matrix-multiply.cc | 2 +- test/convert-operator-tester.h | 4 +-- test/convolution-2d.cc | 4 +-- test/convolution-operator-tester.h | 16 ++++----- 
test/deconvolution-2d.cc | 2 +- test/deconvolution-operator-tester.h | 4 +-- test/fully-connected-operator-tester.h | 24 +++++++------- test/fully-connected.cc | 34 ++++++++++---------- test/workspace.cc | 2 +- 18 files changed, 79 insertions(+), 79 deletions(-) diff --git a/include/xnnpack.h b/include/xnnpack.h index 772b6513425..f8062ac894b 100644 --- a/include/xnnpack.h +++ b/include/xnnpack.h @@ -94,8 +94,8 @@ extern "C" { // Next unused flag value: 0x00000100. -/// The number of entries in an array of xnn_dynamic_quantization_params that XNNPACK may read beyond array bounds. -/// The caller must allocate at least this many extra xnn_dynamic_quantization_params before passing the array to XNNPACK. +/// The number of entries in an array of xnn_quantization_params that XNNPACK may read beyond array bounds. +/// The caller must allocate at least this many extra xnn_quantization_params before passing the array to XNNPACK. /// /// Note: XNNPACK reads, but never writes beyond array bounds. #define XNN_EXTRA_QUANTIZATION_PARAMS 10 @@ -103,7 +103,7 @@ extern "C" { /// The minimum blocksize for blockwise quantized operators. 
#define XNN_MIN_BLOCKSIZE 32 -struct xnn_dynamic_quantization_params { +struct xnn_quantization_params { int32_t zero_point; float scale; }; @@ -2642,7 +2642,7 @@ enum xnn_status xnn_reshape_batch_matrix_multiply_nc_qd8_f32_qc8w( enum xnn_status xnn_setup_batch_matrix_multiply_nc_qd8_f32_qc8w( xnn_operator_t batch_matrix_multiply_op, const int8_t* input_a, - const struct xnn_dynamic_quantization_params* quantization_params, + const struct xnn_quantization_params* quantization_params, float* output); enum xnn_status xnn_create_ceiling_nc_f16( @@ -2944,7 +2944,7 @@ enum xnn_status xnn_setup_convert_nc_f16_qd8( xnn_operator_t convert_op, const void* input, int8_t* output, - struct xnn_dynamic_quantization_params* quantization_params); + struct xnn_quantization_params* quantization_params); enum xnn_status xnn_create_convert_nc_f32_qd8( uint32_t flags, @@ -2963,7 +2963,7 @@ enum xnn_status xnn_setup_convert_nc_f32_qd8( xnn_operator_t convert_op, const float* input, int8_t* output, - struct xnn_dynamic_quantization_params* quantization_params); + struct xnn_quantization_params* quantization_params); enum xnn_status xnn_create_convert_nc_f32_f16( uint32_t flags, @@ -3485,12 +3485,12 @@ enum xnn_status xnn_reshape_convolution2d_nhwc_qs8( enum xnn_status xnn_setup_convolution2d_nhwc_qd8_f16_qc8w( xnn_operator_t convolution_op, void* workspace, const int8_t* input, void* output, - const struct xnn_dynamic_quantization_params* quantization_params); + const struct xnn_quantization_params* quantization_params); enum xnn_status xnn_setup_convolution2d_nhwc_qd8_f32_qc8w( xnn_operator_t convolution_op, void* workspace, const int8_t* input, float* output, - const struct xnn_dynamic_quantization_params* quantization_params); + const struct xnn_quantization_params* quantization_params); enum xnn_status xnn_setup_convolution2d_nhwc_qs8( xnn_operator_t convolution_op, @@ -3777,7 +3777,7 @@ enum xnn_status xnn_setup_deconvolution2d_nhwc_qd8_f32_qc8w( xnn_operator_t deconvolution_op, 
const int8_t* input, float* output, - const struct xnn_dynamic_quantization_params* quantization_params); + const struct xnn_quantization_params* quantization_params); enum xnn_status xnn_create_deconvolution2d_nhwc_qs8( uint32_t output_padding_top, @@ -4419,7 +4419,7 @@ enum xnn_status xnn_setup_fully_connected_nc_qd8_f16_qc4w( xnn_operator_t fully_connected_op, const int8_t* input, void* output, - const struct xnn_dynamic_quantization_params* quantization_params); + const struct xnn_quantization_params* quantization_params); enum xnn_status xnn_reshape_fully_connected_nc_qd8_f16_qc4w( xnn_operator_t fully_connected_op, @@ -4452,7 +4452,7 @@ enum xnn_status xnn_setup_fully_connected_nc_qd8_f16_qb4w( xnn_operator_t fully_connected_op, const int8_t* input, void* output, - const struct xnn_dynamic_quantization_params* quantization_params); + const struct xnn_quantization_params* quantization_params); enum xnn_status xnn_create_fully_connected_nc_qd8_f32_qc4w( size_t input_channels, @@ -4474,7 +4474,7 @@ enum xnn_status xnn_setup_fully_connected_nc_qd8_f32_qc4w( xnn_operator_t fully_connected_op, const int8_t* input, float* output, - const struct xnn_dynamic_quantization_params* quantization_params); + const struct xnn_quantization_params* quantization_params); enum xnn_status xnn_reshape_fully_connected_nc_qd8_f32_qc4w( xnn_operator_t fully_connected_op, @@ -4507,7 +4507,7 @@ enum xnn_status xnn_setup_fully_connected_nc_qd8_f32_qb4w( xnn_operator_t fully_connected_op, const int8_t* input, float* output, - const struct xnn_dynamic_quantization_params* quantization_params); + const struct xnn_quantization_params* quantization_params); enum xnn_status xnn_create_fully_connected_nc_qd8_f16_qc8w( size_t input_channels, @@ -4528,7 +4528,7 @@ enum xnn_status xnn_setup_fully_connected_nc_qd8_f16_qc8w( xnn_operator_t fully_connected_op, const int8_t* input, void* output, - const struct xnn_dynamic_quantization_params* quantization_params); + const struct 
xnn_quantization_params* quantization_params); enum xnn_status xnn_reshape_fully_connected_nc_qd8_f16_qc8w( xnn_operator_t fully_connected_op, @@ -4554,7 +4554,7 @@ enum xnn_status xnn_setup_fully_connected_nc_qd8_f32_qc8w( xnn_operator_t fully_connected_op, const int8_t* input, float* output, - const struct xnn_dynamic_quantization_params* quantization_params); + const struct xnn_quantization_params* quantization_params); enum xnn_status xnn_reshape_fully_connected_nc_qd8_f32_qc8w( xnn_operator_t fully_connected_op, diff --git a/src/operators/batch-matrix-multiply-nc.c b/src/operators/batch-matrix-multiply-nc.c index b95676b341e..268b6ffcbaa 100644 --- a/src/operators/batch-matrix-multiply-nc.c +++ b/src/operators/batch-matrix-multiply-nc.c @@ -726,7 +726,7 @@ enum xnn_status xnn_reshape_batch_matrix_multiply_nc_qd8_f32_qc8w( static enum xnn_status setup_batch_matrix_multiply_nc( xnn_operator_t batch_matrix_multiply_op, enum xnn_operator_type expected_operator_type, const void* input_a, - const struct xnn_dynamic_quantization_params* quantization_params, + const struct xnn_quantization_params* quantization_params, const void* input_b, void* packed_weights, void* output) { if (batch_matrix_multiply_op->type != expected_operator_type) { xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)", @@ -797,7 +797,7 @@ enum xnn_status xnn_setup_batch_matrix_multiply_nc_f32( enum xnn_status xnn_setup_batch_matrix_multiply_nc_qd8_f32_qc8w( xnn_operator_t batch_matrix_multiply_op, const int8_t* input_a, - const struct xnn_dynamic_quantization_params* quantization_params, + const struct xnn_quantization_params* quantization_params, float* output) { return setup_batch_matrix_multiply_nc( batch_matrix_multiply_op, diff --git a/src/operators/convolution-nhwc.c b/src/operators/convolution-nhwc.c index 2b9fc87be14..0125b6cb3cb 100644 --- a/src/operators/convolution-nhwc.c +++ b/src/operators/convolution-nhwc.c @@ -2754,7 +2754,7 @@ enum xnn_status 
xnn_setup_convolution2d_nhwc_qd8_f16_qc8w( void* workspace, const int8_t* input, void* output, - const struct xnn_dynamic_quantization_params* quantization_params) + const struct xnn_quantization_params* quantization_params) { return setup_convolution2d_nhwc( convolution_op, xnn_operator_type_convolution_nhwc_qd8_f16_qc8w, @@ -2767,7 +2767,7 @@ enum xnn_status xnn_setup_convolution2d_nhwc_qd8_f32_qc8w( void* workspace, const int8_t* input, float* output, - const struct xnn_dynamic_quantization_params* quantization_params) + const struct xnn_quantization_params* quantization_params) { return setup_convolution2d_nhwc( convolution_op, xnn_operator_type_convolution_nhwc_qd8_f32_qc8w, diff --git a/src/operators/deconvolution-nhwc.c b/src/operators/deconvolution-nhwc.c index 1fee7b9ad0b..60a44bc158b 100644 --- a/src/operators/deconvolution-nhwc.c +++ b/src/operators/deconvolution-nhwc.c @@ -2001,7 +2001,7 @@ enum xnn_status xnn_setup_deconvolution2d_nhwc_qd8_f32_qc8w( xnn_operator_t deconvolution_op, const int8_t* input, float* output, - const struct xnn_dynamic_quantization_params* quantization_params) + const struct xnn_quantization_params* quantization_params) { return setup_deconvolution2d_nhwc(deconvolution_op, xnn_operator_type_deconvolution_nhwc_qd8_f32_qc8w, input, quantization_params, output); } diff --git a/src/operators/fully-connected-nc.c b/src/operators/fully-connected-nc.c index fe5a4f4d26f..ae65fa290fc 100644 --- a/src/operators/fully-connected-nc.c +++ b/src/operators/fully-connected-nc.c @@ -2212,7 +2212,7 @@ enum xnn_status xnn_setup_fully_connected_nc_qd8_f16_qc4w( xnn_operator_t fully_connected_op, const int8_t* input, void* output, - const struct xnn_dynamic_quantization_params* quantization_params) + const struct xnn_quantization_params* quantization_params) { return setup_fully_connected_nc( fully_connected_op, xnn_operator_type_fully_connected_nc_qd8_f16_qc4w, @@ -2223,7 +2223,7 @@ enum xnn_status xnn_setup_fully_connected_nc_qd8_f16_qb4w( 
xnn_operator_t fully_connected_op, const int8_t* input, void* output, - const struct xnn_dynamic_quantization_params* quantization_params) + const struct xnn_quantization_params* quantization_params) { return setup_fully_connected_nc( fully_connected_op, xnn_operator_type_fully_connected_nc_qd8_f16_qb4w, @@ -2234,7 +2234,7 @@ enum xnn_status xnn_setup_fully_connected_nc_qd8_f32_qc4w( xnn_operator_t fully_connected_op, const int8_t* input, float* output, - const struct xnn_dynamic_quantization_params* quantization_params) + const struct xnn_quantization_params* quantization_params) { return setup_fully_connected_nc( fully_connected_op, xnn_operator_type_fully_connected_nc_qd8_f32_qc4w, @@ -2245,7 +2245,7 @@ enum xnn_status xnn_setup_fully_connected_nc_qd8_f32_qb4w( xnn_operator_t fully_connected_op, const int8_t* input, float* output, - const struct xnn_dynamic_quantization_params* quantization_params) + const struct xnn_quantization_params* quantization_params) { return setup_fully_connected_nc( fully_connected_op, xnn_operator_type_fully_connected_nc_qd8_f32_qb4w, @@ -2256,7 +2256,7 @@ enum xnn_status xnn_setup_fully_connected_nc_qd8_f16_qc8w( xnn_operator_t fully_connected_op, const int8_t* input, void* output, - const struct xnn_dynamic_quantization_params* quantization_params) + const struct xnn_quantization_params* quantization_params) { return setup_fully_connected_nc( fully_connected_op, xnn_operator_type_fully_connected_nc_qd8_f16_qc8w, @@ -2274,7 +2274,7 @@ enum xnn_status xnn_setup_fully_connected_nc_qd8_f32_qc8w( xnn_operator_t fully_connected_op, const int8_t* input, float* output, - const struct xnn_dynamic_quantization_params* quantization_params) + const struct xnn_quantization_params* quantization_params) { return setup_fully_connected_nc( fully_connected_op, xnn_operator_type_fully_connected_nc_qd8_f32_qc8w, diff --git a/src/operators/unary-elementwise-nc.c b/src/operators/unary-elementwise-nc.c index 7fa26e977cb..0cd6baa9955 100644 --- 
a/src/operators/unary-elementwise-nc.c +++ b/src/operators/unary-elementwise-nc.c @@ -2679,7 +2679,7 @@ enum xnn_status xnn_setup_convert_nc_f16_qd8( xnn_operator_t convert_op, const void* input, int8_t* output, - struct xnn_dynamic_quantization_params* quantization_params) + struct xnn_quantization_params* quantization_params) { if (convert_op->type != xnn_operator_type_convert_nc_f16_qd8) { xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)", @@ -2715,7 +2715,7 @@ enum xnn_status xnn_setup_convert_nc_f32_qd8( xnn_operator_t convert_op, const float* input, int8_t* output, - struct xnn_dynamic_quantization_params* quantization_params) + struct xnn_quantization_params* quantization_params) { if (convert_op->type != xnn_operator_type_convert_nc_f32_qd8) { xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)", diff --git a/src/tensor.c b/src/tensor.c index 70b5cec0360..447c6faeb6c 100644 --- a/src/tensor.c +++ b/src/tensor.c @@ -660,7 +660,7 @@ size_t xnn_tensor_get_dynamic_quant_param_size(const struct xnn_value* value) case xnn_datatype_qdint8: { const size_t batch_dims_size = xnn_shape_multiply_batch_dims( &value->shape, value->quantization.num_nonbatch_dims); - return batch_dims_size * sizeof(struct xnn_dynamic_quantization_params); + return batch_dims_size * sizeof(struct xnn_quantization_params); } case xnn_datatype_qpint8: return 0; diff --git a/src/xnnpack/subgraph.h b/src/xnnpack/subgraph.h index aa8ea5aaead..8b3b1205626 100644 --- a/src/xnnpack/subgraph.h +++ b/src/xnnpack/subgraph.h @@ -97,8 +97,8 @@ struct xnn_value { /// Number of non-batch dimensions. 1 for FC, 3 for Conv2D. size_t num_nonbatch_dims; /// Per-batch quantization parameters factor to convert quantized elements to real representation. 
- struct xnn_dynamic_quantization_params* dynamic_params; - /// Number of (struct xnn_dynamic_quantization_params) * sizeof(struct xnn_dynamic_quantization_params) + struct xnn_quantization_params* dynamic_params; + /// Number of (struct xnn_quantization_params) * sizeof(struct xnn_quantization_params) size_t dynamic_params_size; }; }; @@ -545,7 +545,7 @@ XNN_INLINE static size_t xnn_tensor_get_rounded_dynamic_quant_param_size(const s // We may read out of bounds for qparams. return xnn_get_rounded_size(value->quantization.dynamic_params_size - + XNN_EXTRA_QUANTIZATION_PARAMS * sizeof(struct xnn_dynamic_quantization_params)); + + XNN_EXTRA_QUANTIZATION_PARAMS * sizeof(struct xnn_quantization_params)); } diff --git a/test/batch-matrix-multiply-operator-tester.h b/test/batch-matrix-multiply-operator-tester.h index bdffb620eb6..708afeddbb8 100644 --- a/test/batch-matrix-multiply-operator-tester.h +++ b/test/batch-matrix-multiply-operator-tester.h @@ -468,7 +468,7 @@ class BatchMatMulOperatorTester { // Create the dynamically quantized input data with the corresponding // `quantization_params`. - std::vector quantization_params( + std::vector quantization_params( batch_size_a * m() + XNN_EXTRA_QUANTIZATION_PARAMS); std::vector input_a_qd8(batch_size_a * m() * k() + XNN_EXTRA_BYTES / sizeof(int8_t)); diff --git a/test/batch-matrix-multiply.cc b/test/batch-matrix-multiply.cc index e42d71c18d4..55fbe9eeb3e 100644 --- a/test/batch-matrix-multiply.cc +++ b/test/batch-matrix-multiply.cc @@ -580,7 +580,7 @@ TEST_F(BatchMatrixMultiplyTestQD8ToF32, matches_operator_api) { // Create the dynamically quantized input data with the corresponding // `quantization_params`. 
const size_t input_batch_size = NumElements(input1_dims) / m / k; - std::vector quantization_params( + std::vector quantization_params( input_batch_size * m + XNN_EXTRA_QUANTIZATION_PARAMS); std::vector input1_f32(NumElements(input1_dims) + XNN_EXTRA_BYTES / sizeof(float)); diff --git a/test/convert-operator-tester.h b/test/convert-operator-tester.h index 99646e1b6bc..78724161ed6 100644 --- a/test/convert-operator-tester.h +++ b/test/convert-operator-tester.h @@ -238,7 +238,7 @@ class ConvertOperatorTester { channels()); std::vector output((batch_size() - 1) * output_stride() + channels()); - std::vector quantization_params( + std::vector quantization_params( batch_size() + XNN_EXTRA_QUANTIZATION_PARAMS); std::uniform_real_distribution range_dist(-10, 10); for (size_t iteration = 0; iteration < iterations(); iteration++) { @@ -313,7 +313,7 @@ class ConvertOperatorTester { std::vector input(XNN_EXTRA_BYTES / sizeof(float) + (batch_size() - 1) * input_stride() + channels()); std::vector output((batch_size() - 1) * output_stride() + channels()); - std::vector quantization_params(batch_size() + XNN_EXTRA_QUANTIZATION_PARAMS); + std::vector quantization_params(batch_size() + XNN_EXTRA_QUANTIZATION_PARAMS); std::uniform_real_distribution range_dist(-100000, 100000); for (size_t iteration = 0; iteration < iterations(); iteration++) { const float first_val = range_dist(rng); diff --git a/test/convolution-2d.cc b/test/convolution-2d.cc index e996c7a7fc7..a029865d309 100644 --- a/test/convolution-2d.cc +++ b/test/convolution-2d.cc @@ -294,7 +294,7 @@ TEST_F(ConvolutionTestQD8F16QC8W, internally_allocated_dynamic_quantization_para std::vector operator_dq_data(batch_size * input_height * input_width * groups * group_input_channels + XNN_EXTRA_BYTES); std::fill(operator_output.begin(), operator_output.end(), UINT16_C(0xDEAD)); std::fill(subgraph_output.begin(), subgraph_output.end(), UINT16_C(0xDEAD)); - std::vector quantization_params(batch_size + 
XNN_EXTRA_QUANTIZATION_PARAMS); + std::vector quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); std::vector kernel_scale(group_output_channels * groups); std::generate(kernel_scale.begin(), kernel_scale.end(), [&]() { return scale_dist(rng); }); std::generate(filter.begin(), filter.end(), [&]() { return w8dist(rng); }); @@ -467,7 +467,7 @@ TEST_F(ConvolutionTestQD8F32QC8W, internally_allocated_dynamic_quantization_para std::vector operator_dq_data(batch_size * input_height * input_width * groups * group_input_channels + XNN_EXTRA_BYTES); std::fill(operator_output.begin(), operator_output.end(), nanf("")); std::fill(subgraph_output.begin(), subgraph_output.end(), nanf("")); - std::vector quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); + std::vector quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); std::vector kernel_scale(group_output_channels * groups); std::generate(kernel_scale.begin(), kernel_scale.end(), [&]() { return scale_dist(rng); }); std::generate(filter.begin(), filter.end(), [&]() { return w8dist(rng); }); diff --git a/test/convolution-operator-tester.h b/test/convolution-operator-tester.h index 1c34c44fb23..ad155b26759 100644 --- a/test/convolution-operator-tester.h +++ b/test/convolution-operator-tester.h @@ -989,7 +989,7 @@ class ConvolutionOperatorTester { xnn_status_success, xnn_setup_convolution2d_nhwc_qd8_f16_qc8w( convolution_op, workspace.data(), input.data(), output.data(), - reinterpret_cast( + reinterpret_cast( quantization_params.data()))); } else { ASSERT_NE(workspace_size, SIZE_MAX); @@ -998,7 +998,7 @@ class ConvolutionOperatorTester { xnn_status_success, xnn_setup_convolution2d_nhwc_qd8_f16_qc8w( convolution_op, workspace.data(), input.data(), output.data(), - reinterpret_cast( + reinterpret_cast( quantization_params.data()))); } ASSERT_EQ(xnn_status_success, @@ -1051,7 +1051,7 @@ class ConvolutionOperatorTester { convolution_op2, workspace.data(), input.data(), output2.data(), 
reinterpret_cast< - const struct xnn_dynamic_quantization_params*>( + const struct xnn_quantization_params*>( quantization_params.data()))); } else { ASSERT_NE(workspace_size, SIZE_MAX); @@ -1061,7 +1061,7 @@ class ConvolutionOperatorTester { convolution_op2, workspace.data(), input.data(), output2.data(), reinterpret_cast< - const struct xnn_dynamic_quantization_params*>( + const struct xnn_quantization_params*>( quantization_params.data()))); } ASSERT_EQ(xnn_status_success, @@ -1209,7 +1209,7 @@ class ConvolutionOperatorTester { xnn_status_success, xnn_setup_convolution2d_nhwc_qd8_f32_qc8w( convolution_op, workspace.data(), input.data(), output.data(), - reinterpret_cast( + reinterpret_cast( quantization_params.data()))); } else { ASSERT_NE(workspace_size, SIZE_MAX); @@ -1218,7 +1218,7 @@ class ConvolutionOperatorTester { xnn_status_success, xnn_setup_convolution2d_nhwc_qd8_f32_qc8w( convolution_op, workspace.data(), input.data(), output.data(), - reinterpret_cast( + reinterpret_cast( quantization_params.data()))); } ASSERT_EQ(xnn_status_success, @@ -1271,7 +1271,7 @@ class ConvolutionOperatorTester { convolution_op2, workspace.data(), input.data(), output2.data(), reinterpret_cast< - const struct xnn_dynamic_quantization_params*>( + const struct xnn_quantization_params*>( quantization_params.data()))); } else { ASSERT_NE(workspace_size, SIZE_MAX); @@ -1281,7 +1281,7 @@ class ConvolutionOperatorTester { convolution_op2, workspace.data(), input.data(), output2.data(), reinterpret_cast< - const struct xnn_dynamic_quantization_params*>( + const struct xnn_quantization_params*>( quantization_params.data()))); } ASSERT_EQ(xnn_status_success, diff --git a/test/deconvolution-2d.cc b/test/deconvolution-2d.cc index ff29519170d..1d7d53bc314 100644 --- a/test/deconvolution-2d.cc +++ b/test/deconvolution-2d.cc @@ -904,7 +904,7 @@ TEST_F(DeconvolutionTestQD8F32QC8W, internally_allocated_dynamic_quantization_pa std::generate(convert_input.begin(), convert_input.end(), [&]() { 
return f32dist(rng); }); std::vector operator_dq_data(batch_size * input_height * input_width * groups * group_input_channels + XNN_EXTRA_BYTES); - std::vector quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); + std::vector quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); std::vector kernel_scale(group_output_channels * groups); std::generate(kernel_scale.begin(), kernel_scale.end(), [&]() { return scale_dist(rng); }); diff --git a/test/deconvolution-operator-tester.h b/test/deconvolution-operator-tester.h index ec87b7568b7..21d108ed9a4 100644 --- a/test/deconvolution-operator-tester.h +++ b/test/deconvolution-operator-tester.h @@ -1302,7 +1302,7 @@ class DeconvolutionOperatorTester { xnn_status_success, xnn_setup_deconvolution2d_nhwc_qd8_f32_qc8w( deconvolution_op, input.data(), output.data(), - reinterpret_cast( + reinterpret_cast( quantization_params.data()))); ASSERT_EQ(xnn_status_success, @@ -1343,7 +1343,7 @@ class DeconvolutionOperatorTester { xnn_status_success, xnn_setup_deconvolution2d_nhwc_qd8_f32_qc8w( deconvolution_op2, input.data(), output2.data(), - reinterpret_cast( + reinterpret_cast( quantization_params.data()))); ASSERT_EQ(xnn_status_success, diff --git a/test/fully-connected-operator-tester.h b/test/fully-connected-operator-tester.h index 5fa9c7e570c..dc0ee4706d3 100644 --- a/test/fully-connected-operator-tester.h +++ b/test/fully-connected-operator-tester.h @@ -366,7 +366,7 @@ class FullyConnectedOperatorTester { xnn_setup_fully_connected_nc_qd8_f16_qc4w( fully_connected_op, input.data(), output.data(), - reinterpret_cast(quantization_params.data()))); + reinterpret_cast(quantization_params.data()))); ASSERT_EQ(xnn_status_success, xnn_run_operator(fully_connected_op, /*threadpool=*/nullptr)); @@ -406,7 +406,7 @@ class FullyConnectedOperatorTester { xnn_setup_fully_connected_nc_qd8_f16_qc4w( fully_connected_op2, input.data(), output2.data(), - reinterpret_cast(quantization_params.data()))); + 
reinterpret_cast(quantization_params.data()))); ASSERT_EQ( @@ -555,7 +555,7 @@ class FullyConnectedOperatorTester { xnn_setup_fully_connected_nc_qd8_f16_qb4w( fully_connected_op, input.data(), output.data(), - reinterpret_cast(quantization_params.data()))); + reinterpret_cast(quantization_params.data()))); ASSERT_EQ(xnn_status_success, xnn_run_operator(fully_connected_op, /*threadpool=*/nullptr)); @@ -596,7 +596,7 @@ class FullyConnectedOperatorTester { xnn_setup_fully_connected_nc_qd8_f16_qb4w( fully_connected_op2, input.data(), output2.data(), - reinterpret_cast(quantization_params.data()))); + reinterpret_cast(quantization_params.data()))); ASSERT_EQ( @@ -751,7 +751,7 @@ class FullyConnectedOperatorTester { xnn_setup_fully_connected_nc_qd8_f32_qc4w( fully_connected_op, input.data(), output.data(), - reinterpret_cast(quantization_params.data()))); + reinterpret_cast(quantization_params.data()))); ASSERT_EQ(xnn_status_success, xnn_run_operator(fully_connected_op, /*threadpool=*/nullptr)); @@ -791,7 +791,7 @@ class FullyConnectedOperatorTester { xnn_setup_fully_connected_nc_qd8_f32_qc4w( fully_connected_op2, input.data(), output2.data(), - reinterpret_cast(quantization_params.data()))); + reinterpret_cast(quantization_params.data()))); ASSERT_EQ( @@ -937,7 +937,7 @@ class FullyConnectedOperatorTester { xnn_setup_fully_connected_nc_qd8_f32_qb4w( fully_connected_op, input.data(), output.data(), - reinterpret_cast(quantization_params.data()))); + reinterpret_cast(quantization_params.data()))); ASSERT_EQ(xnn_status_success, xnn_run_operator(fully_connected_op, /*threadpool=*/nullptr)); @@ -978,7 +978,7 @@ class FullyConnectedOperatorTester { xnn_setup_fully_connected_nc_qd8_f32_qb4w( fully_connected_op2, input.data(), output2.data(), - reinterpret_cast(quantization_params.data()))); + reinterpret_cast(quantization_params.data()))); ASSERT_EQ( @@ -1346,7 +1346,7 @@ class FullyConnectedOperatorTester { xnn_setup_fully_connected_nc_qd8_f16_qc8w( fully_connected_op, 
input.data(), output.data(), - reinterpret_cast(quantization_params.data()))); + reinterpret_cast(quantization_params.data()))); ASSERT_EQ(xnn_status_success, xnn_run_operator(fully_connected_op, /*threadpool=*/nullptr)); @@ -1385,7 +1385,7 @@ class FullyConnectedOperatorTester { xnn_setup_fully_connected_nc_qd8_f16_qc8w( fully_connected_op2, input.data(), output2.data(), - reinterpret_cast(quantization_params.data()))); + reinterpret_cast(quantization_params.data()))); ASSERT_EQ( @@ -1522,7 +1522,7 @@ class FullyConnectedOperatorTester { xnn_setup_fully_connected_nc_qd8_f32_qc8w( fully_connected_op, input.data(), output.data(), - reinterpret_cast(quantization_params.data()))); + reinterpret_cast(quantization_params.data()))); ASSERT_EQ(xnn_status_success, xnn_run_operator(fully_connected_op, /*threadpool=*/nullptr)); @@ -1561,7 +1561,7 @@ class FullyConnectedOperatorTester { xnn_setup_fully_connected_nc_qd8_f32_qc8w( fully_connected_op2, input.data(), output2.data(), - reinterpret_cast(quantization_params.data()))); + reinterpret_cast(quantization_params.data()))); ASSERT_EQ( diff --git a/test/fully-connected.cc b/test/fully-connected.cc index e00fc6219b2..f4ac741cc0b 100644 --- a/test/fully-connected.cc +++ b/test/fully-connected.cc @@ -229,7 +229,7 @@ TEST_F(FullyConnectedTestQP8F32QC4W, matches_qd8_f32_qc4w) { kernel = std::vector(output_channels * rounded_input_channels); std::vector kernel_scale(output_channels); - std::vector quantization_params( + std::vector quantization_params( batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); std::generate(kernel_scale.begin(), kernel_scale.end(), [&]() { return scale_dist(rng); }); @@ -2893,13 +2893,13 @@ TEST_F(FullyConnectedTestQD8F16QC4W, internally_allocated_dynamic_quantization_p std::vector operator_output(batch_size * output_channels); std::fill(operator_output.begin(), operator_output.end(), UINT16_C(0xDEAD)); std::fill(subgraph_output.begin(), subgraph_output.end(), UINT16_C(0xDEAD)); - std::vector 
quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); + std::vector quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); std::vector kernel_scale(output_channels); std::generate(kernel_scale.begin(), kernel_scale.end(), [&]() { return scale_dist(rng); }); std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); }); std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); }); - std::generate(quantization_params.begin(), quantization_params.end(), [&]() { return xnn_dynamic_quantization_params{w8dist(rng), f32dist(rng)}; }); + std::generate(quantization_params.begin(), quantization_params.end(), [&]() { return xnn_quantization_params{w8dist(rng), f32dist(rng)}; }); std::generate(convert_input.begin(), convert_input.end(), [&]() { return f32dist(rng); }); const float output_min = -std::numeric_limits::infinity(); @@ -3072,14 +3072,14 @@ TEST_F(FullyConnectedTestQD8F16QB4W, internally_allocated_dynamic_quantization_p std::vector operator_output(batch_size * output_channels); std::fill(operator_output.begin(), operator_output.end(), nanf("")); std::fill(subgraph_output.begin(), subgraph_output.end(), nanf("")); - std::vector quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); + std::vector quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); std::vector kernel_scale(output_channels * block_size); std::generate(kernel_scale.begin(), kernel_scale.end(), [&]() { return scale_dist(rng); }); std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); }); std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); }); std::generate(convert_input.begin(), convert_input.end(), [&]() { return f32dist(rng); }); - std::generate(quantization_params.begin(), quantization_params.end(), [&]() { return xnn_dynamic_quantization_params{w8dist(rng), f32dist(rng)}; }); + std::generate(quantization_params.begin(), quantization_params.end(), [&]() { return xnn_quantization_params{w8dist(rng), 
f32dist(rng)}; }); // Adjust number of kernel elements for QC4W. input_channels should be padded to byte boundary, hence even. const size_t rounded_input_channels = round_up_po2(input_channels, 2); @@ -3238,13 +3238,13 @@ TEST_F(FullyConnectedTestQD8F16QC8W, internally_allocated_dynamic_quantization_p std::vector operator_output(batch_size * output_channels); std::fill(operator_output.begin(), operator_output.end(), UINT16_C(0xDEAD)); std::fill(subgraph_output.begin(), subgraph_output.end(), UINT16_C(0xDEAD)); - std::vector quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); + std::vector quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); std::vector kernel_scale(output_channels); std::generate(kernel_scale.begin(), kernel_scale.end(), [&]() { return scale_dist(rng); }); std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); }); std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); }); - std::generate(quantization_params.begin(), quantization_params.end(), [&]() { return xnn_dynamic_quantization_params{w8dist(rng), f32dist(rng)}; }); + std::generate(quantization_params.begin(), quantization_params.end(), [&]() { return xnn_quantization_params{w8dist(rng), f32dist(rng)}; }); std::generate(convert_input.begin(), convert_input.end(), [&]() { return f32dist(rng); }); const float output_min = -std::numeric_limits::infinity(); @@ -3401,13 +3401,13 @@ TEST_F(FullyConnectedTestQD8F32QC8W, internally_allocated_dynamic_quantization_p std::vector operator_output(batch_size * output_channels); std::fill(operator_output.begin(), operator_output.end(), nanf("")); std::fill(subgraph_output.begin(), subgraph_output.end(), nanf("")); - std::vector quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); + std::vector quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); std::vector kernel_scale(output_channels); std::generate(kernel_scale.begin(), kernel_scale.end(), [&]() { return scale_dist(rng); }); 
std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); }); std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); }); - std::generate(quantization_params.begin(), quantization_params.end(), [&]() { return xnn_dynamic_quantization_params{w8dist(rng), f32dist(rng)}; }); + std::generate(quantization_params.begin(), quantization_params.end(), [&]() { return xnn_quantization_params{w8dist(rng), f32dist(rng)}; }); std::generate(convert_input.begin(), convert_input.end(), [&]() { return f32dist(rng); }); const float output_min = -std::numeric_limits::infinity(); @@ -3569,7 +3569,7 @@ TEST_F(FullyConnectedTestQD8F32QC4W, internally_allocated_dynamic_quantization_p std::vector operator_output(batch_size * output_channels); std::fill(operator_output.begin(), operator_output.end(), nanf("")); std::fill(subgraph_output.begin(), subgraph_output.end(), nanf("")); - std::vector quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); + std::vector quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); // Adjust number of kernel elements for QC4W. input_channels should be padded to byte boundary, hence even. 
const size_t rounded_input_channels = round_up_po2(input_channels, 2); @@ -3580,7 +3580,7 @@ TEST_F(FullyConnectedTestQD8F32QC4W, internally_allocated_dynamic_quantization_p std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); }); std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); }); std::generate(convert_input.begin(), convert_input.end(), [&]() { return f32dist(rng); }); - std::generate(quantization_params.begin(), quantization_params.end(), [&]() { return xnn_dynamic_quantization_params{w8dist(rng), f32dist(rng)}; }); + std::generate(quantization_params.begin(), quantization_params.end(), [&]() { return xnn_quantization_params{w8dist(rng), f32dist(rng)}; }); const float output_min = -std::numeric_limits::infinity(); const float output_max = std::numeric_limits::infinity(); @@ -3683,7 +3683,7 @@ TEST_F(FullyConnectedTestQD8F32QC4W, internally_allocated_dynamic_quantization_p std::vector operator_output(batch_size * output_channels); std::fill(operator_output.begin(), operator_output.end(), nanf("")); std::fill(subgraph_output.begin(), subgraph_output.end(), nanf("")); - std::vector quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); + std::vector quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); // Adjust number of kernel elements for QC4W. input_channels should be padded to byte boundary, hence even. 
const size_t rounded_input_channels = round_up_po2(input_channels, 2); @@ -3693,7 +3693,7 @@ TEST_F(FullyConnectedTestQD8F32QC4W, internally_allocated_dynamic_quantization_p std::generate(kernel_scale.begin(), kernel_scale.end(), [&]() { return scale_dist(rng); }); std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); }); std::generate(convert_input.begin(), convert_input.end(), [&]() { return f32dist(rng); }); - std::generate(quantization_params.begin(), quantization_params.end(), [&]() { return xnn_dynamic_quantization_params{w8dist(rng), f32dist(rng)}; }); + std::generate(quantization_params.begin(), quantization_params.end(), [&]() { return xnn_quantization_params{w8dist(rng), f32dist(rng)}; }); const float output_min = -std::numeric_limits::infinity(); const float output_max = std::numeric_limits::infinity(); @@ -3840,7 +3840,7 @@ TEST_F(FullyConnectedTestQD8F32QC4W, internally_allocated_dynamic_quantization_p std::vector operator_output(batch_size * output_channels); std::fill(operator_output.begin(), operator_output.end(), nanf("")); std::fill(subgraph_output.begin(), subgraph_output.end(), nanf("")); - std::vector quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); + std::vector quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); // Adjust number of kernel elements for QC4W. input_channels should be padded to byte boundary, hence even. 
const size_t rounded_output_channels = round_up_po2(output_channels, 2); @@ -3851,7 +3851,7 @@ TEST_F(FullyConnectedTestQD8F32QC4W, internally_allocated_dynamic_quantization_p std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); }); std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); }); std::generate(convert_input.begin(), convert_input.end(), [&]() { return f32dist(rng); }); - std::generate(quantization_params.begin(), quantization_params.end(), [&]() { return xnn_dynamic_quantization_params{w8dist(rng), f32dist(rng)}; }); + std::generate(quantization_params.begin(), quantization_params.end(), [&]() { return xnn_quantization_params{w8dist(rng), f32dist(rng)}; }); const float output_min = -std::numeric_limits::infinity(); const float output_max = std::numeric_limits::infinity(); @@ -4024,14 +4024,14 @@ TEST_F(FullyConnectedTestQD8F32QB4W, internally_allocated_dynamic_quantization_p std::vector operator_output(batch_size * output_channels); std::fill(operator_output.begin(), operator_output.end(), nanf("")); std::fill(subgraph_output.begin(), subgraph_output.end(), nanf("")); - std::vector quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); + std::vector quantization_params(batch_size + XNN_EXTRA_QUANTIZATION_PARAMS); std::vector kernel_scale(output_channels * block_size); std::generate(kernel_scale.begin(), kernel_scale.end(), [&]() { return scale_dist(rng); }); std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); }); std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); }); std::generate(convert_input.begin(), convert_input.end(), [&]() { return f32dist(rng); }); - std::generate(quantization_params.begin(), quantization_params.end(), [&]() { return xnn_dynamic_quantization_params{w8dist(rng), f32dist(rng)}; }); + std::generate(quantization_params.begin(), quantization_params.end(), [&]() { return xnn_quantization_params{w8dist(rng), f32dist(rng)}; }); // Adjust number of kernel 
elements for QC4W. input_channels should be padded to byte boundary, hence even. const size_t rounded_input_channels = round_up_po2(input_channels, 2); diff --git a/test/workspace.cc b/test/workspace.cc index 62d2318d028..aa05c3a783c 100644 --- a/test/workspace.cc +++ b/test/workspace.cc @@ -818,7 +818,7 @@ TEST(WORKSPACE, internally_allocated_dynamic_quantization_parameters) std::vector input(batch_size * input_channels + XNN_EXTRA_BYTES / sizeof(float)); std::vector subgraph_output(batch_size * output_channels); std::fill(subgraph_output.begin(), subgraph_output.end(), nanf("")); - std::vector quantization_params(3 + XNN_EXTRA_QUANTIZATION_PARAMS); + std::vector quantization_params(3 + XNN_EXTRA_QUANTIZATION_PARAMS); std::vector kernel_scale(output_channels); std::vector bias(output_channels); std::vector kernel(input_channels * output_channels); From bec94846c755915615991be7f0c1c0a6d7db96e2 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Tue, 24 Sep 2024 02:16:30 -0700 Subject: [PATCH 36/50] Add binary operator API parameterized by an enum This change adds a version of the binary subgraph and operator API that allows constructing binary ops of any operator kind or datatype with a uniform interface. 
PiperOrigin-RevId: 678147227 --- BUILD.bazel | 14 +- CMakeLists.txt | 35 +- bench/vbinary.cc | 10 +- include/xnnpack.h | 606 +--- src/configs/binary-elementwise-config.c | 552 ++-- src/enums/operator-type.c | 40 +- src/enums/operator-type.yaml | 60 +- src/microparams-init.c | 233 +- src/operator-utils.c | 25 + src/operators/binary-elementwise-nd.c | 2908 +++----------------- src/operators/softmax-nc.c | 21 +- src/subgraph.c | 48 + src/subgraph/add2.c | 378 --- src/subgraph/{copysign.c => binary.c} | 158 +- src/subgraph/deprecated.c | 78 + src/subgraph/divide.c | 296 -- src/subgraph/maximum2.c | 281 -- src/subgraph/minimum2.c | 281 -- src/subgraph/multiply2.c | 446 --- src/subgraph/squared-difference.c | 280 -- src/subgraph/subtract.c | 379 --- src/xnnpack/config-types.h | 11 +- src/xnnpack/microfnptr.h | 45 +- src/xnnpack/microparams-init.h | 74 +- src/xnnpack/microparams.h | 8 + src/xnnpack/operator-type.h | 30 +- src/xnnpack/operator.h | 15 +- src/xnnpack/subgraph.h | 3 + test/BUILD.bazel | 48 +- test/add2-reshape.cc | 325 --- test/add2.cc | 386 --- test/binary-elementwise-nd.cc | 704 +++++ test/binary-elementwise-operator-tester.cc | 1672 ----------- test/binary-elementwise-operator-tester.h | 366 --- test/binary-nd.cc | 472 ---- test/binary.cc | 928 +++++++ test/copysign.cc | 138 - test/divide2.cc | 167 -- test/maximum2.cc | 244 -- test/minimum2.cc | 241 -- test/multiply2.cc | 416 --- test/operator-size.c | 49 +- test/squared-difference.cc | 259 -- test/subgraph-binary-tester.h | 161 -- test/subgraph-size.c | 5 +- test/subtract2.cc | 339 --- test/vbinary-microkernel-tester.cc | 123 +- 47 files changed, 2941 insertions(+), 11417 deletions(-) delete mode 100644 src/subgraph/add2.c rename src/subgraph/{copysign.c => binary.c} (63%) create mode 100644 src/subgraph/deprecated.c delete mode 100644 src/subgraph/divide.c delete mode 100644 src/subgraph/maximum2.c delete mode 100644 src/subgraph/minimum2.c delete mode 100644 src/subgraph/multiply2.c delete mode 100644 
src/subgraph/squared-difference.c delete mode 100644 src/subgraph/subtract.c delete mode 100644 test/add2-reshape.cc delete mode 100644 test/add2.cc create mode 100644 test/binary-elementwise-nd.cc delete mode 100644 test/binary-elementwise-operator-tester.cc delete mode 100644 test/binary-elementwise-operator-tester.h delete mode 100644 test/binary-nd.cc create mode 100644 test/binary.cc delete mode 100644 test/copysign.cc delete mode 100644 test/divide2.cc delete mode 100644 test/maximum2.cc delete mode 100644 test/minimum2.cc delete mode 100644 test/multiply2.cc delete mode 100644 test/squared-difference.cc delete mode 100644 test/subgraph-binary-tester.h delete mode 100644 test/subtract2.cc diff --git a/BUILD.bazel b/BUILD.bazel index 0ee742076cf..46a01dff727 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -86,22 +86,21 @@ SUBGRAPH_SRCS = [ "src/runtime.c", "src/subgraph.c", "src/subgraph/abs.c", - "src/subgraph/add2.c", "src/subgraph/argmax-pooling-2d.c", "src/subgraph/average-pooling-2d.c", "src/subgraph/bankers-rounding.c", "src/subgraph/batch-matrix-multiply.c", + "src/subgraph/binary.c", "src/subgraph/ceiling.c", "src/subgraph/clamp.c", "src/subgraph/concatenate.c", "src/subgraph/convert.c", "src/subgraph/convolution-2d.c", "src/subgraph/copy.c", - "src/subgraph/copysign.c", "src/subgraph/deconvolution-2d.c", + "src/subgraph/deprecated.c", "src/subgraph/depth-to-space-2d.c", "src/subgraph/depthwise-convolution-2d.c", - "src/subgraph/divide.c", "src/subgraph/elu.c", "src/subgraph/even-split.c", "src/subgraph/exp.c", @@ -115,9 +114,6 @@ SUBGRAPH_SRCS = [ "src/subgraph/leaky-relu.c", "src/subgraph/log.c", "src/subgraph/max-pooling-2d.c", - "src/subgraph/maximum2.c", - "src/subgraph/minimum2.c", - "src/subgraph/multiply2.c", "src/subgraph/negate.c", "src/subgraph/prelu.c", "src/subgraph/reciprocal-square-root.c", @@ -129,13 +125,11 @@ SUBGRAPH_SRCS = [ "src/subgraph/space-to-depth-2d.c", "src/subgraph/square-root.c", "src/subgraph/square.c", - 
"src/subgraph/squared-difference.c", "src/subgraph/static-constant-pad.c", "src/subgraph/static-mean.c", "src/subgraph/static-resize-bilinear-2d.c", "src/subgraph/static-slice.c", "src/subgraph/static-transpose.c", - "src/subgraph/subtract.c", "src/subgraph/tanh.c", "src/subgraph/unpooling-2d.c", "src/subgraph/validation.c", @@ -562,6 +556,7 @@ xnnpack_cc_library( ":microparams_init", ":packing", ":prod_microkernels", + ":xnnpack_h", "@FP16", ] + select({ ":cpuinfo_enabled": ["@cpuinfo"], @@ -620,7 +615,6 @@ xnnpack_cc_library( ":microparams", ":unaligned", ":xnnpack_h", - "@FP16", ], ) @@ -1045,9 +1039,9 @@ xnnpack_cc_library( ":logging", ":math", ":operator_h", + ":operator_type", ":params", ":xnnpack_h", - "@FP16", ], ) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c56cbe3019..0fbe62f50ae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -401,22 +401,21 @@ SET(SUBGRAPH_SRCS src/runtime.c src/subgraph.c src/subgraph/abs.c - src/subgraph/add2.c src/subgraph/argmax-pooling-2d.c src/subgraph/average-pooling-2d.c src/subgraph/bankers-rounding.c src/subgraph/batch-matrix-multiply.c + src/subgraph/binary.c src/subgraph/ceiling.c src/subgraph/clamp.c src/subgraph/concatenate.c src/subgraph/convert.c src/subgraph/convolution-2d.c src/subgraph/copy.c - src/subgraph/copysign.c src/subgraph/deconvolution-2d.c + src/subgraph/deprecated.c src/subgraph/depth-to-space-2d.c src/subgraph/depthwise-convolution-2d.c - src/subgraph/divide.c src/subgraph/elu.c src/subgraph/even-split.c src/subgraph/exp.c @@ -430,9 +429,6 @@ SET(SUBGRAPH_SRCS src/subgraph/leaky-relu.c src/subgraph/log.c src/subgraph/max-pooling-2d.c - src/subgraph/maximum2.c - src/subgraph/minimum2.c - src/subgraph/multiply2.c src/subgraph/negate.c src/subgraph/prelu.c src/subgraph/reciprocal-square-root.c @@ -443,13 +439,11 @@ SET(SUBGRAPH_SRCS src/subgraph/space-to-depth-2d.c src/subgraph/square-root.c src/subgraph/square.c - src/subgraph/squared-difference.c src/subgraph/static-constant-pad.c 
src/subgraph/static-mean.c src/subgraph/static-resize-bilinear-2d.c src/subgraph/static-slice.c src/subgraph/static-transpose.c - src/subgraph/subtract.c src/subgraph/tanh.c src/subgraph/unpooling-2d.c src/subgraph/validation.c @@ -981,7 +975,7 @@ ENDIF() TARGET_INCLUDE_DIRECTORIES(microkernels-prod PRIVATE . include src) TARGET_INCLUDE_DIRECTORIES(hardware-config PRIVATE include src ${CPUINFO_SOURCE_DIR}/include) TARGET_INCLUDE_DIRECTORIES(indirection PRIVATE include src) -TARGET_INCLUDE_DIRECTORIES(microparams-init PRIVATE src) +TARGET_INCLUDE_DIRECTORIES(microparams-init PRIVATE include src) TARGET_INCLUDE_DIRECTORIES(normalization PRIVATE include src) TARGET_INCLUDE_DIRECTORIES(packing PRIVATE include src) TARGET_INCLUDE_DIRECTORIES(logging PRIVATE include src) @@ -1204,10 +1198,6 @@ IF(XNNPACK_BUILD_TESTS) TARGET_INCLUDE_DIRECTORIES(unary-operator-tester PRIVATE . include src test) TARGET_LINK_LIBRARIES(unary-operator-tester PRIVATE XNNPACK fp16 pthreadpool GTest::gtest) - ADD_LIBRARY(binary-elementwise-operator-tester STATIC test/binary-elementwise-operator-tester.cc) - TARGET_INCLUDE_DIRECTORIES(binary-elementwise-operator-tester PRIVATE . include src test) - TARGET_LINK_LIBRARIES(binary-elementwise-operator-tester PRIVATE XNNPACK fp16 pthreadpool GTest::gtest) - ADD_LIBRARY(dwconv-microkernel-tester STATIC test/dwconv-microkernel-tester.cc) TARGET_INCLUDE_DIRECTORIES(dwconv-microkernel-tester PRIVATE . 
include src test) TARGET_LINK_LIBRARIES(dwconv-microkernel-tester PRIVATE XNNPACK fp16 pthreadpool GTest::gtest) @@ -1325,15 +1315,14 @@ IF(XNNPACK_BUILD_TESTS) ADD_TEST(NAME ${TEST}-test COMMAND ${TEST}-test) ENDFOREACH() - ADD_EXECUTABLE(binary-nd-test test/binary-nd.cc) - TARGET_INCLUDE_DIRECTORIES(binary-nd-test PRIVATE src test) - TARGET_LINK_LIBRARIES(binary-nd-test PRIVATE - binary-elementwise-operator-tester + ADD_EXECUTABLE(binary-elementwise-nd-test test/binary-elementwise-nd.cc) + TARGET_INCLUDE_DIRECTORIES(binary-elementwise-nd-test PRIVATE src test) + TARGET_LINK_LIBRARIES(binary-elementwise-nd-test PRIVATE fp16 GTest::gtest GTest::gtest_main XNNPACK) - ADD_SHARDED_TEST(binary-nd-test 10) + ADD_SHARDED_TEST(binary-elementwise-nd-test 10) # ---[ Build subgraph optimizations unit tests SET(LIBRARY_SUBGRAPH_OPTIMIZATION_TESTS @@ -1360,12 +1349,11 @@ IF(XNNPACK_BUILD_TESTS) SET(LIBRARY_SUBGRAPH_UNIT_TESTS abs abs-reshape - add2 - add2-reshape argmax-pooling-2d average-pooling-2d average-pooling-2d-reshape bankers-rounding + binary ceiling clamp concatenate2 @@ -1374,9 +1362,7 @@ IF(XNNPACK_BUILD_TESTS) concatenate5 convert copy - copysign depth-to-space-2d - divide2 elu gelu exp @@ -1392,9 +1378,6 @@ IF(XNNPACK_BUILD_TESTS) leaky-relu log max-pooling-2d - maximum2 - minimum2 - multiply2 negate prelu reciprocal-square-root @@ -1405,13 +1388,11 @@ IF(XNNPACK_BUILD_TESTS) space-to-depth-2d square square-root - squared-difference static-constant-pad static-mean static-reshape static-resize-bilinear-2d static-transpose - subtract2 tanh transpose-reshape unpooling-2d) diff --git a/bench/vbinary.cc b/bench/vbinary.cc index 9e259777398..3e7f6332502 100644 --- a/bench/vbinary.cc +++ b/bench/vbinary.cc @@ -80,28 +80,30 @@ struct ParamsWrapper { Params params; }; +xnn_quantization_params quantization = {0, 1.0f}; + template <> struct ParamsWrapper { xnn_qs8_add_minmax_params params = make_params( - xnn_init_qs8_add_minmax_scalar_params, 0, 0, 0, 1.0f, 1.0f, -128, 
127); + xnn_init_qs8_add_minmax_scalar_params, &quantization, &quantization, &quantization); }; template <> struct ParamsWrapper { xnn_qu8_add_minmax_params params = make_params( - xnn_init_qu8_add_minmax_scalar_params, 0, 0, 0, 1.0f, 1.0f, 0, 255); + xnn_init_qu8_add_minmax_scalar_params, &quantization, &quantization, &quantization); }; template <> struct ParamsWrapper { xnn_qs8_mul_minmax_params params = make_params( - xnn_init_qs8_mul_minmax_scalar_params, 0, 0, 0, 1.0f, -128, 127); + xnn_init_qs8_mul_minmax_scalar_params, &quantization, &quantization, &quantization); }; template <> struct ParamsWrapper { xnn_qu8_mul_minmax_params params = make_params( - xnn_init_qu8_mul_minmax_scalar_params, 0, 0, 0, 1.0f, 0, 255); + xnn_init_qu8_mul_minmax_scalar_params, &quantization, &quantization, &quantization); }; // Microkernel function, templated on the `params` type. diff --git a/include/xnnpack.h b/include/xnnpack.h index f8062ac894b..7af01be1224 100644 --- a/include/xnnpack.h +++ b/include/xnnpack.h @@ -103,6 +103,12 @@ extern "C" { /// The minimum blocksize for blockwise quantized operators. #define XNN_MIN_BLOCKSIZE 32 +#ifdef __GNUC__ +#define XNN_DEPRECATED __attribute__((deprecated)) +#else +#define XNN_DEPRECATED +#endif + struct xnn_quantization_params { int32_t zero_point; float scale; @@ -981,6 +987,51 @@ enum xnn_status xnn_define_unpooling_2d( uint32_t output_id, uint32_t flags); +enum xnn_binary_operator { + xnn_binary_invalid = -1, + xnn_binary_add, + xnn_binary_subtract, + xnn_binary_multiply, + xnn_binary_divide, + xnn_binary_maximum, + xnn_binary_minimum, + xnn_binary_copysign, + xnn_binary_squared_difference, +}; + +struct xnn_binary_params { + /// lower bound for clipping output values. + double output_min; + /// upper bound for clipping output values. + double output_max; +}; + +/// Define a 2-Input binary operator Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. 
+/// @param type - Type of operator to apply to the two inputs. +/// @param params - Optional parameters for the operator. +/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the second +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the first +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension +/// of the two inputs. +/// @param flags - binary features of the Node. No supported flags are currently defined. +enum xnn_status xnn_define_binary( + xnn_subgraph_t subgraph, + enum xnn_binary_operator type, + const struct xnn_binary_params* params, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags); + /// Define a 2-Input Add Node and add it to a Subgraph. /// /// The 2-Input Add Node computes elementwise addition of two tensor inputs with numpy broadcasting rules. @@ -1000,7 +1051,7 @@ enum xnn_status xnn_define_unpooling_2d( /// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension /// of the two inputs. /// @param flags - binary features of the Add Node. No supported flags are currently defined. 
-enum xnn_status xnn_define_add2( +XNN_DEPRECATED enum xnn_status xnn_define_add2( xnn_subgraph_t subgraph, float output_min, float output_max, @@ -1026,7 +1077,7 @@ enum xnn_status xnn_define_add2( /// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension /// of the two inputs. /// @param flags - binary features of the Multiply Node. No supported flags are currently defined. -enum xnn_status xnn_define_multiply2( +XNN_DEPRECATED enum xnn_status xnn_define_multiply2( xnn_subgraph_t subgraph, float output_min, float output_max, @@ -1115,7 +1166,7 @@ enum xnn_status xnn_define_scaled_dot_product_attention( /// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension /// of the two inputs. /// @param flags - binary features of the Subtract Node. No supported flags are currently defined. -enum xnn_status xnn_define_subtract( +XNN_DEPRECATED enum xnn_status xnn_define_subtract( xnn_subgraph_t subgraph, float output_min, float output_max, @@ -1143,7 +1194,7 @@ enum xnn_status xnn_define_subtract( /// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension /// of the two inputs. /// @param flags - binary features of the Divide Node. No supported flags are currently defined. -enum xnn_status xnn_define_divide( +XNN_DEPRECATED enum xnn_status xnn_define_divide( xnn_subgraph_t subgraph, float output_min, float output_max, @@ -1169,7 +1220,7 @@ enum xnn_status xnn_define_divide( /// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension /// of the two inputs. /// @param flags - binary features of the Maximum Node. No supported flags are currently defined. 
-enum xnn_status xnn_define_maximum2( +XNN_DEPRECATED enum xnn_status xnn_define_maximum2( xnn_subgraph_t subgraph, uint32_t input1_id, uint32_t input2_id, @@ -1193,7 +1244,7 @@ enum xnn_status xnn_define_maximum2( /// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension /// of the two inputs. /// @param flags - binary features of the Minimum Node. No supported flags are currently defined. -enum xnn_status xnn_define_minimum2( +XNN_DEPRECATED enum xnn_status xnn_define_minimum2( xnn_subgraph_t subgraph, uint32_t input1_id, uint32_t input2_id, @@ -1218,7 +1269,7 @@ enum xnn_status xnn_define_minimum2( /// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension /// of the two inputs. /// @param flags - binary features of the Squared Difference Node. No supported flags are currently defined. -enum xnn_status xnn_define_squared_difference( +XNN_DEPRECATED enum xnn_status xnn_define_squared_difference( xnn_subgraph_t subgraph, uint32_t input1_id, uint32_t input2_id, @@ -1415,7 +1466,7 @@ enum xnn_status xnn_define_concatenate5( /// @param input2_id - Value ID for the second input tensor. The input tensor must be defined in the @a subgraph. /// @param output_id - Value ID for the output tensor. /// @param flags - binary features of the Copy Sign Node. No supported flags are currently defined. 
-enum xnn_status xnn_define_copysign( +XNN_DEPRECATED enum xnn_status xnn_define_copysign( xnn_subgraph_t subgraph, uint32_t input1_id, uint32_t input2_id, @@ -2202,6 +2253,23 @@ enum xnn_status xnn_run_operator( enum xnn_status xnn_delete_operator( xnn_operator_t op); +struct xnn_binary_operator_params { + union { + struct { + float param; + float param2; + } elu; + struct { + float param; + } leaky_relu; + }; + int32_t a_zero_point; + int32_t b_zero_point; + float a_scale; + float b_scale; + float output_scale; + int32_t output_zero_point; +}; /// Operator API: /// - create operator will create and populate a xnn_operator_t @@ -2211,6 +2279,45 @@ enum xnn_status xnn_delete_operator( /// Operators listed below are in alphabetical order by operator name; within each operator, we sort alphabetically by /// data layout and type. We also group create, reshape, setup (and optionally run) functions of each operator together. +enum xnn_status xnn_create_binary_elementwise_nd( + enum xnn_binary_operator type, + enum xnn_datatype datatype, + const struct xnn_quantization_params* input1_quantization, + const struct xnn_quantization_params* input2_quantization, + const struct xnn_quantization_params* output_quantization, + uint32_t flags, + xnn_operator_t* binary_op_out); + +enum xnn_status xnn_reshape_binary_elementwise_nd( + xnn_operator_t binary_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_binary_elementwise_nd( + xnn_operator_t binary_op, + const void* input1, + const void* input2, + void* output); + +enum xnn_status xnn_run_binary_elementwise_nd( + enum xnn_binary_operator type, + enum xnn_datatype datatype, + const struct xnn_quantization_params* input1_quantization, + const struct xnn_quantization_params* input2_quantization, + const struct xnn_quantization_params* output_quantization, + uint32_t flags, + size_t num_input1_dims, + const 
size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + const void* input1, + const void* input2, + void* output, + pthreadpool_t threadpool); + enum xnn_status xnn_create_abs_nc_f16( uint32_t flags, xnn_operator_t* abs_op_out); @@ -2255,169 +2362,6 @@ enum xnn_status xnn_run_abs_nc_f32( uint32_t flags, pthreadpool_t threadpool); -enum xnn_status xnn_create_add_nd_f16( - float output_min, - float output_max, - uint32_t flags, - xnn_operator_t* add_op_out); - -enum xnn_status xnn_reshape_add_nd_f16( - xnn_operator_t add_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_add_nd_f16( - xnn_operator_t add_op, - const void* input1, - const void* input2, - void* output); - -enum xnn_status xnn_create_add_nd_f32( - float output_min, - float output_max, - uint32_t flags, - xnn_operator_t* add_op_out); - -enum xnn_status xnn_reshape_add_nd_f32( - xnn_operator_t add_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_add_nd_f32( - xnn_operator_t add_op, - const float* input1, - const float* input2, - float* output); - -enum xnn_status xnn_run_add_nd_f32( - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - const float* input1, - const float* input2, - float* output, - float output_min, - float output_max, - uint32_t flags, - pthreadpool_t threadpool); - - -enum xnn_status xnn_create_multiply_nd_s32( - uint32_t flags, - xnn_operator_t* multiply_op_out); - -enum xnn_status xnn_reshape_multiply_nd_s32( - xnn_operator_t mul_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_multiply_nd_s32( - xnn_operator_t mul_op, - const int32_t* 
input1, - const int32_t* input2, - int32_t* output); - - -enum xnn_status xnn_create_add_nd_qs8( - int8_t input1_zero_point, - float input1_scale, - int8_t input2_zero_point, - float input2_scale, - int8_t output_zero_point, - float output_scale, - int8_t output_min, - int8_t output_max, - uint32_t flags, - xnn_operator_t* add_op_out); - -enum xnn_status xnn_reshape_add_nd_qs8( - xnn_operator_t add_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_add_nd_qs8( - xnn_operator_t add_op, - const int8_t* input1, - const int8_t* input2, - int8_t* output); - -enum xnn_status xnn_run_add_nd_qs8( - size_t num_input1_dims, - const size_t* input1_shape, - int8_t input1_zero_point, - float input1_scale, - size_t num_input2_dims, - const size_t* input2_shape, - int8_t input2_zero_point, - float input2_scale, - const int8_t* input1, - const int8_t* input2, - int8_t* output, - int8_t output_zero_point, - float output_scale, - int8_t output_min, - int8_t output_max, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_add_nd_qu8( - uint8_t input1_zero_point, - float input1_scale, - uint8_t input2_zero_point, - float input2_scale, - uint8_t output_zero_point, - float output_scale, - uint8_t output_min, - uint8_t output_max, - uint32_t flags, - xnn_operator_t* add_op_out); - -enum xnn_status xnn_reshape_add_nd_qu8( - xnn_operator_t add_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_add_nd_qu8( - xnn_operator_t add_op, - const uint8_t* input1, - const uint8_t* input2, - uint8_t* output); - -enum xnn_status xnn_run_add_nd_qu8( - size_t num_input1_dims, - const size_t* input1_shape, - uint8_t input1_zero_point, - float input1_scale, - size_t num_input2_dims, - const size_t* input2_shape, - uint8_t input2_zero_point, 
- float input2_scale, - const uint8_t* input1, - const uint8_t* input2, - uint8_t* output, - uint8_t output_zero_point, - float output_scale, - uint8_t output_min, - uint8_t output_max, - uint32_t flags, - pthreadpool_t threadpool); - enum xnn_status xnn_create_argmax_pooling2d_nhwc_f32( uint32_t input_padding_top, uint32_t input_padding_right, @@ -4023,77 +3967,6 @@ enum xnn_status xnn_setup_depth_to_space_nhwc_x32( const void* input, void* output); -enum xnn_status xnn_create_divide_nd_f16( - float output_min, - float output_max, - uint32_t flags, - xnn_operator_t* divide_op_out); - -enum xnn_status xnn_reshape_divide_nd_f16( - xnn_operator_t divide_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_divide_nd_f16( - xnn_operator_t divide_op, - const void* input1, - const void* input2, - void* output); - -enum xnn_status xnn_create_copysign_nd_f32( - uint32_t flags, - xnn_operator_t* copysign_op_out); - -enum xnn_status xnn_reshape_copysign_nd_f32( - xnn_operator_t copysign_op, - size_t num_mag_dims, - const size_t* mag_shape, - size_t num_sign_dims, - const size_t* sign_shape, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_copysign_nd_f32( - xnn_operator_t copysign_op, - const float* mag, - const float* sign, - float* output); - -enum xnn_status xnn_create_divide_nd_f32( - float output_min, - float output_max, - uint32_t flags, - xnn_operator_t* divide_op_out); - -enum xnn_status xnn_reshape_divide_nd_f32( - xnn_operator_t divide_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_divide_nd_f32( - xnn_operator_t divide_op, - const float* input1, - const float* input2, - float* output); - -enum xnn_status xnn_run_divide_nd_f32( - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - 
const size_t* input2_shape, - const float* input1, - const float* input2, - float* output, - float output_min, - float output_max, - uint32_t flags, - pthreadpool_t threadpool); - enum xnn_status xnn_create_dynamic_fully_connected_nc_f16( float output_min, float output_max, @@ -5142,53 +5015,6 @@ enum xnn_status xnn_setup_max_pooling2d_nhwc_u8( const uint8_t* input, uint8_t* output); -enum xnn_status xnn_create_maximum_nd_f16( - uint32_t flags, - xnn_operator_t* maximum_op_out); - -enum xnn_status xnn_reshape_maximum_nd_f16( - xnn_operator_t maximum_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_maximum_nd_f16( - xnn_operator_t maximum_op, - const void* input1, - const void* input2, - void* output); - -enum xnn_status xnn_create_maximum_nd_f32( - uint32_t flags, - xnn_operator_t* maximum_op_out); - -enum xnn_status xnn_reshape_maximum_nd_f32( - xnn_operator_t maximum_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_maximum_nd_f32( - xnn_operator_t maximum_op, - const float* input1, - const float* input2, - float* output); - -enum xnn_status xnn_run_maximum_nd_f32( - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - const float* input1, - const float* input2, - float* output, - uint32_t flags, - pthreadpool_t threadpool); - enum xnn_status xnn_create_mean_nd_f16( uint32_t flags, xnn_operator_t* mean_op_out); @@ -5260,18 +5086,6 @@ enum xnn_status xnn_setup_mean_nd_f32( const float* input, float* output); -enum xnn_status xnn_create_minimum_nd_f16( - uint32_t flags, - xnn_operator_t* minimum_op_out); - -enum xnn_status xnn_reshape_minimum_nd_f16( - xnn_operator_t minimum_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - 
const size_t* input2_shape, - pthreadpool_t threadpool); - enum xnn_status xnn_setup_mean_nd_qs8( xnn_operator_t mean_op, void* workspace, @@ -6148,196 +5962,6 @@ enum xnn_status xnn_run_reciprocal_square_root_nc_f32( size_t batch_size, const float* input, float* output, uint32_t flags, pthreadpool_t threadpool); -enum xnn_status xnn_create_squared_difference_nd_f16( - uint32_t flags, - xnn_operator_t* squared_difference_op_out); - -enum xnn_status xnn_reshape_squared_difference_nd_f16( - xnn_operator_t squared_difference_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_squared_difference_nd_f16( - xnn_operator_t squared_difference_op, - const void* input1, - const void* input2, - void* output); - -enum xnn_status xnn_create_squared_difference_nd_f32( - uint32_t flags, - xnn_operator_t* squared_difference_op_out); - -enum xnn_status xnn_reshape_squared_difference_nd_f32( - xnn_operator_t squared_difference_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_squared_difference_nd_f32( - xnn_operator_t squared_difference_op, - const float* input1, - const float* input2, - float* output); - -enum xnn_status xnn_run_squared_difference_nd_f32( - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - const float* input1, - const float* input2, - float* output, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_subtract_nd_f16( - float output_min, - float output_max, - uint32_t flags, - xnn_operator_t* subtract_op_out); - -enum xnn_status xnn_reshape_subtract_nd_f16( - xnn_operator_t subtract_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool); - -enum 
xnn_status xnn_setup_subtract_nd_f16( - xnn_operator_t subtract_op, - const void* input1, - const void* input2, - void* output); - -enum xnn_status xnn_create_subtract_nd_f32( - float output_min, - float output_max, - uint32_t flags, - xnn_operator_t* subtract_op_out); - -enum xnn_status xnn_reshape_subtract_nd_f32( - xnn_operator_t subtract_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_subtract_nd_f32( - xnn_operator_t subtract_op, - const float* input1, - const float* input2, - float* output); - -enum xnn_status xnn_run_subtract_nd_f32( - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - const float* input1, - const float* input2, - float* output, - float output_min, - float output_max, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_subtract_nd_qs8( - int8_t input1_zero_point, - float input1_scale, - int8_t input2_zero_point, - float input2_scale, - int8_t output_zero_point, - float output_scale, - int8_t output_min, - int8_t output_max, - uint32_t flags, - xnn_operator_t* subtract_op_out); - -enum xnn_status xnn_reshape_subtract_nd_qs8( - xnn_operator_t subtract_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_subtract_nd_qs8( - xnn_operator_t subtract_op, - const int8_t* input1, - const int8_t* input2, - int8_t* output); - -enum xnn_status xnn_run_subtract_nd_qs8( - size_t num_input1_dims, - const size_t* input1_shape, - int8_t input1_zero_point, - float input1_scale, - size_t num_input2_dims, - const size_t* input2_shape, - int8_t input2_zero_point, - float input2_scale, - const int8_t* input1, - const int8_t* input2, - int8_t* output, - int8_t output_zero_point, - float output_scale, - int8_t output_min, - int8_t 
output_max, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_subtract_nd_qu8( - uint8_t input1_zero_point, - float input1_scale, - uint8_t input2_zero_point, - float input2_scale, - uint8_t output_zero_point, - float output_scale, - uint8_t output_min, - uint8_t output_max, - uint32_t flags, - xnn_operator_t* subtract_op_out); - -enum xnn_status xnn_reshape_subtract_nd_qu8( - xnn_operator_t subtract_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_subtract_nd_qu8( - xnn_operator_t subtract_op, - const uint8_t* input1, - const uint8_t* input2, - uint8_t* output); - -enum xnn_status xnn_run_subtract_nd_qu8( - size_t num_input1_dims, - const size_t* input1_shape, - uint8_t input1_zero_point, - float input1_scale, - size_t num_input2_dims, - const size_t* input2_shape, - uint8_t input2_zero_point, - float input2_scale, - const uint8_t* input1, - const uint8_t* input2, - uint8_t* output, - uint8_t output_zero_point, - float output_scale, - uint8_t output_min, - uint8_t output_max, - uint32_t flags, - pthreadpool_t threadpool); - enum xnn_status xnn_create_tanh_nc_f16( uint32_t flags, xnn_operator_t* tanh_op_out); diff --git a/src/configs/binary-elementwise-config.c b/src/configs/binary-elementwise-config.c index 0e7750621bb..453db818652 100644 --- a/src/configs/binary-elementwise-config.c +++ b/src/configs/binary-elementwise-config.c @@ -69,7 +69,7 @@ static void init_f16_vadd_config(void) { f16_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vadd_minmax_ukernel__neonfp16arith_u16; f16_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_u16; f16_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_u16; - f16_vadd_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; + f16_vadd_config.init = 
(xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; f16_vadd_config.minmax.element_tile = 16; } #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR @@ -79,7 +79,7 @@ static void init_f16_vadd_config(void) { f16_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vadd_minmax_ukernel__neonfp16arith_u16; f16_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_u16; f16_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_u16; - f16_vadd_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; + f16_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; f16_vadd_config.minmax.element_tile = 16; } #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE @@ -90,7 +90,7 @@ static void init_f16_vadd_config(void) { f16_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vadd_minmax_ukernel__avx512fp16_u64; f16_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_minmax_ukernel__avx512fp16_u64; f16_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_minmax_ukernel__avx512fp16_u64; - f16_vadd_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; + f16_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; f16_vadd_config.minmax.element_tile = 64; } else #endif @@ -98,7 +98,7 @@ static void init_f16_vadd_config(void) { f16_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vadd_minmax_ukernel__f16c_u16; f16_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_minmax_ukernel__f16c_u16; f16_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_minmax_ukernel__f16c_u16; - f16_vadd_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; + f16_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; f16_vadd_config.minmax.element_tile = 16; } #endif @@ -112,7 
+112,7 @@ static void init_f16_vdiv_config(void) { f16_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdiv_minmax_ukernel__fp16arith_u2; f16_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdivc_minmax_ukernel__fp16arith_u2; f16_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrdivc_minmax_ukernel__fp16arith_u2; - f16_vdiv_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; + f16_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; f16_vdiv_config.minmax.element_tile = 2; } #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR @@ -122,7 +122,7 @@ static void init_f16_vdiv_config(void) { f16_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdiv_minmax_ukernel__aarch64_neonfp16arith_u8; f16_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdivc_minmax_ukernel__aarch64_neonfp16arith_u8; f16_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrdivc_minmax_ukernel__aarch64_neonfp16arith_u8; - f16_vdiv_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; + f16_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; f16_vdiv_config.minmax.element_tile = 8; } #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE @@ -133,7 +133,7 @@ static void init_f16_vdiv_config(void) { f16_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdiv_minmax_ukernel__avx512fp16_u64; f16_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdivc_minmax_ukernel__avx512fp16_u64; f16_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrdivc_minmax_ukernel__avx512fp16_u64; - f16_vdiv_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; + f16_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; f16_vdiv_config.minmax.element_tile = 64; } else #endif @@ -141,7 +141,7 @@ static void init_f16_vdiv_config(void) { f16_vdiv_config.minmax.op_ukernel = 
(xnn_vbinary_ukernel_fn) xnn_f16_vdiv_minmax_ukernel__f16c_u8; f16_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdivc_minmax_ukernel__f16c_u8; f16_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrdivc_minmax_ukernel__f16c_u8; - f16_vdiv_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; + f16_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; f16_vdiv_config.minmax.element_tile = 8; } #endif @@ -152,37 +152,37 @@ static void init_f16_vmax_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmax_ukernel__neonfp16arith_u16; - f16_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__neonfp16arith_u16; - f16_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__neonfp16arith_u16; - f16_vmax_config.minmax.element_tile = 16; + f16_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmax_ukernel__neonfp16arith_u16; + f16_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__neonfp16arith_u16; + f16_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__neonfp16arith_u16; + f16_vmax_config.linear.element_tile = 16; } #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmax_ukernel__neonfp16arith_u16; - f16_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__neonfp16arith_u16; - f16_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__neonfp16arith_u16; - f16_vmax_config.minmax.element_tile = 16; + 
f16_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmax_ukernel__neonfp16arith_u16; + f16_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__neonfp16arith_u16; + f16_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__neonfp16arith_u16; + f16_vmax_config.linear.element_tile = 16; } #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); #if XNN_ENABLE_AVX512FP16 if (hardware_config->use_x86_avx512fp16) { - f16_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmax_ukernel__avx512fp16_u64; - f16_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__avx512fp16_u64; - f16_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__avx512fp16_u64; - f16_vmax_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; - f16_vmax_config.minmax.element_tile = 64; + f16_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmax_ukernel__avx512fp16_u64; + f16_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__avx512fp16_u64; + f16_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__avx512fp16_u64; + f16_vmax_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; + f16_vmax_config.linear.element_tile = 64; } else #endif if (hardware_config->use_x86_f16c) { - f16_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmax_ukernel__f16c_u16; - f16_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__f16c_u16; - f16_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__f16c_u16; - f16_vmax_config.minmax.element_tile = 16; + f16_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmax_ukernel__f16c_u16; + f16_vmax_config.linear.opc_ukernel = 
(xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__f16c_u16; + f16_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__f16c_u16; + f16_vmax_config.linear.element_tile = 16; } #endif } @@ -192,37 +192,37 @@ static void init_f16_vmin_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmin_ukernel__neonfp16arith_u16; - f16_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__neonfp16arith_u16; - f16_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__neonfp16arith_u16; - f16_vmin_config.minmax.element_tile = 16; + f16_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmin_ukernel__neonfp16arith_u16; + f16_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__neonfp16arith_u16; + f16_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__neonfp16arith_u16; + f16_vmin_config.linear.element_tile = 16; } #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmin_ukernel__neonfp16arith_u16; - f16_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__neonfp16arith_u16; - f16_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__neonfp16arith_u16; - f16_vmin_config.minmax.element_tile = 16; + f16_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmin_ukernel__neonfp16arith_u16; + f16_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__neonfp16arith_u16; + f16_vmin_config.linear.ropc_ukernel = 
(xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__neonfp16arith_u16; + f16_vmin_config.linear.element_tile = 16; } #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); #if XNN_ENABLE_AVX512FP16 if (hardware_config->use_x86_avx512fp16) { - f16_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmin_ukernel__avx512fp16_u64; - f16_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__avx512fp16_u64; - f16_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__avx512fp16_u64; - f16_vmin_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; - f16_vmin_config.minmax.element_tile = 64; + f16_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmin_ukernel__avx512fp16_u64; + f16_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__avx512fp16_u64; + f16_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__avx512fp16_u64; + f16_vmin_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; + f16_vmin_config.linear.element_tile = 64; } else #endif if (hardware_config->use_x86_f16c) { - f16_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmin_ukernel__f16c_u16; - f16_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__f16c_u16; - f16_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__f16c_u16; - f16_vmin_config.minmax.element_tile = 16; + f16_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmin_ukernel__f16c_u16; + f16_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__f16c_u16; + f16_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__f16c_u16; + f16_vmin_config.linear.element_tile = 16; } #endif } @@ -235,7 +235,7 @@ static void 
init_f16_vmul_config(void) { f16_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmul_minmax_ukernel__neonfp16arith_u16; f16_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_u16; f16_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_u16; - f16_vmul_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; + f16_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; f16_vmul_config.minmax.element_tile = 16; } #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR @@ -245,7 +245,7 @@ static void init_f16_vmul_config(void) { f16_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmul_minmax_ukernel__neonfp16arith_u16; f16_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_u16; f16_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_u16; - f16_vmul_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; + f16_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; f16_vmul_config.minmax.element_tile = 16; } #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE @@ -256,7 +256,7 @@ static void init_f16_vmul_config(void) { f16_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmul_minmax_ukernel__avx512fp16_u64; f16_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_minmax_ukernel__avx512fp16_u64; f16_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_minmax_ukernel__avx512fp16_u64; - f16_vmul_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; + f16_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; f16_vmul_config.minmax.element_tile = 64; } else #endif @@ -264,7 +264,7 @@ static void init_f16_vmul_config(void) { f16_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f16_vmul_minmax_ukernel__f16c_u16; f16_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_minmax_ukernel__f16c_u16; f16_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_minmax_ukernel__f16c_u16; - f16_vmul_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; + f16_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; f16_vmul_config.minmax.element_tile = 16; } #endif @@ -278,7 +278,7 @@ static void init_f16_vsub_config(void) { f16_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsub_minmax_ukernel__neonfp16arith_u16; f16_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsubc_minmax_ukernel__neonfp16arith_u16; f16_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrsubc_minmax_ukernel__neonfp16arith_u16; - f16_vsub_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; + f16_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; f16_vsub_config.minmax.element_tile = 16; } #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR @@ -288,7 +288,7 @@ static void init_f16_vsub_config(void) { f16_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsub_minmax_ukernel__neonfp16arith_u16; f16_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsubc_minmax_ukernel__neonfp16arith_u16; f16_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrsubc_minmax_ukernel__neonfp16arith_u16; - f16_vsub_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; + f16_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; f16_vsub_config.minmax.element_tile = 16; } #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE @@ -299,7 +299,7 @@ static void init_f16_vsub_config(void) { f16_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsub_minmax_ukernel__avx512fp16_u64; f16_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f16_vsubc_minmax_ukernel__avx512fp16_u64; f16_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrsubc_minmax_ukernel__avx512fp16_u64; - f16_vsub_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; + f16_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; f16_vsub_config.minmax.element_tile = 64; } else #endif @@ -307,7 +307,7 @@ static void init_f16_vsub_config(void) { f16_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsub_minmax_ukernel__f16c_u16; f16_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsubc_minmax_ukernel__f16c_u16; f16_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrsubc_minmax_ukernel__f16c_u16; - f16_vsub_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; + f16_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; f16_vsub_config.minmax.element_tile = 16; } #endif @@ -318,40 +318,40 @@ static void init_f16_vsqrdiff_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiff_ukernel__neonfp16arith_u16; - f16_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_u16; - f16_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_u16; - f16_vsqrdiff_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; - f16_vsqrdiff_config.minmax.element_tile = 16; + f16_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiff_ukernel__neonfp16arith_u16; + f16_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_u16; + f16_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_u16; + 
f16_vsqrdiff_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; + f16_vsqrdiff_config.linear.element_tile = 16; } #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiff_ukernel__neonfp16arith_u16; - f16_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_u16; - f16_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_u16; - f16_vsqrdiff_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; - f16_vsqrdiff_config.minmax.element_tile = 16; + f16_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiff_ukernel__neonfp16arith_u16; + f16_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_u16; + f16_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_u16; + f16_vsqrdiff_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; + f16_vsqrdiff_config.linear.element_tile = 16; } #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); #if XNN_ENABLE_AVX512FP16 if (hardware_config->use_x86_avx512fp16) { - f16_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiff_ukernel__avx512fp16_u64; - f16_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__avx512fp16_u64; - f16_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__avx512fp16_u64; - f16_vsqrdiff_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; - f16_vsqrdiff_config.minmax.element_tile = 64; 
+ f16_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiff_ukernel__avx512fp16_u64; + f16_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__avx512fp16_u64; + f16_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__avx512fp16_u64; + f16_vsqrdiff_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; + f16_vsqrdiff_config.linear.element_tile = 64; } else #endif if (hardware_config->use_x86_f16c) { - f16_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiff_ukernel__f16c_u16; - f16_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__f16c_u16; - f16_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__f16c_u16; - f16_vsqrdiff_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; - f16_vsqrdiff_config.minmax.element_tile = 16; + f16_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiff_ukernel__f16c_u16; + f16_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__f16c_u16; + f16_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__f16c_u16; + f16_vsqrdiff_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; + f16_vsqrdiff_config.linear.element_tile = 16; } #endif } @@ -364,20 +364,20 @@ static void init_f32_vadd_config(void) { f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__neon_u8; f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__neon_u8; f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__neon_u8; - f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vadd_config.minmax.element_tile = 8; } else if 
(!XNN_PLATFORM_MOBILE) { f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__scalar_u8; f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__scalar_u8; f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__scalar_u8; - f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vadd_config.minmax.element_tile = 8; } #elif XNN_ARCH_ARM64 f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__neon_u8; f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__neon_u8; f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__neon_u8; - f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vadd_config.minmax.element_tile = 8; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -386,19 +386,19 @@ static void init_f32_vadd_config(void) { f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__avx512f_u32; f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx512f_u32; f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx512f_u32; - f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vadd_config.minmax.element_tile = 32; } else if (hardware_config->use_x86_avx) { f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__avx_u16; f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx_u16; 
f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx_u16; - f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vadd_config.minmax.element_tile = 16; } else { f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__sse_u8; f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__sse_u8; f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__sse_u8; - f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vadd_config.minmax.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -411,7 +411,7 @@ static void init_f32_vadd_config(void) { f32_vadd_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_ukernel__wasmsimd_u16; f32_vadd_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__wasmsimd_u16; f32_vadd_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__wasmsimd_u16; - f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vadd_config.minmax.element_tile = 16; f32_vadd_config.linear.element_tile = 16; } else { @@ -421,7 +421,7 @@ static void init_f32_vadd_config(void) { f32_vadd_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_ukernel__wasmsimd_u16; f32_vadd_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__wasmsimd_u16; f32_vadd_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__wasmsimd_u16; - f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; 
f32_vadd_config.minmax.element_tile = 16; f32_vadd_config.linear.element_tile = 16; } @@ -429,20 +429,20 @@ static void init_f32_vadd_config(void) { f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__wasm_u8; f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__wasm_u8; f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__wasm_u8; - f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vadd_config.minmax.element_tile = 8; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__rvv_u8v; f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__rvv_u8v; f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__rvv_u8v; - f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vadd_config.minmax.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) #else f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__scalar_u8; f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__scalar_u8; f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__scalar_u8; - f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vadd_config.minmax.element_tile = 8; #endif } @@ -564,20 +564,20 @@ static void init_f32_vdiv_config(void) { f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vdiv_minmax_ukernel__scalar_u2; f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__scalar_u2; f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__scalar_u2; - f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vdiv_config.minmax.element_tile = 2; } else if (!XNN_PLATFORM_MOBILE) { f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__scalar_u2; f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__scalar_u2; f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__scalar_u2; - f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vdiv_config.minmax.element_tile = 2; } #elif XNN_ARCH_ARM64 f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__aarch64_neon_u8; f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__aarch64_neon_u8; f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__aarch64_neon_u8; - f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vdiv_config.minmax.element_tile = 8; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -586,19 +586,19 @@ static void init_f32_vdiv_config(void) { f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__avx512f_u32; f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__avx512f_u32; f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vrdivc_minmax_ukernel__avx512f_u32; - f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vdiv_config.minmax.element_tile = 32; } else if (hardware_config->use_x86_avx) { f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__avx_u16; f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__avx_u16; f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__avx_u16; - f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vdiv_config.minmax.element_tile = 16; } else { f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__sse_u8; f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__sse_u8; f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__sse_u8; - f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vdiv_config.minmax.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -611,7 +611,7 @@ static void init_f32_vdiv_config(void) { f32_vdiv_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_ukernel__wasmsimd_u16; f32_vdiv_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_ukernel__wasmsimd_u16; f32_vdiv_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_ukernel__wasmsimd_u16; - f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vdiv_config.minmax.element_tile = 16; f32_vdiv_config.linear.element_tile = 16; } else { @@ -621,7 +621,7 @@ static void 
init_f32_vdiv_config(void) { f32_vdiv_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_ukernel__wasmsimd_u16; f32_vdiv_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_ukernel__wasmsimd_u16; f32_vdiv_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_ukernel__wasmsimd_u16; - f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vdiv_config.minmax.element_tile = 16; f32_vdiv_config.linear.element_tile = 16; } @@ -629,20 +629,20 @@ static void init_f32_vdiv_config(void) { f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__wasm_u8; f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__wasm_u8; f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__wasm_u8; - f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vdiv_config.minmax.element_tile = 8; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__rvv_u8v; f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__rvv_u8v; f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__rvv_u8v; - f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vdiv_config.minmax.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) #else f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__scalar_u2; f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vdivc_minmax_ukernel__scalar_u2; f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__scalar_u2; - f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vdiv_config.minmax.element_tile = 2; #endif } @@ -652,70 +652,70 @@ static void init_f32_vmax_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon){ - f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__neon_u8; - f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__neon_u8; - f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__neon_u8; - f32_vmax_config.minmax.element_tile = 8; + f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__neon_u8; + f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__neon_u8; + f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__neon_u8; + f32_vmax_config.linear.element_tile = 8; } else if (!XNN_PLATFORM_MOBILE) { - f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__scalar_u8; - f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__scalar_u8; - f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__scalar_u8; - f32_vmax_config.minmax.element_tile = 8; + f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__scalar_u8; + f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__scalar_u8; + f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__scalar_u8; + f32_vmax_config.linear.element_tile = 8; } #elif XNN_ARCH_ARM64 - f32_vmax_config.minmax.op_ukernel = 
(xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__neon_u8; - f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__neon_u8; - f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__neon_u8; - f32_vmax_config.minmax.element_tile = 8; + f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__neon_u8; + f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__neon_u8; + f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__neon_u8; + f32_vmax_config.linear.element_tile = 8; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { - f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__avx512f_u32; - f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx512f_u32; - f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx512f_u32; - f32_vmax_config.minmax.element_tile = 32; + f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__avx512f_u32; + f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx512f_u32; + f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx512f_u32; + f32_vmax_config.linear.element_tile = 32; } else if (hardware_config->use_x86_avx) { - f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__avx_u16; - f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx_u16; - f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx_u16; - f32_vmax_config.minmax.element_tile = 16; + f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__avx_u16; + 
f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx_u16; + f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx_u16; + f32_vmax_config.linear.element_tile = 16; } else { - f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__sse_u8; - f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__sse_u8; - f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__sse_u8; - f32_vmax_config.minmax.element_tile = 8; + f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__sse_u8; + f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__sse_u8; + f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__sse_u8; + f32_vmax_config.linear.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->is_x86) { - f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__wasmsimd_x86_u16; - f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasmsimd_x86_u16; - f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasmsimd_x86_u16; - f32_vmax_config.minmax.element_tile = 16; + f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__wasmsimd_x86_u16; + f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasmsimd_x86_u16; + f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasmsimd_x86_u16; + f32_vmax_config.linear.element_tile = 16; } else { - f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__wasmsimd_arm_u16; - f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vmaxc_ukernel__wasmsimd_arm_u16; - f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasmsimd_arm_u16; - f32_vmax_config.minmax.element_tile = 16; + f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__wasmsimd_arm_u16; + f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasmsimd_arm_u16; + f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasmsimd_arm_u16; + f32_vmax_config.linear.element_tile = 16; } #elif XNN_ARCH_WASM - f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__wasm_u8; - f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasm_u8; - f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasm_u8; - f32_vmax_config.minmax.element_tile = 8; + f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__wasm_u8; + f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasm_u8; + f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasm_u8; + f32_vmax_config.linear.element_tile = 8; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__rvv_u8v; - f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__rvv_u8v; - f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__rvv_u8v; - f32_vmax_config.minmax.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) + f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__rvv_u8v; + f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__rvv_u8v; + f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vmaxc_ukernel__rvv_u8v; + f32_vmax_config.linear.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) #else - f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__scalar_u8; - f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__scalar_u8; - f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__scalar_u8; - f32_vmax_config.minmax.element_tile = 8; + f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__scalar_u8; + f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__scalar_u8; + f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__scalar_u8; + f32_vmax_config.linear.element_tile = 8; #endif } @@ -724,70 +724,70 @@ static void init_f32_vmin_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon){ - f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__neon_u8; - f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__neon_u8; - f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__neon_u8; - f32_vmin_config.minmax.element_tile = 8; + f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__neon_u8; + f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__neon_u8; + f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__neon_u8; + f32_vmin_config.linear.element_tile = 8; } else if (!XNN_PLATFORM_MOBILE) { - f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__scalar_u8; - f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__scalar_u8; - f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vminc_ukernel__scalar_u8; - f32_vmin_config.minmax.element_tile = 8; + f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__scalar_u8; + f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__scalar_u8; + f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__scalar_u8; + f32_vmin_config.linear.element_tile = 8; } #elif XNN_ARCH_ARM64 - f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__neon_u8; - f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__neon_u8; - f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__neon_u8; - f32_vmin_config.minmax.element_tile = 8; + f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__neon_u8; + f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__neon_u8; + f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__neon_u8; + f32_vmin_config.linear.element_tile = 8; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { - f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__avx512f_u32; - f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx512f_u32; - f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx512f_u32; - f32_vmin_config.minmax.element_tile = 32; + f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__avx512f_u32; + f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx512f_u32; + f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx512f_u32; + f32_vmin_config.linear.element_tile = 32; } 
else if (hardware_config->use_x86_avx) { - f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__avx_u16; - f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx_u16; - f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx_u16; - f32_vmin_config.minmax.element_tile = 16; + f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__avx_u16; + f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx_u16; + f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx_u16; + f32_vmin_config.linear.element_tile = 16; } else { - f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__sse_u8; - f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__sse_u8; - f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__sse_u8; - f32_vmin_config.minmax.element_tile = 8; + f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__sse_u8; + f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__sse_u8; + f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__sse_u8; + f32_vmin_config.linear.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->is_x86) { - f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__wasmsimd_x86_u16; - f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasmsimd_x86_u16; - f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasmsimd_x86_u16; - f32_vmin_config.minmax.element_tile = 16; + f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vmin_ukernel__wasmsimd_x86_u16; + f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasmsimd_x86_u16; + f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasmsimd_x86_u16; + f32_vmin_config.linear.element_tile = 16; } else { - f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__wasmsimd_arm_u16; - f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasmsimd_arm_u16; - f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasmsimd_arm_u16; - f32_vmin_config.minmax.element_tile = 16; + f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__wasmsimd_arm_u16; + f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasmsimd_arm_u16; + f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasmsimd_arm_u16; + f32_vmin_config.linear.element_tile = 16; } #elif XNN_ARCH_WASM - f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__wasm_u8; - f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasm_u8; - f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasm_u8; - f32_vmin_config.minmax.element_tile = 8; + f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__wasm_u8; + f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasm_u8; + f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasm_u8; + f32_vmin_config.linear.element_tile = 8; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__rvv_u8v; - f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vminc_ukernel__rvv_u8v; - f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__rvv_u8v; - f32_vmin_config.minmax.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) + f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__rvv_u8v; + f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__rvv_u8v; + f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__rvv_u8v; + f32_vmin_config.linear.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) #else - f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__scalar_u8; - f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__scalar_u8; - f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__scalar_u8; - f32_vmin_config.minmax.element_tile = 8; + f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__scalar_u8; + f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__scalar_u8; + f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__scalar_u8; + f32_vmin_config.linear.element_tile = 8; #endif } @@ -799,20 +799,20 @@ static void init_f32_vmul_config(void) { f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__neon_u8; f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__neon_u8; f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__neon_u8; - f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vmul_config.minmax.element_tile = 8; } else if (!XNN_PLATFORM_MOBILE) { f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vmul_minmax_ukernel__scalar_u8; f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__scalar_u8; f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__scalar_u8; - f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vmul_config.minmax.element_tile = 8; } #elif XNN_ARCH_ARM64 f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__neon_u8; f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__neon_u8; f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__neon_u8; - f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vmul_config.minmax.element_tile = 8; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -821,19 +821,19 @@ static void init_f32_vmul_config(void) { f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__avx512f_u32; f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx512f_u32; f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx512f_u32; - f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vmul_config.minmax.element_tile = 32; } else if (hardware_config->use_x86_avx) { f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__avx_u16; f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx_u16; f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vmulc_minmax_ukernel__avx_u16; - f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vmul_config.minmax.element_tile = 16; } else { f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__sse_u8; f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__sse_u8; f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__sse_u8; - f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vmul_config.minmax.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -846,7 +846,7 @@ static void init_f32_vmul_config(void) { f32_vmul_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_ukernel__wasmsimd_u16; f32_vmul_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__wasmsimd_u16; f32_vmul_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__wasmsimd_u16; - f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vmul_config.minmax.element_tile = 16; f32_vmul_config.linear.element_tile = 16; } else { @@ -856,7 +856,7 @@ static void init_f32_vmul_config(void) { f32_vmul_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_ukernel__wasmsimd_u16; f32_vmul_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__wasmsimd_u16; f32_vmul_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__wasmsimd_u16; - f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vmul_config.minmax.element_tile = 16; 
f32_vmul_config.linear.element_tile = 16; } @@ -864,20 +864,20 @@ static void init_f32_vmul_config(void) { f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__wasm_u8; f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__wasm_u8; f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__wasm_u8; - f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vmul_config.minmax.element_tile = 8; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__rvv_u8v; f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__rvv_u8v; f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__rvv_u8v; - f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vmul_config.minmax.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) #else f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__scalar_u8; f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__scalar_u8; f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__scalar_u8; - f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vmul_config.minmax.element_tile = 8; #endif } @@ -890,20 +890,20 @@ static void init_f32_vsub_config(void) { f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__neon_u8; 
f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__neon_u8; f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__neon_u8; - f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vsub_config.minmax.element_tile = 8; } else if (!XNN_PLATFORM_MOBILE) { f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__scalar_u8; f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__scalar_u8; f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__scalar_u8; - f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vsub_config.minmax.element_tile = 8; } #elif XNN_ARCH_ARM64 f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__neon_u8; f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__neon_u8; f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__neon_u8; - f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vsub_config.minmax.element_tile = 8; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -912,19 +912,19 @@ static void init_f32_vsub_config(void) { f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__avx512f_u32; f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__avx512f_u32; f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__avx512f_u32; - 
f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vsub_config.minmax.element_tile = 32; } else if (hardware_config->use_x86_avx) { f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__avx_u16; f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__avx_u16; f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__avx_u16; - f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vsub_config.minmax.element_tile = 16; } else { f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__sse_u8; f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__sse_u8; f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__sse_u8; - f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vsub_config.minmax.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -937,7 +937,7 @@ static void init_f32_vsub_config(void) { f32_vsub_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_ukernel__wasmsimd_u16; f32_vsub_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_ukernel__wasmsimd_u16; f32_vsub_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_ukernel__wasmsimd_u16; - f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vsub_config.minmax.element_tile = 16; f32_vsub_config.linear.element_tile = 16; } else { @@ -947,7 +947,7 @@ static void init_f32_vsub_config(void) { 
f32_vsub_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_ukernel__wasmsimd_u16; f32_vsub_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_ukernel__wasmsimd_u16; f32_vsub_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_ukernel__wasmsimd_u16; - f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vsub_config.minmax.element_tile = 16; f32_vsub_config.linear.element_tile = 16; } @@ -955,20 +955,20 @@ static void init_f32_vsub_config(void) { f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__wasm_u8; f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__wasm_u8; f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__wasm_u8; - f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vsub_config.minmax.element_tile = 8; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__rvv_u8v; f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__rvv_u8v; f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__rvv_u8v; - f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vsub_config.minmax.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) #else f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__scalar_u8; f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vsubc_minmax_ukernel__scalar_u8; f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__scalar_u8; - f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; f32_vsub_config.minmax.element_tile = 8; #endif } @@ -978,56 +978,56 @@ static void init_f32_vsqrdiff_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon){ - f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__neon_u8; - f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__neon_u8; - f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__neon_u8; - f32_vsqrdiff_config.minmax.element_tile = 8; + f32_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__neon_u8; + f32_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__neon_u8; + f32_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__neon_u8; + f32_vsqrdiff_config.linear.element_tile = 8; } else if (!XNN_PLATFORM_MOBILE) { - f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__scalar_u8; - f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_u8; - f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_u8; - f32_vsqrdiff_config.minmax.element_tile = 8; + f32_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__scalar_u8; + f32_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_u8; + f32_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vsqrdiffc_ukernel__scalar_u8; + f32_vsqrdiff_config.linear.element_tile = 8; } #elif XNN_ARCH_ARM64 - f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__neon_u8; - f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__neon_u8; - f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__neon_u8; - f32_vsqrdiff_config.minmax.element_tile = 8; + f32_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__neon_u8; + f32_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__neon_u8; + f32_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__neon_u8; + f32_vsqrdiff_config.linear.element_tile = 8; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { - f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__avx512f_u32; - f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx512f_u32; - f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx512f_u32; - f32_vsqrdiff_config.minmax.element_tile = 32; + f32_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__avx512f_u32; + f32_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx512f_u32; + f32_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx512f_u32; + f32_vsqrdiff_config.linear.element_tile = 32; } else if (hardware_config->use_x86_avx) { - f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__avx_u16; - f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vsqrdiffc_ukernel__avx_u16; - f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx_u16; - f32_vsqrdiff_config.minmax.element_tile = 16; + f32_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__avx_u16; + f32_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx_u16; + f32_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx_u16; + f32_vsqrdiff_config.linear.element_tile = 16; } else { - f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__sse_u8; - f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__sse_u8; - f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__sse_u8; - f32_vsqrdiff_config.minmax.element_tile = 8; + f32_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__sse_u8; + f32_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__sse_u8; + f32_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__sse_u8; + f32_vsqrdiff_config.linear.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__wasmsimd_u16; - f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__wasmsimd_u16; - f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__wasmsimd_u16; - f32_vsqrdiff_config.minmax.element_tile = 16; + f32_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__wasmsimd_u16; + f32_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__wasmsimd_u16; + f32_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vsqrdiffc_ukernel__wasmsimd_u16; + f32_vsqrdiff_config.linear.element_tile = 16; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__rvv_u8v; - f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__rvv_u8v; - f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__rvv_u8v; - f32_vsqrdiff_config.minmax.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) + f32_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__rvv_u8v; + f32_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__rvv_u8v; + f32_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__rvv_u8v; + f32_vsqrdiff_config.linear.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) #else - f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__scalar_u8; - f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_u8; - f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_u8; - f32_vsqrdiff_config.minmax.element_tile = 8; + f32_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__scalar_u8; + f32_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_u8; + f32_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_u8; + f32_vsqrdiff_config.linear.element_tile = 8; #endif } @@ -1039,20 +1039,20 @@ static void init_qs8_vadd_config(void) { qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__neon_ld64_u16; qs8_vadd_config.minmax.opc_ukernel = 
(xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_u16; qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_u16; - qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_scalar_params; + qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; qs8_vadd_config.minmax.element_tile = 16; } else if (!XNN_PLATFORM_MOBILE) { qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__scalar_u1; qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__scalar_u1; qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__scalar_u1; - qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_scalar_params; + qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; qs8_vadd_config.minmax.element_tile = 1; } #elif XNN_ARCH_ARM64 qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__neon_ld64_u32; qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_u32; qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_u32; - qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_scalar_params; + qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; qs8_vadd_config.minmax.element_tile = 32; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -1061,44 +1061,44 @@ static void init_qs8_vadd_config(void) { qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_u16; qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16; qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16; - qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_scalar_params; + qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; qs8_vadd_config.minmax.element_tile = 16; } else if (hardware_config->use_x86_avx2) { qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_u16; qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16; qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16; - qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_scalar_params; + qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; qs8_vadd_config.minmax.element_tile = 16; } else if (hardware_config->use_x86_avx) { qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_u8; qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_u8; qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_u8; - qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_scalar_params; + qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; qs8_vadd_config.minmax.element_tile = 8; } else if (hardware_config->use_x86_sse4_1) { qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_u8; qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_u8; qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_u8; - qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_scalar_params; + qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; 
qs8_vadd_config.minmax.element_tile = 8; } else { qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_u8; qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_u8; qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_u8; - qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_scalar_params; + qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; qs8_vadd_config.minmax.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__wasmsimd_u32; qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_u32; qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_u32; - qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_scalar_params; + qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; qs8_vadd_config.minmax.element_tile = 32; #else qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__scalar_u4; qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__scalar_u4; qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__scalar_u4; - qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_scalar_params; + qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; qs8_vadd_config.minmax.element_tile = 4; #endif } @@ -1111,20 +1111,20 @@ static void init_qs8_vmul_config(void) { qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_u16; qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; 
qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; - qs8_vmul_config.init.qs8_mul = xnn_init_qs8_mul_minmax_rndnu_neon_params; + qs8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_mul_minmax_rndnu_neon_params; qs8_vmul_config.minmax.element_tile = 16; } else if (!XNN_PLATFORM_MOBILE) { qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_u4; qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_u4; qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_u4; - qs8_vmul_config.init.qs8_mul = xnn_init_qs8_mul_minmax_scalar_params; + qs8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_mul_minmax_scalar_params; qs8_vmul_config.minmax.element_tile = 4; } #elif XNN_ARCH_ARM64 qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_u16; qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; - qs8_vmul_config.init.qs8_mul = xnn_init_qs8_mul_minmax_rndnu_neon_params; + qs8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_mul_minmax_rndnu_neon_params; qs8_vmul_config.minmax.element_tile = 16; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -1133,38 +1133,38 @@ static void init_qs8_vmul_config(void) { qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_u16; qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_u16; qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_u16; 
- qs8_vmul_config.init.qs8_mul = xnn_init_qs8_mul_minmax_scalar_params; + qs8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_mul_minmax_scalar_params; qs8_vmul_config.minmax.element_tile = 16; } else if (hardware_config->use_x86_sse4_1) { qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_u16; qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_u16; qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_u16; - qs8_vmul_config.init.qs8_mul = xnn_init_qs8_mul_minmax_scalar_params; + qs8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_mul_minmax_scalar_params; qs8_vmul_config.minmax.element_tile = 16; } else { qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_u8; qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_u8; qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_u8; - qs8_vmul_config.init.qs8_mul = xnn_init_qs8_mul_minmax_scalar_params; + qs8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_mul_minmax_scalar_params; qs8_vmul_config.minmax.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_u8; qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_u8; qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_u8; - qs8_vmul_config.init.qs8_mul = xnn_init_qs8_mul_minmax_scalar_params; + qs8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_mul_minmax_scalar_params; qs8_vmul_config.minmax.element_tile = 8; 
#elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__rvv_u2v; qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__rvv_u2v; qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__rvv_u2v; - qs8_vmul_config.init.qs8_mul = xnn_init_qs8_mul_minmax_scalar_params; + qs8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_mul_minmax_scalar_params; qs8_vmul_config.minmax.element_tile = 2; #else qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_u4; qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_u4; qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_u4; - qs8_vmul_config.init.qs8_mul = xnn_init_qs8_mul_minmax_scalar_params; + qs8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_mul_minmax_scalar_params; qs8_vmul_config.minmax.element_tile = 4; #endif } @@ -1177,20 +1177,20 @@ static void init_qu8_vadd_config(void) { qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__neon_ld64_u16; qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_u16; qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_u16; - qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_scalar_params; + qu8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; qu8_vadd_config.minmax.element_tile = 8; } else if (!XNN_PLATFORM_MOBILE) { qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__scalar_u1; qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__scalar_u1; qu8_vadd_config.minmax.ropc_ukernel = 
(xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__scalar_u1; - qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_scalar_params; + qu8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; qu8_vadd_config.minmax.element_tile = 1; } #elif XNN_ARCH_ARM64 qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__neon_ld64_u32; qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_u32; qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_u32; - qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_scalar_params; + qu8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; qu8_vadd_config.minmax.element_tile = 8; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -1199,44 +1199,44 @@ static void init_qu8_vadd_config(void) { qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_u16; qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16; qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16; - qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_scalar_params; + qu8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; qu8_vadd_config.minmax.element_tile = 16; } else if (hardware_config->use_x86_avx2) { qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_u16; qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16; qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16; - qu8_vadd_config.init.qu8_add = 
xnn_init_qu8_add_minmax_scalar_params; + qu8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; qu8_vadd_config.minmax.element_tile = 16; } else if (hardware_config->use_x86_avx) { qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__avx_mul32_ld32_u8; qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_u8; qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_u8; - qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_scalar_params; + qu8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; qu8_vadd_config.minmax.element_tile = 8; } else if (hardware_config->use_x86_sse4_1) { qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__sse41_mul16_ld64_u8; qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_u8; qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_u8; - qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_scalar_params; + qu8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; qu8_vadd_config.minmax.element_tile = 8; } else { qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_u8; qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_u8; qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_u8; - qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_scalar_params; + qu8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; qu8_vadd_config.minmax.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD qu8_vadd_config.minmax.op_ukernel = 
(xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__wasmsimd_u32; qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_u32; qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_u32; - qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_scalar_params; + qu8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; qu8_vadd_config.minmax.element_tile = 32; #else qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__scalar_u4; qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__scalar_u4; qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__scalar_u4; - qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_scalar_params; + qu8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; qu8_vadd_config.minmax.element_tile = 4; #endif } @@ -1249,20 +1249,20 @@ static void init_qu8_vmul_config(void) { qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_u16; qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; - qu8_vmul_config.init.qu8_mul = xnn_init_qu8_mul_minmax_rndnu_neon_params; + qu8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_mul_minmax_rndnu_neon_params; qu8_vmul_config.minmax.element_tile = 16; } else if (!XNN_PLATFORM_MOBILE) { qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_u4; qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_u4; qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_u4; - 
qu8_vmul_config.init.qu8_mul = xnn_init_qu8_mul_minmax_scalar_params; + qu8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_mul_minmax_scalar_params; qu8_vmul_config.minmax.element_tile = 4; } #elif XNN_ARCH_ARM64 qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_u16; qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; - qu8_vmul_config.init.qu8_mul = xnn_init_qu8_mul_minmax_rndnu_neon_params; + qu8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_mul_minmax_rndnu_neon_params; qu8_vmul_config.minmax.element_tile = 16; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -1271,38 +1271,38 @@ static void init_qu8_vmul_config(void) { qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_u16; qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_u16; qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_u16; - qu8_vmul_config.init.qu8_mul = xnn_init_qu8_mul_minmax_scalar_params; + qu8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_mul_minmax_scalar_params; qu8_vmul_config.minmax.element_tile = 16; } else if (hardware_config->use_x86_sse4_1) { qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_u16; qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_u16; qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_u16; - qu8_vmul_config.init.qu8_mul = xnn_init_qu8_mul_minmax_scalar_params; + 
qu8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_mul_minmax_scalar_params; qu8_vmul_config.minmax.element_tile = 16; } else { qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_u8; qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_u8; qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_u8; - qu8_vmul_config.init.qu8_mul = xnn_init_qu8_mul_minmax_scalar_params; + qu8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_mul_minmax_scalar_params; qu8_vmul_config.minmax.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_u8; qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_u8; qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_u8; - qu8_vmul_config.init.qu8_mul = xnn_init_qu8_mul_minmax_scalar_params; + qu8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_mul_minmax_scalar_params; qu8_vmul_config.minmax.element_tile = 8; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__rvv_u2v; qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__rvv_u2v; qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__rvv_u2v; - qu8_vmul_config.init.qu8_mul = xnn_init_qu8_mul_minmax_scalar_params; + qu8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_mul_minmax_scalar_params; qu8_vmul_config.minmax.element_tile = 2; #else qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_u4; 
qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_u4; qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_u4; - qu8_vmul_config.init.qu8_mul = xnn_init_qu8_mul_minmax_scalar_params; + qu8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_mul_minmax_scalar_params; qu8_vmul_config.minmax.element_tile = 4; #endif } diff --git a/src/enums/operator-type.c b/src/enums/operator-type.c index a1d9acaf394..3e1fd4254ed 100644 --- a/src/enums/operator-type.c +++ b/src/enums/operator-type.c @@ -12,26 +12,23 @@ #include "xnnpack/operator-type.h" -static const uint16_t offset[171] = { +static const uint16_t offset[170] = { 0, 8, 22, 36, 50, 64, 78, 92, 119, 147, 175, 203, 230, 257, 289, 321, 364, 382, 400, 425, 451, 467, 483, 498, 513, 535, 558, 581, 604, 627, 650, 673, 696, 719, 742, 760, 783, 806, 830, 848, 871, 895, 919, 943, 967, 1002, 1037, 1061, 1085, 1109, 1123, 1138, 1153, 1173, 1199, 1225, 1262, 1288, 1318, 1344, 1376, 1408, 1434, 1461, 1488, 1505, 1522, 1556, 1590, 1604, 1618, 1632, 1646, 1662, 1678, 1704, 1730, 1762, 1794, 1831, 1868, 1905, 1942, 1979, 2016, 2053, 2079, 2111, 2137, 2152, 2186, 2220, 2254, 2288, 2322, 2356, 2386, 2416, 2436, 2456, 2477, 2498, 2519, 2540, 2554, - 2578, 2602, 2625, 2648, 2666, 2684, 2699, 2714, 2729, 2744, 2762, 2780, 2799, 2818, 2837, 2856, 2875, 2892, 2909, - 2925, 2941, 2974, 3007, 3035, 3063, 3091, 3119, 3146, 3173, 3190, 3207, 3248, 3289, 3307, 3325, 3343, 3361, 3376, - 3392, 3408, 3426, 3444, 3462, 3488, 3515, 3542, 3559, 3576, 3598, 3620, 3649, 3678, 3697, 3716, 3735, 3754, 3769, - 3784, 3799, 3814, 3833, 3853, 3873, 3893, 3914, 3935 + 2578, 2602, 2625, 2648, 2666, 2684, 2699, 2714, 2729, 2747, 2765, 2784, 2803, 2822, 2841, 2860, 2877, 2894, 2910, + 2926, 2959, 2992, 3020, 3048, 3076, 3104, 3131, 3158, 3175, 3192, 3233, 3274, 3292, 3310, 3328, 3346, 3361, 3377, + 3393, 3411, 3429, 3447, 3473, 3500, 3527, 3544, 3561, 
3583, 3605, 3634, 3663, 3682, 3701, 3720, 3739, 3754, 3769, + 3784, 3799, 3818, 3838, 3858, 3878, 3899, 3920 }; static const char data[] = "Invalid\0" "Abs (NC, F16)\0" "Abs (NC, F32)\0" - "Add (ND, F16)\0" - "Add (ND, F32)\0" - "Add (ND, QS8)\0" - "Add (ND, QU8)\0" + "Add (ND)\0" "ArgMax Pooling (NHWC, F32)\0" "Average Pooling (NHWC, F16)\0" "Average Pooling (NHWC, F32)\0" @@ -77,7 +74,7 @@ static const char data[] = "Copy (NC, X8)\0" "Copy (NC, X16)\0" "Copy (NC, X32)\0" - "Copy Sign (NC, F32)\0" + "Copy Sign (ND)\0" "Deconvolution (NHWC, F16)\0" "Deconvolution (NHWC, F32)\0" "Deconvolution (NHWC, QD8, F32, QC8W)\0" @@ -89,8 +86,7 @@ static const char data[] = "Depth To Space (NHWC, X8)\0" "Depth To Space (NHWC, X16)\0" "Depth To Space (NHWC, X32)\0" - "Divide (ND, F16)\0" - "Divide (ND, F32)\0" + "Divide (ND)\0" "Dynamic Fully Connected (NC, F16)\0" "Dynamic Fully Connected (NC, F32)\0" "ELU (NC, F16)\0" @@ -133,19 +129,13 @@ static const char data[] = "Max Pooling (NHWC, F32)\0" "Max Pooling (NHWC, S8)\0" "Max Pooling (NHWC, U8)\0" - "Maximum (ND, F16)\0" - "Maximum (ND, F32)\0" + "Maximum (ND)\0" "Mean (ND, F16)\0" "Mean (ND, F32)\0" "Mean (ND, QS8)\0" "Mean (ND, QU8)\0" - "Minimum (ND, F16)\0" - "Minimum (ND, F32)\0" - "Multiply (ND, F16)\0" - "Multiply (ND, F32)\0" - "Multiply (ND, QS8)\0" - "Multiply (ND, QU8)\0" - "Multiply (ND, S32)\0" + "Minimum (ND)\0" + "Multiply (ND)\0" "Negate (NC, F16)\0" "Negate (NC, F32)\0" "PReLU (NC, F16)\0" @@ -179,12 +169,8 @@ static const char data[] = "Square (NC, F32)\0" "Square Root (NC, F16)\0" "Square Root (NC, F32)\0" - "Squared Difference (NC, F16)\0" - "Squared Difference (NC, F32)\0" - "Subtract (ND, F16)\0" - "Subtract (ND, F32)\0" - "Subtract (ND, QS8)\0" - "Subtract (ND, QU8)\0" + "Squared Difference (NC)\0" + "Subtract (ND)\0" "Tanh (NC, F16)\0" "Tanh (NC, F32)\0" "Tanh (NC, QS8)\0" diff --git a/src/enums/operator-type.yaml b/src/enums/operator-type.yaml index 8a2741526ea..ebb35073453 100644 --- 
a/src/enums/operator-type.yaml +++ b/src/enums/operator-type.yaml @@ -11,14 +11,8 @@ string: "Abs (NC, F16)" - name: xnn_operator_type_abs_nc_f32 string: "Abs (NC, F32)" -- name: xnn_operator_type_add_nd_f16 - string: "Add (ND, F16)" -- name: xnn_operator_type_add_nd_f32 - string: "Add (ND, F32)" -- name: xnn_operator_type_add_nd_qs8 - string: "Add (ND, QS8)" -- name: xnn_operator_type_add_nd_qu8 - string: "Add (ND, QU8)" +- name: xnn_operator_type_add + string: "Add (ND)" - name: xnn_operator_type_argmax_pooling_nhwc_f32 string: "ArgMax Pooling (NHWC, F32)" - name: xnn_operator_type_average_pooling_nhwc_f16 @@ -109,8 +103,8 @@ string: "Copy (NC, X16)" - name: xnn_operator_type_copy_nc_x32 string: "Copy (NC, X32)" -- name: xnn_operator_type_copysign_nd_f32 - string: "Copy Sign (NC, F32)" +- name: xnn_operator_type_copysign + string: "Copy Sign (ND)" - name: xnn_operator_type_deconvolution_nhwc_f16 string: "Deconvolution (NHWC, F16)" - name: xnn_operator_type_deconvolution_nhwc_f32 @@ -133,10 +127,8 @@ string: "Depth To Space (NHWC, X16)" - name: xnn_operator_type_depth_to_space_nhwc_x32 string: "Depth To Space (NHWC, X32)" -- name: xnn_operator_type_divide_nd_f16 - string: "Divide (ND, F16)" -- name: xnn_operator_type_divide_nd_f32 - string: "Divide (ND, F32)" +- name: xnn_operator_type_divide + string: "Divide (ND)" - name: xnn_operator_type_dynamic_fully_connected_nc_f16 string: "Dynamic Fully Connected (NC, F16)" - name: xnn_operator_type_dynamic_fully_connected_nc_f32 @@ -221,10 +213,8 @@ string: "Max Pooling (NHWC, S8)" - name: xnn_operator_type_max_pooling_nhwc_u8 string: "Max Pooling (NHWC, U8)" -- name: xnn_operator_type_maximum_nd_f16 - string: "Maximum (ND, F16)" -- name: xnn_operator_type_maximum_nd_f32 - string: "Maximum (ND, F32)" +- name: xnn_operator_type_maximum + string: "Maximum (ND)" - name: xnn_operator_type_mean_nd_f16 string: "Mean (ND, F16)" - name: xnn_operator_type_mean_nd_f32 @@ -233,20 +223,10 @@ string: "Mean (ND, QS8)" - name: 
xnn_operator_type_mean_nd_qu8 string: "Mean (ND, QU8)" -- name: xnn_operator_type_minimum_nd_f16 - string: "Minimum (ND, F16)" -- name: xnn_operator_type_minimum_nd_f32 - string: "Minimum (ND, F32)" -- name: xnn_operator_type_multiply_nd_f16 - string: "Multiply (ND, F16)" -- name: xnn_operator_type_multiply_nd_f32 - string: "Multiply (ND, F32)" -- name: xnn_operator_type_multiply_nd_qs8 - string: "Multiply (ND, QS8)" -- name: xnn_operator_type_multiply_nd_qu8 - string: "Multiply (ND, QU8)" -- name: xnn_operator_type_multiply_nd_s32 - string: "Multiply (ND, S32)" +- name: xnn_operator_type_minimum + string: "Minimum (ND)" +- name: xnn_operator_type_multiply + string: "Multiply (ND)" - name: xnn_operator_type_negate_nc_f16 string: "Negate (NC, F16)" - name: xnn_operator_type_negate_nc_f32 @@ -313,18 +293,10 @@ string: "Square Root (NC, F16)" - name: xnn_operator_type_square_root_nc_f32 string: "Square Root (NC, F32)" -- name: xnn_operator_type_squared_difference_nd_f16 - string: "Squared Difference (NC, F16)" -- name: xnn_operator_type_squared_difference_nd_f32 - string: "Squared Difference (NC, F32)" -- name: xnn_operator_type_subtract_nd_f16 - string: "Subtract (ND, F16)" -- name: xnn_operator_type_subtract_nd_f32 - string: "Subtract (ND, F32)" -- name: xnn_operator_type_subtract_nd_qs8 - string: "Subtract (ND, QS8)" -- name: xnn_operator_type_subtract_nd_qu8 - string: "Subtract (ND, QU8)" +- name: xnn_operator_type_squared_difference + string: "Squared Difference (NC)" +- name: xnn_operator_type_subtract + string: "Subtract (ND)" - name: xnn_operator_type_tanh_nc_f16 string: "Tanh (NC, F16)" - name: xnn_operator_type_tanh_nc_f32 diff --git a/src/microparams-init.c b/src/microparams-init.c index a3fa0b7c019..6d4d38c38bd 100644 --- a/src/microparams-init.c +++ b/src/microparams-init.c @@ -11,6 +11,7 @@ #include #include +#include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/math.h" #include "xnnpack/microparams.h" @@ -1802,16 +1803,36 @@ size_t 
xnn_init_u8_minmax_scalar_params( return sizeof(params->scalar); } +size_t xnn_init_f16_minmax_binary_params( + union xnn_f16_minmax_params uparams[XNN_MIN_ELEMENTS(1)], + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization) { + uparams->scalar.min = xnn_float16_from_float(-INFINITY); + uparams->scalar.max = xnn_float16_from_float(+INFINITY); + return sizeof(uparams->scalar); +} + +size_t xnn_init_f32_minmax_binary_params( + union xnn_f32_minmax_params uparams[XNN_MIN_ELEMENTS(1)], + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization) { + uparams->scalar.min = -INFINITY; + uparams->scalar.max = +INFINITY; + return sizeof(uparams->scalar); +} + size_t xnn_init_qu8_add_minmax_scalar_params( - struct xnn_qu8_add_minmax_params params[XNN_MIN_ELEMENTS(1)], - uint8_t a_zero_point, - uint8_t b_zero_point, - uint8_t output_zero_point, - float a_output_scale, - float b_output_scale, - uint8_t output_min, - uint8_t output_max) -{ + struct xnn_qu8_add_minmax_params uparams[XNN_MIN_ELEMENTS(1)], + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization) { + assert(a_quantization); + assert(b_quantization); + assert(output_quantization); + const float a_output_scale = a_quantization->scale / output_quantization->scale; + const float b_output_scale = b_quantization->scale / output_quantization->scale; const float abs_a_output_scale = fabsf(a_output_scale); const float abs_b_output_scale = fabsf(b_output_scale); assert(abs_a_output_scale >= 0x1.0p-10f); @@ -1842,28 +1863,30 @@ size_t xnn_init_qu8_add_minmax_scalar_params( const int32_t b_multiplier = signbit(b_output_scale) ? 
-abs_b_multiplier : abs_b_multiplier; const int32_t rounding = INT32_C(1) << (shift - 1); - params->scalar.bias = rounding - a_multiplier * (int32_t) (uint32_t) a_zero_point - b_multiplier * (int32_t) (uint32_t) b_zero_point; - params->scalar.a_zero_point = a_zero_point; - params->scalar.b_zero_point = b_zero_point; - params->scalar.a_multiplier = a_multiplier; - params->scalar.b_multiplier = b_multiplier; - params->scalar.shift = shift; - params->scalar.output_min = (int32_t) (uint32_t) output_min; - params->scalar.output_max = (int32_t) (uint32_t) output_max; - params->scalar.output_zero_point = (int32_t) (uint32_t) output_zero_point; - return sizeof(params->scalar); + uparams->scalar.bias = rounding - + a_multiplier * (int32_t)(uint32_t)a_quantization->zero_point - + b_multiplier * (int32_t)(uint32_t)b_quantization->zero_point; + uparams->scalar.a_zero_point = a_quantization->zero_point; + uparams->scalar.b_zero_point = b_quantization->zero_point; + uparams->scalar.a_multiplier = a_multiplier; + uparams->scalar.b_multiplier = b_multiplier; + uparams->scalar.shift = shift; + uparams->scalar.output_min = 0; + uparams->scalar.output_max = UINT8_MAX; + uparams->scalar.output_zero_point = (int32_t)(uint32_t)output_quantization->zero_point; + return sizeof(uparams->scalar); } size_t xnn_init_qs8_add_minmax_scalar_params( - struct xnn_qs8_add_minmax_params params[XNN_MIN_ELEMENTS(1)], - int8_t a_zero_point, - int8_t b_zero_point, - int8_t output_zero_point, - float a_output_scale, - float b_output_scale, - int8_t output_min, - int8_t output_max) -{ + struct xnn_qs8_add_minmax_params uparams[XNN_MIN_ELEMENTS(1)], + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization) { + assert(a_quantization); + assert(b_quantization); + assert(output_quantization); + const float a_output_scale = a_quantization->scale / output_quantization->scale; + const float 
b_output_scale = b_quantization->scale / output_quantization->scale; const float abs_a_output_scale = fabsf(a_output_scale); const float abs_b_output_scale = fabsf(b_output_scale); assert(abs_a_output_scale >= 0x1.0p-10f); @@ -1894,49 +1917,53 @@ size_t xnn_init_qs8_add_minmax_scalar_params( const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier; const int32_t rounding = INT32_C(1) << (shift - 1); - params->scalar.bias = rounding - a_multiplier * (int32_t) a_zero_point - b_multiplier * (int32_t) b_zero_point; - params->scalar.a_zero_point = a_zero_point; - params->scalar.b_zero_point = b_zero_point; - params->scalar.a_multiplier = a_multiplier; - params->scalar.b_multiplier = b_multiplier; - params->scalar.shift = shift; - params->scalar.output_zero_point = (int32_t) output_zero_point; - params->scalar.output_min = (int32_t) output_min; - params->scalar.output_max = (int32_t) output_max; - return sizeof(params->scalar); + uparams->scalar.bias = rounding - a_multiplier * (int32_t)a_quantization->zero_point - + b_multiplier * (int32_t)b_quantization->zero_point; + uparams->scalar.a_zero_point = a_quantization->zero_point; + uparams->scalar.b_zero_point = b_quantization->zero_point; + uparams->scalar.a_multiplier = a_multiplier; + uparams->scalar.b_multiplier = b_multiplier; + uparams->scalar.shift = shift; + uparams->scalar.output_zero_point = (int32_t)output_quantization->zero_point; + uparams->scalar.output_min = INT8_MIN; + uparams->scalar.output_max = INT8_MAX; + return sizeof(uparams->scalar); } size_t xnn_init_qu8_mul_minmax_scalar_params( - union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)], - uint8_t a_zero_point, - uint8_t b_zero_point, - uint8_t output_zero_point, - float product_output_scale, - uint8_t output_min, - uint8_t output_max) -{ + union xnn_qu8_mul_minmax_params uparams[XNN_MIN_ELEMENTS(1)], + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, 
+ const struct xnn_quantization_params* output_quantization) { + assert(a_quantization); + assert(b_quantization); + assert(output_quantization); + const float product_scale = a_quantization->scale * b_quantization->scale; + const float product_output_scale = product_scale / output_quantization->scale; assert(product_output_scale >= 0x1.0p-16f); assert(product_output_scale < 0x1.0p+8f); - params->scalar.a_zero_point = a_zero_point; - params->scalar.b_zero_point = b_zero_point; - params->scalar.scale = product_output_scale; - params->scalar.output_zero_point = output_zero_point; - params->scalar.output_min = output_min; - params->scalar.output_max = output_max; - return sizeof(params->scalar); + uparams->scalar.a_zero_point = a_quantization->zero_point; + uparams->scalar.b_zero_point = b_quantization->zero_point; + uparams->scalar.scale = product_output_scale; + uparams->scalar.output_zero_point = output_quantization->zero_point; + uparams->scalar.output_min = 0; + uparams->scalar.output_max = UINT8_MAX; + return sizeof(uparams->scalar); } #if XNN_ARCH_ARM || XNN_ARCH_ARM64 size_t xnn_init_qu8_mul_minmax_rndnu_neon_params( - union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)], - uint8_t a_zero_point, - uint8_t b_zero_point, - uint8_t output_zero_point, - float product_output_scale, - uint8_t output_min, - uint8_t output_max) -{ + union xnn_qu8_mul_minmax_params uparams[XNN_MIN_ELEMENTS(1)], + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization) { + assert(a_quantization); + assert(b_quantization); + assert(output_quantization); + + const float product_scale = a_quantization->scale * b_quantization->scale; + const float product_output_scale = product_scale / output_quantization->scale; assert(product_output_scale >= 0x1.0p-16f); assert(product_output_scale < 0x1.0p+8f); @@ -1957,49 +1984,51 @@ size_t xnn_init_qu8_mul_minmax_rndnu_neon_params( 
const int32_t post_shift = math_max_s32(shift, 1); const int32_t pre_shift = shift - post_shift; - params->rndnu_neon.a_zero_point = a_zero_point; - params->rndnu_neon.b_zero_point = b_zero_point; - params->rndnu_neon.left_pre_shift = -pre_shift; - params->rndnu_neon.multiplier = multiplier; - params->rndnu_neon.left_post_shift = -post_shift; - params->rndnu_neon.output_zero_point = (int16_t) output_zero_point; - params->rndnu_neon.output_min = output_min; - params->rndnu_neon.output_max = output_max; - return sizeof(params->rndnu_neon); + uparams->rndnu_neon.a_zero_point = a_quantization->zero_point; + uparams->rndnu_neon.b_zero_point = b_quantization->zero_point; + uparams->rndnu_neon.left_pre_shift = -pre_shift; + uparams->rndnu_neon.multiplier = multiplier; + uparams->rndnu_neon.left_post_shift = -post_shift; + uparams->rndnu_neon.output_zero_point = (int16_t)output_quantization->zero_point; + uparams->rndnu_neon.output_min = 0; + uparams->rndnu_neon.output_max = UINT8_MAX; + return sizeof(uparams->rndnu_neon); } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 size_t xnn_init_qs8_mul_minmax_scalar_params( - union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)], - int8_t a_zero_point, - int8_t b_zero_point, - int8_t output_zero_point, - float product_output_scale, - int8_t output_min, - int8_t output_max) -{ + union xnn_qs8_mul_minmax_params uparams[XNN_MIN_ELEMENTS(1)], + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization) { + assert(a_quantization); + assert(b_quantization); + assert(output_quantization); + const float product_scale = a_quantization->scale * b_quantization->scale; + const float product_output_scale = product_scale / output_quantization->scale; assert(product_output_scale >= 0x1.0p-16f); assert(product_output_scale < 0x1.0p+8f); - params->scalar.a_zero_point = a_zero_point; - params->scalar.b_zero_point = b_zero_point; - 
params->scalar.scale = product_output_scale; - params->scalar.output_zero_point = output_zero_point; - params->scalar.output_min = output_min; - params->scalar.output_max = output_max; - return sizeof(params->scalar); + uparams->scalar.a_zero_point = a_quantization->zero_point; + uparams->scalar.b_zero_point = b_quantization->zero_point; + uparams->scalar.scale = product_output_scale; + uparams->scalar.output_zero_point = output_quantization->zero_point; + uparams->scalar.output_min = INT8_MIN; + uparams->scalar.output_max = INT8_MAX; + return sizeof(uparams->scalar); } #if XNN_ARCH_ARM || XNN_ARCH_ARM64 size_t xnn_init_qs8_mul_minmax_rndnu_neon_params( - union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)], - int8_t a_zero_point, - int8_t b_zero_point, - int8_t output_zero_point, - float product_output_scale, - int8_t output_min, - int8_t output_max) -{ + union xnn_qs8_mul_minmax_params uparams[XNN_MIN_ELEMENTS(1)], + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization) { + assert(a_quantization); + assert(b_quantization); + assert(output_quantization); + const float product_scale = a_quantization->scale * b_quantization->scale; + const float product_output_scale = product_scale / output_quantization->scale; assert(product_output_scale >= 0x1.0p-16f); assert(product_output_scale < 0x1.0p+8f); @@ -2020,15 +2049,15 @@ size_t xnn_init_qs8_mul_minmax_rndnu_neon_params( const int32_t post_shift = math_max_s32(shift, 1); const int32_t pre_shift = shift - post_shift; - params->rndnu_neon.a_zero_point = a_zero_point; - params->rndnu_neon.b_zero_point = b_zero_point; - params->rndnu_neon.left_pre_shift = -pre_shift; - params->rndnu_neon.multiplier = multiplier; - params->rndnu_neon.left_post_shift = -post_shift; - params->rndnu_neon.output_zero_point = (int16_t) output_zero_point; - params->rndnu_neon.output_min = output_min; - 
params->rndnu_neon.output_max = output_max; - return sizeof(params->rndnu_neon); + uparams->rndnu_neon.a_zero_point = a_quantization->zero_point; + uparams->rndnu_neon.b_zero_point = b_quantization->zero_point; + uparams->rndnu_neon.left_pre_shift = -pre_shift; + uparams->rndnu_neon.multiplier = multiplier; + uparams->rndnu_neon.left_post_shift = -post_shift; + uparams->rndnu_neon.output_zero_point = (int16_t)output_quantization->zero_point; + uparams->rndnu_neon.output_min = INT8_MIN; + uparams->rndnu_neon.output_max = INT8_MAX; + return sizeof(uparams->rndnu_neon); } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 diff --git a/src/operator-utils.c b/src/operator-utils.c index 1162a05e4de..55452605d2f 100644 --- a/src/operator-utils.c +++ b/src/operator-utils.c @@ -15,6 +15,7 @@ #include "xnnpack/math.h" #include "xnnpack/operator-utils.h" #include "xnnpack/operator.h" // For xnn_operator definition. +#include "xnnpack/operator-type.h" void* xnn_get_pointer_to_write_weights( xnn_operator_t op, @@ -146,3 +147,27 @@ uint32_t xnn_get_heuristic_mr_igemm( } return best_mr; } + +enum xnn_operator_type xnn_binary_operator_to_operator_type(enum xnn_binary_operator op) +{ + switch (op) { + case xnn_binary_add: + return xnn_operator_type_add; + case xnn_binary_subtract: + return xnn_operator_type_subtract; + case xnn_binary_multiply: + return xnn_operator_type_multiply; + case xnn_binary_divide: + return xnn_operator_type_divide; + case xnn_binary_maximum: + return xnn_operator_type_maximum; + case xnn_binary_minimum: + return xnn_operator_type_minimum; + case xnn_binary_copysign: + return xnn_operator_type_copysign; + case xnn_binary_squared_difference: + return xnn_operator_type_squared_difference; + default: + return xnn_operator_type_invalid; + } +} \ No newline at end of file diff --git a/src/operators/binary-elementwise-nd.c b/src/operators/binary-elementwise-nd.c index fc460bda0f0..f3ef2ef6913 100644 --- a/src/operators/binary-elementwise-nd.c +++ 
b/src/operators/binary-elementwise-nd.c @@ -4,14 +4,12 @@ // LICENSE file in the root directory of this source tree. #include -#include #include #include #include #include #include -#include #include "xnnpack.h" #include "xnnpack/allocator.h" #include "xnnpack/common.h" @@ -26,986 +24,296 @@ #include "xnnpack/params.h" #include "pthreadpool.h" -static void init_binary_elementwise_nd( - const void* params, - const void* params2, - size_t params_size, - uint32_t flags, - enum xnn_operator_type operator_type, - const struct xnn_binary_elementwise_subconfig* binary_elementwise_subconfig, - xnn_operator_t binary_elementwise_op) -{ - if (params_size != 0) { - memcpy(&binary_elementwise_op->params, params, params_size); - memcpy(&binary_elementwise_op->params2, params2, params_size); +static uint32_t xnn_datatype_get_log2_element_size(enum xnn_datatype datatype) { + switch (datatype) { + case xnn_datatype_qcint4: + case xnn_datatype_qbint4: + case xnn_datatype_qdint8: + case xnn_datatype_qint8: + case xnn_datatype_quint8: + case xnn_datatype_qcint8: + case xnn_datatype_qpint8: + return 0; + case xnn_datatype_fp16: + return 1; + case xnn_datatype_qint32: + case xnn_datatype_qcint32: + case xnn_datatype_int32: + case xnn_datatype_fp32: + return 2; + case xnn_datatype_invalid: + default: + XNN_UNREACHABLE; + } +} + +static const char* xnn_binary_operator_to_string( + enum xnn_binary_operator type) { + return xnn_operator_type_to_string( + xnn_binary_operator_to_operator_type(type)); +} + +static const struct xnn_binary_elementwise_config* init_config( + enum xnn_binary_operator type, enum xnn_datatype datatype, int* sign_b) { + switch (type) { + case xnn_binary_add: + switch (datatype) { + case xnn_datatype_fp32: + return xnn_init_f32_vadd_config(); + case xnn_datatype_fp16: + return xnn_init_f16_vadd_config(); + case xnn_datatype_qint8: + return xnn_init_qs8_vadd_config(); + case xnn_datatype_quint8: + return xnn_init_qu8_vadd_config(); + default: + return NULL; + } + case 
xnn_binary_subtract: + *sign_b = -1; + switch (datatype) { + case xnn_datatype_fp32: + return xnn_init_f32_vsub_config(); + case xnn_datatype_fp16: + return xnn_init_f16_vsub_config(); + case xnn_datatype_qint8: + return xnn_init_qs8_vadd_config(); + case xnn_datatype_quint8: + return xnn_init_qu8_vadd_config(); + default: + return NULL; + } + case xnn_binary_multiply: + switch (datatype) { + case xnn_datatype_fp32: + return xnn_init_f32_vmul_config(); + case xnn_datatype_fp16: + return xnn_init_f16_vmul_config(); + case xnn_datatype_qint8: + return xnn_init_qs8_vmul_config(); + case xnn_datatype_quint8: + return xnn_init_qu8_vmul_config(); + case xnn_datatype_int32: + return xnn_init_s32_vmul_config(); + default: + return NULL; + } + case xnn_binary_divide: + switch (datatype) { + case xnn_datatype_fp32: + return xnn_init_f32_vdiv_config(); + case xnn_datatype_fp16: + return xnn_init_f16_vdiv_config(); + default: + return NULL; + } + case xnn_binary_maximum: + switch (datatype) { + case xnn_datatype_fp32: + return xnn_init_f32_vmax_config(); + case xnn_datatype_fp16: + return xnn_init_f16_vmax_config(); + default: + return NULL; + } + case xnn_binary_minimum: + switch (datatype) { + case xnn_datatype_fp32: + return xnn_init_f32_vmin_config(); + case xnn_datatype_fp16: + return xnn_init_f16_vmin_config(); + default: + return NULL; + } + case xnn_binary_copysign: + switch (datatype) { + case xnn_datatype_fp32: + return xnn_init_f32_vcopysign_config(); + default: + return NULL; + } + case xnn_binary_squared_difference: + switch (datatype) { + case xnn_datatype_fp32: + return xnn_init_f32_vsqrdiff_config(); + case xnn_datatype_fp16: + return xnn_init_f16_vsqrdiff_config(); + default: + return NULL; + } + default: + return NULL; } - - binary_elementwise_op->binary_elementwise_subconfig = binary_elementwise_subconfig; - - binary_elementwise_op->type = operator_type; - binary_elementwise_op->flags = flags; - - binary_elementwise_op->state = xnn_run_state_invalid; } 
-static enum xnn_status create_binary_elementwise_nd( - uint32_t flags, - const void* params, - const void* params2, - size_t params_size, - enum xnn_operator_type operator_type, - const struct xnn_binary_elementwise_subconfig* binary_elementwise_subconfig, - xnn_operator_t* binary_elementwise_op_out) -{ - if (binary_elementwise_subconfig == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(operator_type)); - return xnn_status_unsupported_hardware; - } - - if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { - xnn_log_error("failed to create %s operator: XNNPACK is not initialized", - xnn_operator_type_to_string(operator_type)); - return xnn_status_uninitialized; - } - - xnn_operator_t binary_elementwise_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator)); - if (binary_elementwise_op == NULL) { - xnn_log_error( - "failed to allocate %zu bytes for %s operator descriptor", - sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type)); - return xnn_status_out_of_memory; - } - - init_binary_elementwise_nd( - params, - params2, - params_size, - flags, - operator_type, - binary_elementwise_subconfig, - binary_elementwise_op); - - *binary_elementwise_op_out = binary_elementwise_op; - return xnn_status_success; +static bool can_use_subconfig( + const struct xnn_binary_elementwise_subconfig* subconfig) { + return subconfig->op_ukernel != NULL; } -static enum xnn_status create_binary_elementwise_nd_f16( - float output_min, - float output_max, - uint32_t flags, - enum xnn_operator_type operator_type, +static const struct xnn_binary_elementwise_subconfig* init_subconfig( const struct xnn_binary_elementwise_config* config, - xnn_operator_t* binary_elementwise_op_out) -{ - if (isnan(output_min)) { - xnn_log_error( - "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN", - xnn_operator_type_to_string(operator_type)); - return 
xnn_status_invalid_parameter; - } - - if (isnan(output_max)) { - xnn_log_error( - "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN", - xnn_operator_type_to_string(operator_type)); - return xnn_status_invalid_parameter; - } - - if (fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_min)) >= fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_max))) { - xnn_log_error( - "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound", - xnn_operator_type_to_string(operator_type), - fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_min)), - fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_max))); - return xnn_status_invalid_parameter; + enum xnn_binary_operator type) { + // We can use either a minmax or a linear config. + if (can_use_subconfig(&config->minmax)) { + return &config->minmax; } - - if (config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(operator_type)); - return xnn_status_unsupported_hardware; + if (can_use_subconfig(&config->linear)) { + return &config->linear; } - - union xnn_f16_minmax_params params; - assert(config->init.f16_minmax != NULL); - config->init.f16_minmax(¶ms, - xnn_float16_from_float(output_min), xnn_float16_from_float(output_max)); - - return create_binary_elementwise_nd( - flags, - ¶ms, - ¶ms, - sizeof(params), - operator_type, - &config->minmax, - binary_elementwise_op_out); + xnn_log_error("failed to create %s operator", + xnn_binary_operator_to_string(type)); + return NULL; } -static enum xnn_status create_binary_elementwise_nd_f32( - float output_min, - float output_max, - uint32_t flags, - enum xnn_operator_type operator_type, - const struct xnn_binary_elementwise_config* config, - xnn_operator_t* binary_elementwise_op_out) -{ +static enum xnn_status init_binary_elementwise_nd( + xnn_operator_t op, enum xnn_binary_operator type, + enum xnn_datatype 
datatype, + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization, uint32_t flags) { if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { xnn_log_error("failed to create %s operator: XNNPACK is not initialized", - xnn_operator_type_to_string(operator_type)); + xnn_binary_operator_to_string(type)); return xnn_status_uninitialized; } - if (isnan(output_min)) { - xnn_log_error( - "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN", - xnn_operator_type_to_string(operator_type)); - return xnn_status_invalid_parameter; - } - - if (isnan(output_max)) { - xnn_log_error( - "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN", - xnn_operator_type_to_string(operator_type)); - return xnn_status_invalid_parameter; - } - - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be less than or equal to upper bound", - xnn_operator_type_to_string(operator_type), output_min, output_max); - return xnn_status_invalid_parameter; - } - + int sign_b = 1; + const struct xnn_binary_elementwise_config* config = + init_config(type, datatype, &sign_b); if (config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(operator_type)); - return xnn_status_unsupported_hardware; - } - - const bool linear_activation = (output_max == INFINITY) && (output_min == -output_max); - const struct xnn_binary_elementwise_subconfig* binary_elementwise_subconfig = &config->minmax; - if (linear_activation && config->linear.op_ukernel != NULL) { - binary_elementwise_subconfig = &config->linear; - } - - union xnn_f32_minmax_params params; - if (config->init.f32_minmax != NULL) { - config->init.f32_minmax(¶ms, output_min, output_max); - } - - return create_binary_elementwise_nd( - 
flags, - ¶ms, - ¶ms, - sizeof(params), - operator_type, - binary_elementwise_subconfig, - binary_elementwise_op_out); -} - -enum xnn_status xnn_create_add_nd_qs8( - int8_t input1_zero_point, - float input1_scale, - int8_t input2_zero_point, - float input2_scale, - int8_t output_zero_point, - float output_scale, - int8_t output_min, - int8_t output_max, - uint32_t flags, - xnn_operator_t* add_op_out) -{ - if (input1_scale <= 0.0f || !isnormal(input1_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 1 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qs8), input1_scale); - return xnn_status_invalid_parameter; - } - - if (input2_scale <= 0.0f || !isnormal(input2_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 2 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qs8), input2_scale); - return xnn_status_invalid_parameter; - } - - if (output_scale <= 0.0f || !isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qs8), output_scale); - return xnn_status_invalid_parameter; - } - - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: lower bound must be less than or equal to upper bound", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qs8), output_min, output_max); - return xnn_status_invalid_parameter; - } - - const float input1_output_scale = input1_scale / output_scale; - if (input1_output_scale < 0x1.0p-10f || input1_output_scale >= 0x1.0p+8f) { - xnn_log_error( - "failed to create %s operator with %.7g input1-to-output scale ratio: scale ratio must be in [2**-10, 2**8) range", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qs8), input1_output_scale); - return xnn_status_unsupported_parameter; - } - - const 
float input2_output_scale = input2_scale / output_scale; - if (input2_output_scale < 0x1.0p-10f || input2_output_scale >= 0x1.0p+8f) { - xnn_log_error( - "failed to create %s operator with %.7g input2-to-output scale ratio: scale ratio must be in [2**-10, 2**8) range", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qs8), input2_output_scale); - return xnn_status_unsupported_parameter; - } - - const struct xnn_binary_elementwise_config* qs8_vadd_config = xnn_init_qs8_vadd_config(); - if (qs8_vadd_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qs8)); - return xnn_status_unsupported_hardware; - } - - struct xnn_qs8_add_minmax_params params; - struct xnn_qs8_add_minmax_params params2; - assert(qs8_vadd_config->init.qs8_add != NULL); - qs8_vadd_config->init.qs8_add( - ¶ms, input1_zero_point, input2_zero_point, output_zero_point, - input1_output_scale, input2_output_scale, output_min, output_max); - qs8_vadd_config->init.qs8_add( - ¶ms2, input2_zero_point, input1_zero_point, output_zero_point, - input2_output_scale, input1_output_scale, output_min, output_max); - - return create_binary_elementwise_nd( - flags, - ¶ms, - ¶ms2, - sizeof(params), - xnn_operator_type_add_nd_qs8, - &qs8_vadd_config->minmax, - add_op_out); -} - -enum xnn_status xnn_create_add_nd_qu8( - uint8_t input1_zero_point, - float input1_scale, - uint8_t input2_zero_point, - float input2_scale, - uint8_t output_zero_point, - float output_scale, - uint8_t output_min, - uint8_t output_max, - uint32_t flags, - xnn_operator_t* add_op_out) -{ - if (input1_scale <= 0.0f || !isnormal(input1_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 1 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qu8), input1_scale); - return xnn_status_invalid_parameter; - } - - if (input2_scale <= 0.0f || !isnormal(input2_scale)) { - 
xnn_log_error( - "failed to create %s operator with %.7g input 2 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qu8), input2_scale); - return xnn_status_invalid_parameter; - } - - if (output_scale <= 0.0f || !isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qu8), output_scale); - return xnn_status_invalid_parameter; - } - - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: lower bound must be less than or equal to upper bound", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qu8), output_min, output_max); - return xnn_status_invalid_parameter; - } - - const float input1_output_scale = input1_scale / output_scale; - if (input1_output_scale < 0x1.0p-10f || input1_output_scale >= 0x1.0p+8f) { - xnn_log_error( - "failed to create %s operator with %.7g input1-to-output scale ratio: scale ratio must be in [2**-10, 2**8) range", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qu8), input1_output_scale); - return xnn_status_unsupported_parameter; - } - - const float input2_output_scale = input2_scale / output_scale; - if (input2_output_scale < 0x1.0p-10f || input2_output_scale >= 0x1.0p+8f) { - xnn_log_error( - "failed to create %s operator with %.7g input2-to-output scale ratio: scale ratio must be in [2**-10, 2**8) range", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qu8), input2_output_scale); - return xnn_status_unsupported_parameter; - } - - const struct xnn_binary_elementwise_config* qu8_vadd_config = xnn_init_qu8_vadd_config(); - if (qu8_vadd_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qu8)); - return xnn_status_unsupported_hardware; - } - - struct 
xnn_qu8_add_minmax_params params; - struct xnn_qu8_add_minmax_params params2; - assert(qu8_vadd_config->init.qu8_add != NULL); - qu8_vadd_config->init.qu8_add( - ¶ms, input1_zero_point, input2_zero_point, output_zero_point, - input1_output_scale, input2_output_scale, output_min, output_max); - qu8_vadd_config->init.qu8_add( - ¶ms2, input2_zero_point, input1_zero_point, output_zero_point, - input2_output_scale, input1_output_scale, output_min, output_max); - - return create_binary_elementwise_nd( - flags, - ¶ms, - ¶ms2, - sizeof(params), - xnn_operator_type_add_nd_qu8, - &qu8_vadd_config->minmax, - add_op_out); -} - -enum xnn_status xnn_create_add_nd_f16( - float output_min, - float output_max, - uint32_t flags, - xnn_operator_t* add_op_out) -{ - return create_binary_elementwise_nd_f16( - output_min, - output_max, - flags, - xnn_operator_type_add_nd_f16, - xnn_init_f16_vadd_config(), - add_op_out); -} - -enum xnn_status xnn_create_add_nd_f32( - float output_min, - float output_max, - uint32_t flags, - xnn_operator_t* add_op_out) -{ - return create_binary_elementwise_nd_f32( - output_min, - output_max, - flags, - xnn_operator_type_add_nd_f32, - xnn_init_f32_vadd_config(), - add_op_out); -} - -enum xnn_status xnn_create_divide_nd_f16( - float output_min, - float output_max, - uint32_t flags, - xnn_operator_t* divide_op_out) -{ - return create_binary_elementwise_nd_f16( - output_min, - output_max, - flags, - xnn_operator_type_divide_nd_f16, - xnn_init_f16_vdiv_config(), - divide_op_out); -} - -enum xnn_status xnn_create_copysign_nd_f32( - uint32_t flags, - xnn_operator_t* copysign_op_out) -{ - return create_binary_elementwise_nd_f32( - -INFINITY, - INFINITY, - flags, - xnn_operator_type_copysign_nd_f32, - xnn_init_f32_vcopysign_config(), - copysign_op_out); -} - -enum xnn_status xnn_create_divide_nd_f32( - float output_min, - float output_max, - uint32_t flags, - xnn_operator_t* divide_op_out) -{ - return create_binary_elementwise_nd_f32( - output_min, - output_max, - 
flags, - xnn_operator_type_divide_nd_f32, - xnn_init_f32_vdiv_config(), - divide_op_out); -} - -enum xnn_status xnn_create_maximum_nd_f16( - uint32_t flags, - xnn_operator_t* maximum_op_out) -{ - const struct xnn_binary_elementwise_config* f16_vmax_config = xnn_init_f16_vmax_config(); - if (f16_vmax_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_maximum_nd_f16)); - return xnn_status_unsupported_hardware; - } - return create_binary_elementwise_nd( - flags, - NULL, - NULL, - 0, - xnn_operator_type_maximum_nd_f16, - &f16_vmax_config->minmax, - maximum_op_out); -} - -enum xnn_status xnn_create_maximum_nd_f32( - uint32_t flags, - xnn_operator_t* maximum_op_out) -{ - const struct xnn_binary_elementwise_config* f32_vmax_config = xnn_init_f32_vmax_config(); - if (f32_vmax_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_maximum_nd_f32)); - return xnn_status_unsupported_hardware; - } - - struct xnn_f32_default_params params; - if (f32_vmax_config->init.f32_default != NULL) { - f32_vmax_config->init.f32_default(¶ms); - } - return create_binary_elementwise_nd( - flags, - ¶ms, - ¶ms, - sizeof(params), - xnn_operator_type_maximum_nd_f32, - &f32_vmax_config->minmax, - maximum_op_out); -} - -enum xnn_status xnn_create_minimum_nd_f16( - uint32_t flags, - xnn_operator_t* minimum_op_out) -{ - const struct xnn_binary_elementwise_config* f16_vmin_config = xnn_init_f16_vmin_config(); - if (f16_vmin_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_minimum_nd_f16)); - } - return create_binary_elementwise_nd( - flags, - NULL, - NULL, - 0, - xnn_operator_type_minimum_nd_f16, - &f16_vmin_config->minmax, - minimum_op_out); -} - -enum xnn_status xnn_create_minimum_nd_f32( - uint32_t flags, - 
xnn_operator_t* minimum_op_out) -{ - const struct xnn_binary_elementwise_config* f32_vmin_config = xnn_init_f32_vmin_config(); - if (f32_vmin_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_minimum_nd_f32)); - return xnn_status_unsupported_hardware; - } - - struct xnn_f32_default_params params; - if (f32_vmin_config->init.f32_default != NULL) { - f32_vmin_config->init.f32_default(¶ms); - } - return create_binary_elementwise_nd( - flags, - ¶ms, - ¶ms, - sizeof(params), - xnn_operator_type_minimum_nd_f32, - &f32_vmin_config->minmax, - minimum_op_out); -} - -enum xnn_status xnn_create_multiply_nd_f16( - float output_min, - float output_max, - uint32_t flags, - xnn_operator_t* multiply_op_out) -{ - return create_binary_elementwise_nd_f16( - output_min, - output_max, - flags, - xnn_operator_type_multiply_nd_f16, - xnn_init_f16_vmul_config(), - multiply_op_out); -} - -enum xnn_status xnn_create_multiply_nd_f32( - float output_min, - float output_max, - uint32_t flags, - xnn_operator_t* multiply_op_out) -{ - return create_binary_elementwise_nd_f32( - output_min, - output_max, - flags, - xnn_operator_type_multiply_nd_f32, - xnn_init_f32_vmul_config(), - multiply_op_out); -} - -enum xnn_status xnn_create_multiply_nd_qs8( - int8_t input1_zero_point, - float input1_scale, - int8_t input2_zero_point, - float input2_scale, - int8_t output_zero_point, - float output_scale, - int8_t output_min, - int8_t output_max, - uint32_t flags, - xnn_operator_t* multiply_op_out) -{ - if (input1_scale <= 0.0f || !isnormal(input1_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 1 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qs8), input1_scale); - return xnn_status_invalid_parameter; - } - - if (input2_scale <= 0.0f || !isnormal(input2_scale)) { xnn_log_error( - "failed to create %s operator with %.7g input 2 
scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qs8), input2_scale); - return xnn_status_invalid_parameter; - } - - if (output_scale <= 0.0f || !isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qs8), output_scale); - return xnn_status_invalid_parameter; - } - - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: lower bound must be less than or equal to upper bound", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qs8), output_min, output_max); - return xnn_status_invalid_parameter; - } - - const float product_scale = input1_scale * input2_scale; - const float product_output_scale = product_scale / output_scale; - if (product_output_scale < 0x1.0p-16f || product_output_scale >= 0x1.0p+8f) { - xnn_log_error( - "failed to create %s operator with %.7g product-to-output scale ratio: scale ratio must be in [2**-16, 2**8) range", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qs8), product_output_scale); - return xnn_status_unsupported_parameter; - } - - const struct xnn_binary_elementwise_config* qs8_vmul_config = xnn_init_qs8_vmul_config(); - if (qs8_vmul_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qs8)); + "failed to create %s operator: unsupported hardware configuration", + xnn_binary_operator_to_string(type)); return xnn_status_unsupported_hardware; } - union xnn_qs8_mul_minmax_params params; - union xnn_qs8_mul_minmax_params params2; - assert(qs8_vmul_config->init.qs8_mul != NULL); - qs8_vmul_config->init.qs8_mul( - ¶ms, input1_zero_point, input2_zero_point, output_zero_point, - product_output_scale, output_min, output_max); - qs8_vmul_config->init.qs8_mul( 
- ¶ms2, input2_zero_point, input1_zero_point, output_zero_point, - product_output_scale, output_min, output_max); - - return create_binary_elementwise_nd( - flags, - ¶ms, - ¶ms2, - sizeof(params), - xnn_operator_type_multiply_nd_qs8, - &qs8_vmul_config->minmax, - multiply_op_out); -} - -enum xnn_status xnn_create_multiply_nd_qu8( - uint8_t input1_zero_point, - float input1_scale, - uint8_t input2_zero_point, - float input2_scale, - uint8_t output_zero_point, - float output_scale, - uint8_t output_min, - uint8_t output_max, - uint32_t flags, - xnn_operator_t* multiply_op_out) -{ - if (input1_scale <= 0.0f || !isnormal(input1_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 1 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qu8), input1_scale); - return xnn_status_invalid_parameter; - } - - if (input2_scale <= 0.0f || !isnormal(input2_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 2 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qu8), input2_scale); - return xnn_status_invalid_parameter; - } - - if (output_scale <= 0.0f || !isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qu8), output_scale); - return xnn_status_invalid_parameter; - } - - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: lower bound must be less than or equal to upper bound", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qu8), output_min, output_max); - return xnn_status_invalid_parameter; - } - - const float product_scale = input1_scale * input2_scale; - const float product_output_scale = product_scale / output_scale; - if (product_output_scale < 0x1.0p-16f || product_output_scale >= 0x1.0p+8f) { - 
xnn_log_error( - "failed to create %s operator with %.7g product-to-output scale ratio: scale ratio must be in [2**-16, 2**8) range", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qu8), product_output_scale); + const struct xnn_binary_elementwise_subconfig* subconfig = + init_subconfig(config, type); + if (subconfig == NULL || !can_use_subconfig(subconfig)) { + xnn_log_error("failed to create %s operator", + xnn_binary_operator_to_string(type)); return xnn_status_unsupported_parameter; } - const struct xnn_binary_elementwise_config* qu8_vmul_config = xnn_init_qu8_vmul_config(); - if (qu8_vmul_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qu8)); - return xnn_status_unsupported_hardware; - } - - union xnn_qu8_mul_minmax_params params; - union xnn_qu8_mul_minmax_params params2; - assert(qu8_vmul_config->init.qu8_mul != NULL); - qu8_vmul_config->init.qu8_mul( - ¶ms, input1_zero_point, input2_zero_point, output_zero_point, - product_output_scale, output_min, output_max); - qu8_vmul_config->init.qu8_mul( - ¶ms2, input2_zero_point, input1_zero_point, output_zero_point, - product_output_scale, output_min, output_max); - return create_binary_elementwise_nd( - flags, - ¶ms, - ¶ms2, - sizeof(params), - xnn_operator_type_multiply_nd_qu8, - &qu8_vmul_config->minmax, - multiply_op_out); -} - -enum xnn_status xnn_create_squared_difference_nd_f16( - uint32_t flags, - xnn_operator_t* squared_difference_op_out) -{ - const struct xnn_binary_elementwise_config* f16_vqsrdiff_config = xnn_init_f16_vsqrdiff_config(); - if (f16_vqsrdiff_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_squared_difference_nd_f16)); - return xnn_status_unsupported_hardware; - } - return create_binary_elementwise_nd( - flags, - NULL, - NULL, - 0, - 
xnn_operator_type_squared_difference_nd_f16, - &f16_vqsrdiff_config->minmax, - squared_difference_op_out); -} - -enum xnn_status xnn_create_squared_difference_nd_f32( - uint32_t flags, - xnn_operator_t* squared_difference_op_out) -{ - const struct xnn_binary_elementwise_config* f32_vsqrdiff_config = xnn_init_f32_vsqrdiff_config(); - if (f32_vsqrdiff_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_squared_difference_nd_f32)); - return xnn_status_unsupported_hardware; - } - - struct xnn_f32_default_params params; - if (f32_vsqrdiff_config->init.f32_default != NULL) { - f32_vsqrdiff_config->init.f32_default(&params); - } - return create_binary_elementwise_nd( - flags, - &params, - &params, - sizeof(params), - xnn_operator_type_squared_difference_nd_f32, - &f32_vsqrdiff_config->minmax, - squared_difference_op_out); -} - - -enum xnn_status xnn_create_multiply_nd_s32( - uint32_t flags, - xnn_operator_t* multiply_op_out) -{ - const struct xnn_binary_elementwise_config* s32_multiply_config = xnn_init_s32_vmul_config(); - if (s32_multiply_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_s32)); - return xnn_status_unsupported_hardware; - } - - struct xnn_s32_default_params params; - - return create_binary_elementwise_nd( - flags, - &params, - &params, - sizeof(params), - xnn_operator_type_multiply_nd_s32, - &s32_multiply_config->linear, - multiply_op_out); -} - - -enum xnn_status xnn_create_subtract_nd_f16( - float output_min, - float output_max, - uint32_t flags, - xnn_operator_t* subtract_op_out) -{ - return create_binary_elementwise_nd_f16( - output_min, - output_max, - flags, - xnn_operator_type_subtract_nd_f16, - xnn_init_f16_vsub_config(), - subtract_op_out); -} - -enum xnn_status xnn_create_subtract_nd_f32( - float output_min, - float output_max, - uint32_t flags, - 
xnn_operator_t* subtract_op_out) -{ - return create_binary_elementwise_nd_f32( - output_min, - output_max, - flags, - xnn_operator_type_subtract_nd_f32, - xnn_init_f32_vsub_config(), - subtract_op_out); -} - -enum xnn_status xnn_create_subtract_nd_qs8( - int8_t input1_zero_point, - float input1_scale, - int8_t input2_zero_point, - float input2_scale, - int8_t output_zero_point, - float output_scale, - int8_t output_min, - int8_t output_max, - uint32_t flags, - xnn_operator_t* subtract_op_out) -{ - if (input1_scale <= 0.0f || !isnormal(input1_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 1 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qs8), input1_scale); - return xnn_status_invalid_parameter; - } + union xnn_binary_uparams uparams; + union xnn_binary_uparams uparams2; + if (config->init != NULL) { + if (datatype == xnn_datatype_qint8 || datatype == xnn_datatype_quint8) { + if (!a_quantization || !b_quantization || !output_quantization) { + xnn_log_error( + "failed to create %s operator with NULL quantization params", + xnn_binary_operator_to_string(type)); + return xnn_status_invalid_parameter; + } + const float a_scale = a_quantization ? a_quantization->scale : 1.0f; + const float b_scale = b_quantization ? b_quantization->scale : 1.0f; + const float output_scale = + output_quantization ? 
output_quantization->scale : 1.0f; + if (a_scale <= 0.0f || !isnormal(a_scale)) { + xnn_log_error( + "failed to create %s operator with %.7g input 1 scale: scale must be " + "finite and positive", + xnn_binary_operator_to_string(type), a_scale); + return xnn_status_invalid_parameter; + } + if (b_scale <= 0.0f || !isnormal(b_scale)) { + xnn_log_error( + "failed to create %s operator with %.7g input 2 scale: scale must be " + "finite and positive", + xnn_binary_operator_to_string(type), b_scale); + return xnn_status_invalid_parameter; + } + if (output_scale <= 0.0f || !isnormal(output_scale)) { + xnn_log_error( + "failed to create %s operator with %.7g output scale: scale must be " + "finite and positive", + xnn_binary_operator_to_string(type), output_scale); + return xnn_status_invalid_parameter; + } - if (input2_scale <= 0.0f || !isnormal(input2_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 2 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qs8), input2_scale); - return xnn_status_invalid_parameter; - } + struct xnn_quantization_params b_quantization_with_sign = *b_quantization; + b_quantization_with_sign.scale *= sign_b; - if (output_scale <= 0.0f || !isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qs8), output_scale); - return xnn_status_invalid_parameter; + config->init(&uparams, a_quantization, &b_quantization_with_sign, + output_quantization); + config->init(&uparams2, &b_quantization_with_sign, a_quantization, + output_quantization); + } else { + config->init(&uparams, NULL, NULL, NULL); + config->init(&uparams2, NULL, NULL, NULL); + } } - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: lower bound must be less than or equal to upper bound", - 
xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qs8), output_min, output_max); - return xnn_status_invalid_parameter; + if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { + xnn_log_error("failed to create %s operator: XNNPACK is not initialized", + xnn_binary_operator_to_string(type)); + return xnn_status_uninitialized; } - const float input1_output_scale = input1_scale / output_scale; - if (input1_output_scale < 0x1.0p-10f || input1_output_scale >= 0x1.0p+8f) { - xnn_log_error( - "failed to create %s operator with %.7g input1-to-output scale ratio: scale ratio must be in [2**-10, 2**8) range", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qs8), input1_output_scale); - return xnn_status_unsupported_parameter; - } + memcpy(&op->params, &uparams, sizeof(uparams)); + memcpy(&op->params2, &uparams2, sizeof(uparams2)); - const float input2_output_scale = input2_scale / output_scale; - if (input2_output_scale < 0x1.0p-10f || input2_output_scale >= 0x1.0p+8f) { - xnn_log_error( - "failed to create %s operator with %.7g input2-to-output scale ratio: scale ratio must be in [2**-10, 2**8) range", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qs8), input2_output_scale); - return xnn_status_unsupported_parameter; - } + op->binary_elementwise_subconfig = subconfig; + op->log2_elementwise_element_size = + xnn_datatype_get_log2_element_size(datatype); - const struct xnn_binary_elementwise_config* qs8_vadd_config = xnn_init_qs8_vadd_config(); - if (qs8_vadd_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qs8)); - return xnn_status_unsupported_hardware; - } + op->type = xnn_binary_operator_to_operator_type(type); + op->flags = flags; - struct xnn_qs8_add_minmax_params params; - struct xnn_qs8_add_minmax_params params2; - assert(qs8_vadd_config->init.qs8_add != NULL); - qs8_vadd_config->init.qs8_add( - &params, 
input1_zero_point, input2_zero_point, output_zero_point, - input1_output_scale, -input2_output_scale, output_min, output_max); - qs8_vadd_config->init.qs8_add( - &params2, input2_zero_point, input1_zero_point, output_zero_point, - -input2_output_scale, input1_output_scale, output_min, output_max); + op->state = xnn_run_state_invalid; - return create_binary_elementwise_nd( - flags, - &params, - &params2, - sizeof(params), - xnn_operator_type_subtract_nd_qs8, - &qs8_vadd_config->minmax, - subtract_op_out); + return xnn_status_success; } -enum xnn_status xnn_create_subtract_nd_qu8( - uint8_t input1_zero_point, - float input1_scale, - uint8_t input2_zero_point, - float input2_scale, - uint8_t output_zero_point, - float output_scale, - uint8_t output_min, - uint8_t output_max, - uint32_t flags, - xnn_operator_t* subtract_op_out) -{ - if (input1_scale <= 0.0f || !isnormal(input1_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 1 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qu8), input1_scale); - return xnn_status_invalid_parameter; - } - - if (input2_scale <= 0.0f || !isnormal(input2_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 2 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qu8), input2_scale); - return xnn_status_invalid_parameter; - } - - if (output_scale <= 0.0f || !isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qu8), output_scale); - return xnn_status_invalid_parameter; - } - - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: lower bound must be less than or equal to upper bound", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qu8), output_min, output_max); - return 
xnn_status_invalid_parameter; - } - - const float input1_output_scale = input1_scale / output_scale; - if (input1_output_scale < 0x1.0p-10f || input1_output_scale >= 0x1.0p+8f) { - xnn_log_error( - "failed to create %s operator with %.7g input1-to-output scale ratio: scale ratio must be in [2**-10, 2**8) range", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qu8), input1_output_scale); - return xnn_status_unsupported_parameter; +enum xnn_status xnn_create_binary_elementwise_nd( + enum xnn_binary_operator type, enum xnn_datatype datatype, + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization, uint32_t flags, + xnn_operator_t* binary_op_out) { + if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { + xnn_log_error("failed to create %s operator: XNNPACK is not initialized", + xnn_binary_operator_to_string(type)); + return xnn_status_uninitialized; } - const float input2_output_scale = input2_scale / output_scale; - if (input2_output_scale < 0x1.0p-10f || input2_output_scale >= 0x1.0p+8f) { - xnn_log_error( - "failed to create %s operator with %.7g input2-to-output scale ratio: scale ratio must be in [2**-10, 2**8) range", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qu8), input2_output_scale); - return xnn_status_unsupported_parameter; + xnn_operator_t op = + xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator)); + if (op == NULL) { + xnn_log_error("failed to allocate %zu bytes for %s operator descriptor", + sizeof(struct xnn_operator), + xnn_binary_operator_to_string(type)); + return xnn_status_out_of_memory; } - const struct xnn_binary_elementwise_config* qu8_vadd_config = xnn_init_qu8_vadd_config(); - if (qu8_vadd_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qu8)); - return 
xnn_status_unsupported_hardware; + enum xnn_status status = + init_binary_elementwise_nd(op, type, datatype, a_quantization, + b_quantization, output_quantization, flags); + if (status != xnn_status_success) { + xnn_release_memory(op); + return status; } - struct xnn_qu8_add_minmax_params params; - struct xnn_qu8_add_minmax_params params2; - assert(qu8_vadd_config->init.qu8_add != NULL); - qu8_vadd_config->init.qu8_add( - &params, input1_zero_point, input2_zero_point, output_zero_point, - input1_output_scale, -input2_output_scale, output_min, output_max); - qu8_vadd_config->init.qu8_add( - &params2, input2_zero_point, input1_zero_point, output_zero_point, - -input2_output_scale, input1_output_scale, output_min, output_max); - - return create_binary_elementwise_nd( - flags, - &params, - &params2, - sizeof(params), - xnn_operator_type_subtract_nd_qu8, - &qu8_vadd_config->minmax, - subtract_op_out); + *binary_op_out = op; + return xnn_status_success; } -static enum xnn_status reshape_binary_elementwise_nd( - xnn_operator_t binary_elementwise_op, - enum xnn_operator_type expected_operator_type, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - uint32_t log2_element_size, - const void* params, - size_t params_size, - const void* reversed_params, - size_t reversed_params_size, - pthreadpool_t threadpool) -{ - if (binary_elementwise_op->type != expected_operator_type) { - xnn_log_error("failed to reshape operator: operator type mismatch (expected %s, got %s)", - xnn_operator_type_to_string(expected_operator_type), - xnn_operator_type_to_string(binary_elementwise_op->type)); - return xnn_status_invalid_parameter; - } - binary_elementwise_op->state = xnn_run_state_invalid; +enum xnn_status xnn_reshape_binary_elementwise_nd(xnn_operator_t op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool) { + op->state = xnn_run_state_invalid; if 
(max(num_input1_dims, num_input2_dims) > XNN_MAX_TENSOR_DIMS) { xnn_log_error( - "failed to reshape %s operator with %zu and %zu dimensions in input shapes: " - "the number of input dimensions must not exceed %d", - xnn_operator_type_to_string(binary_elementwise_op->type), num_input1_dims, num_input2_dims, XNN_MAX_TENSOR_DIMS); + "failed to reshape %s operator with %zu and %zu dimensions in input " + "shapes: " + "the number of input dimensions must not exceed %d", + xnn_operator_type_to_string(op->type), num_input1_dims, num_input2_dims, + XNN_MAX_TENSOR_DIMS); return xnn_status_unsupported_parameter; } @@ -1060,10 +368,11 @@ static enum xnn_status reshape_binary_elementwise_nd( compressed_output_shape[num_compressed_dims - 1] *= input1_dim; } else { xnn_log_error( - "failed to reshape %s operator: " - "shape dimension #%zu of input1 (%zu) does not match shape dimension #%zu of input2 (%zu)", - xnn_operator_type_to_string(binary_elementwise_op->type), - num_input1_dims - i, input1_dim, num_input2_dims - i, input2_dim); + "failed to reshape %s operator: " + "shape dimension #%zu of input1 (%zu) does not match shape dimension " + "#%zu of input2 (%zu)", + xnn_operator_type_to_string(op->type), num_input1_dims - i, + input1_dim, num_input2_dims - i, input2_dim); return xnn_status_invalid_parameter; } first_nonunit = false; @@ -1093,517 +402,132 @@ static enum xnn_status reshape_binary_elementwise_nd( // Early exit without setting up context if any shape dimension is zero. 
if (degenerate_shape) { - binary_elementwise_op->state = xnn_run_state_skip; + op->state = xnn_run_state_skip; return xnn_status_success; } - binary_elementwise_op->context.elementwise_binary = (struct elementwise_binary_context) { - .elements = compressed_output_shape[0] << log2_element_size, + const uint32_t log2_element_size = op->log2_elementwise_element_size; + op->context.elementwise_binary = (struct elementwise_binary_context){ + .elements = compressed_output_shape[0] << log2_element_size, }; - if (params_size != 0) { - memcpy(&binary_elementwise_op->context.elementwise_binary.params, params, params_size); - } + memcpy(&op->context.elementwise_binary.params, &op->params.binary, + sizeof(op->params.binary)); const size_t* compressed_a_shape = compressed_input1_shape; const size_t* compressed_b_shape = compressed_input2_shape; if (compressed_input1_shape[0] == 1) { - binary_elementwise_op->context.elementwise_binary.flip_a_b = true; - binary_elementwise_op->context.elementwise_binary.ukernel = binary_elementwise_op->binary_elementwise_subconfig->ropc_ukernel; + op->context.elementwise_binary.flip_a_b = true; + op->context.elementwise_binary.ukernel = + op->binary_elementwise_subconfig->ropc_ukernel; compressed_a_shape = compressed_input2_shape; compressed_b_shape = compressed_input1_shape; - if (reversed_params_size != 0) { - memcpy(&binary_elementwise_op->context.elementwise_binary.params, reversed_params, reversed_params_size); - } + memcpy(&op->context.elementwise_binary.params, &op->params2.binary, + sizeof(op->params.binary)); } else if (compressed_input2_shape[0] == 1) { - binary_elementwise_op->context.elementwise_binary.ukernel = binary_elementwise_op->binary_elementwise_subconfig->opc_ukernel; + op->context.elementwise_binary.ukernel = + op->binary_elementwise_subconfig->opc_ukernel; } else if (compressed_input1_shape[0] == compressed_input2_shape[0]) { - binary_elementwise_op->context.elementwise_binary.ukernel = 
binary_elementwise_op->binary_elementwise_subconfig->op_ukernel; + op->context.elementwise_binary.ukernel = + op->binary_elementwise_subconfig->op_ukernel; } - size_t a_stride = compressed_a_shape[0], b_stride = compressed_b_shape[0], y_stride = compressed_output_shape[0]; + size_t a_stride = compressed_a_shape[0]; + size_t b_stride = compressed_b_shape[0]; + size_t y_stride = compressed_output_shape[0]; for (size_t i = 1; i < num_compressed_dims; i++) { if (compressed_a_shape[i] != 1) { - binary_elementwise_op->context.elementwise_binary.a_stride[XNN_MAX_TENSOR_DIMS - 1 - i] = a_stride << log2_element_size; + op->context.elementwise_binary.a_stride[XNN_MAX_TENSOR_DIMS - 1 - i] = + a_stride << log2_element_size; } if (compressed_b_shape[i] != 1) { - binary_elementwise_op->context.elementwise_binary.b_stride[XNN_MAX_TENSOR_DIMS - 1 - i] = b_stride << log2_element_size; + op->context.elementwise_binary.b_stride[XNN_MAX_TENSOR_DIMS - 1 - i] = + b_stride << log2_element_size; } - binary_elementwise_op->context.elementwise_binary.y_stride[XNN_MAX_TENSOR_DIMS - 1 - i] = y_stride << log2_element_size; + op->context.elementwise_binary.y_stride[XNN_MAX_TENSOR_DIMS - 1 - i] = + y_stride << log2_element_size; a_stride *= compressed_a_shape[i]; b_stride *= compressed_b_shape[i]; y_stride *= compressed_output_shape[i]; } const size_t num_threads = pthreadpool_get_threads_count(threadpool); - const size_t element_tile = binary_elementwise_op->binary_elementwise_subconfig->element_tile; + const size_t element_tile = op->binary_elementwise_subconfig->element_tile; if (compressed_output_shape[5] == 1) { if (compressed_output_shape[4] == 1) { if (compressed_output_shape[3] == 1) { if (compressed_output_shape[2] == 1) { if (compressed_output_shape[1] == 1) { - binary_elementwise_op->context.elementwise_binary.a_stride[4] = compressed_a_shape[0] == 1 ? 0 : (1 << log2_element_size); - binary_elementwise_op->context.elementwise_binary.b_stride[4] = compressed_b_shape[0] == 1 ? 
0 : (1 << log2_element_size); - binary_elementwise_op->context.elementwise_binary.y_stride[4] = (1 << log2_element_size); - binary_elementwise_op->context.elementwise_binary.elements = (1 << log2_element_size); - binary_elementwise_op->compute[0].type = xnn_parallelization_type_1d_tile_1d; - binary_elementwise_op->compute[0].task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_elementwise_binary_1d_tile; - binary_elementwise_op->compute[0].range[0] = compressed_output_shape[0] * (1 << log2_element_size); - binary_elementwise_op->compute[0].tile[0] = max(element_tile, round_up_po2(binary_elementwise_op->compute[0].range[0] / num_threads, (1 << log2_element_size))); + op->context.elementwise_binary.a_stride[4] = + compressed_a_shape[0] == 1 ? 0 : (1 << log2_element_size); + op->context.elementwise_binary.b_stride[4] = + compressed_b_shape[0] == 1 ? 0 : (1 << log2_element_size); + op->context.elementwise_binary.y_stride[4] = + (1 << log2_element_size); + op->context.elementwise_binary.elements = (1 << log2_element_size); + op->compute[0].type = xnn_parallelization_type_1d_tile_1d; + op->compute[0].task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) + xnn_compute_elementwise_binary_1d_tile; + op->compute[0].range[0] = + compressed_output_shape[0] * (1 << log2_element_size); + op->compute[0].tile[0] = + max(element_tile, + round_up_po2(op->compute[0].range[0] / num_threads, + (1 << log2_element_size))); } else { - binary_elementwise_op->compute[0].type = xnn_parallelization_type_1d; - binary_elementwise_op->compute[0].task_1d = (pthreadpool_task_1d_t) xnn_compute_elementwise_binary_1d; - binary_elementwise_op->compute[0].range[0] = compressed_output_shape[1]; + op->compute[0].type = xnn_parallelization_type_1d; + op->compute[0].task_1d = + (pthreadpool_task_1d_t)xnn_compute_elementwise_binary_1d; + op->compute[0].range[0] = compressed_output_shape[1]; } } else { - binary_elementwise_op->compute[0].type = xnn_parallelization_type_2d; - 
binary_elementwise_op->compute[0].task_2d = (pthreadpool_task_2d_t) xnn_compute_elementwise_binary_2d; - binary_elementwise_op->compute[0].range[0] = compressed_output_shape[2]; - binary_elementwise_op->compute[0].range[1] = compressed_output_shape[1]; + op->compute[0].type = xnn_parallelization_type_2d; + op->compute[0].task_2d = + (pthreadpool_task_2d_t)xnn_compute_elementwise_binary_2d; + op->compute[0].range[0] = compressed_output_shape[2]; + op->compute[0].range[1] = compressed_output_shape[1]; } } else { - binary_elementwise_op->compute[0].type = xnn_parallelization_type_3d; - binary_elementwise_op->compute[0].task_3d = (pthreadpool_task_3d_t) xnn_compute_elementwise_binary_3d; - binary_elementwise_op->compute[0].range[0] = compressed_output_shape[3]; - binary_elementwise_op->compute[0].range[1] = compressed_output_shape[2]; - binary_elementwise_op->compute[0].range[2] = compressed_output_shape[1]; + op->compute[0].type = xnn_parallelization_type_3d; + op->compute[0].task_3d = + (pthreadpool_task_3d_t)xnn_compute_elementwise_binary_3d; + op->compute[0].range[0] = compressed_output_shape[3]; + op->compute[0].range[1] = compressed_output_shape[2]; + op->compute[0].range[2] = compressed_output_shape[1]; } } else { - binary_elementwise_op->compute[0].type = xnn_parallelization_type_4d; - binary_elementwise_op->compute[0].task_4d = (pthreadpool_task_4d_t) xnn_compute_elementwise_binary_4d; - binary_elementwise_op->compute[0].range[0] = compressed_output_shape[4]; - binary_elementwise_op->compute[0].range[1] = compressed_output_shape[3]; - binary_elementwise_op->compute[0].range[2] = compressed_output_shape[2]; - binary_elementwise_op->compute[0].range[3] = compressed_output_shape[1]; + op->compute[0].type = xnn_parallelization_type_4d; + op->compute[0].task_4d = + (pthreadpool_task_4d_t)xnn_compute_elementwise_binary_4d; + op->compute[0].range[0] = compressed_output_shape[4]; + op->compute[0].range[1] = compressed_output_shape[3]; + op->compute[0].range[2] = 
compressed_output_shape[2]; + op->compute[0].range[3] = compressed_output_shape[1]; } } else { - binary_elementwise_op->compute[0].type = xnn_parallelization_type_5d; - binary_elementwise_op->compute[0].task_5d = (pthreadpool_task_5d_t) xnn_compute_elementwise_binary_5d; - binary_elementwise_op->compute[0].range[0] = compressed_output_shape[5]; - binary_elementwise_op->compute[0].range[1] = compressed_output_shape[4]; - binary_elementwise_op->compute[0].range[2] = compressed_output_shape[3]; - binary_elementwise_op->compute[0].range[3] = compressed_output_shape[2]; - binary_elementwise_op->compute[0].range[4] = compressed_output_shape[1]; + op->compute[0].type = xnn_parallelization_type_5d; + op->compute[0].task_5d = + (pthreadpool_task_5d_t)xnn_compute_elementwise_binary_5d; + op->compute[0].range[0] = compressed_output_shape[5]; + op->compute[0].range[1] = compressed_output_shape[4]; + op->compute[0].range[2] = compressed_output_shape[3]; + op->compute[0].range[3] = compressed_output_shape[2]; + op->compute[0].range[4] = compressed_output_shape[1]; } - binary_elementwise_op->state = xnn_run_state_needs_setup; + op->state = xnn_run_state_needs_setup; return xnn_status_success; } -static enum xnn_status reshape_binary_elementwise_nd_f16( - xnn_operator_t binary_elementwise_op, - enum xnn_operator_type expected_operator_type, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd( - binary_elementwise_op, - expected_operator_type, - num_input1_dims, - input1_shape, - num_input2_dims, - input2_shape, - /*log2_element_size=*/XNN_LOG2_SIZEOF_HALF, - &binary_elementwise_op->params.f16_minmax, sizeof(binary_elementwise_op->params.f16_minmax), - &binary_elementwise_op->params.f16_minmax, sizeof(binary_elementwise_op->params.f16_minmax), - threadpool); -} - -static enum xnn_status reshape_binary_elementwise_nd_f32( - xnn_operator_t 
binary_elementwise_op, - enum xnn_operator_type expected_operator_type, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd( - binary_elementwise_op, expected_operator_type, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - /*log2_element_size=*/XNN_LOG2_SIZEOF_FLOAT, - &binary_elementwise_op->params.f32_minmax, sizeof(binary_elementwise_op->params.f32_minmax), - &binary_elementwise_op->params.f32_minmax, sizeof(binary_elementwise_op->params.f32_minmax), - threadpool); -} - -enum xnn_status xnn_reshape_add_nd_f16( - xnn_operator_t add_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd_f16( - add_op, xnn_operator_type_add_nd_f16, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - threadpool); -} - -enum xnn_status xnn_reshape_add_nd_f32( - xnn_operator_t add_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd_f32( - add_op, xnn_operator_type_add_nd_f32, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - threadpool); -} - -enum xnn_status xnn_reshape_add_nd_qs8( - xnn_operator_t add_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd( - add_op, xnn_operator_type_add_nd_qs8, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - /*log2_element_size=*/XNN_LOG2_SIZEOF_INT8_T, - &add_op->params.qs8_add, sizeof(add_op->params.qs8_add), - &add_op->params2.qs8_add, sizeof(add_op->params2.qs8_add), - threadpool); -} - -enum xnn_status xnn_reshape_add_nd_qu8( - xnn_operator_t add_op, - 
size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd( - add_op, xnn_operator_type_add_nd_qu8, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - /*log2_element_size=*/XNN_LOG2_SIZEOF_UINT8_T, - &add_op->params.qu8_add, sizeof(add_op->params.qu8_add), - &add_op->params2.qu8_add, sizeof(add_op->params2.qu8_add), - threadpool); -} - -enum xnn_status xnn_reshape_divide_nd_f16( - xnn_operator_t divide_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd_f16( - divide_op, xnn_operator_type_divide_nd_f16, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - threadpool); -} - -enum xnn_status xnn_reshape_divide_nd_f32( - xnn_operator_t divide_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd_f32( - divide_op, xnn_operator_type_divide_nd_f32, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - threadpool); -} - -enum xnn_status xnn_reshape_copysign_nd_f32( - xnn_operator_t copysign_op, - size_t num_mag_dims, - const size_t* mag_shape, - size_t num_sign_dims, - const size_t* sign_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd_f32( - copysign_op, xnn_operator_type_copysign_nd_f32, - num_mag_dims, mag_shape, - num_sign_dims, sign_shape, - threadpool); -} - -enum xnn_status xnn_reshape_maximum_nd_f16( - xnn_operator_t maximum_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd_f16( - maximum_op, xnn_operator_type_maximum_nd_f16, - num_input1_dims, input1_shape, - num_input2_dims, 
input2_shape, - threadpool); -} - -enum xnn_status xnn_reshape_maximum_nd_f32( - xnn_operator_t maximum_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd_f32( - maximum_op, xnn_operator_type_maximum_nd_f32, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - threadpool); -} - -enum xnn_status xnn_reshape_minimum_nd_f16( - xnn_operator_t minimum_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd_f16( - minimum_op, xnn_operator_type_minimum_nd_f16, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - threadpool); -} - -enum xnn_status xnn_reshape_minimum_nd_f32( - xnn_operator_t minimum_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd_f32( - minimum_op, xnn_operator_type_minimum_nd_f32, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - threadpool); -} - - -enum xnn_status xnn_reshape_multiply_nd_f16( - xnn_operator_t multiply_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd_f16( - multiply_op, xnn_operator_type_multiply_nd_f16, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - threadpool); -} - -enum xnn_status xnn_reshape_multiply_nd_f32( - xnn_operator_t multiply_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd_f32( - multiply_op, xnn_operator_type_multiply_nd_f32, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, 
- threadpool); -} - -enum xnn_status xnn_reshape_multiply_nd_qs8( - xnn_operator_t multiply_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd( - multiply_op, xnn_operator_type_multiply_nd_qs8, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - /*log2_element_size=*/XNN_LOG2_SIZEOF_INT8_T, - &multiply_op->params.qs8_mul, sizeof(multiply_op->params.qs8_mul), - &multiply_op->params2.qs8_mul, sizeof(multiply_op->params2.qs8_mul), - threadpool); -} - -enum xnn_status xnn_reshape_multiply_nd_qu8( - xnn_operator_t multiply_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd( - multiply_op, xnn_operator_type_multiply_nd_qu8, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - /*log2_element_size=*/XNN_LOG2_SIZEOF_UINT8_T, - &multiply_op->params.qu8_mul, sizeof(multiply_op->params.qu8_mul), - &multiply_op->params2.qu8_mul, sizeof(multiply_op->params2.qu8_mul), - threadpool); -} - -enum xnn_status xnn_reshape_squared_difference_nd_f16( - xnn_operator_t squared_difference_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd_f16( - squared_difference_op, xnn_operator_type_squared_difference_nd_f16, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - threadpool); -} - -enum xnn_status xnn_reshape_squared_difference_nd_f32( - xnn_operator_t squared_difference_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd_f32( - squared_difference_op, xnn_operator_type_squared_difference_nd_f32, - num_input1_dims, 
input1_shape, - num_input2_dims, input2_shape, - threadpool); -} - - -enum xnn_status xnn_reshape_multiply_nd_s32( - xnn_operator_t mul_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - - return reshape_binary_elementwise_nd( - mul_op, xnn_operator_type_multiply_nd_s32, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - /*log2_element_size=*/XNN_LOG2_SIZEOF_INT32_T, - &mul_op->params.s32_default, sizeof(mul_op->params.s32_default), - &mul_op->params.s32_default, sizeof(mul_op->params.s32_default), - threadpool); -} - -enum xnn_status xnn_reshape_subtract_nd_f16( - xnn_operator_t subtract_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd_f16( - subtract_op, xnn_operator_type_subtract_nd_f16, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - threadpool); -} - -enum xnn_status xnn_reshape_subtract_nd_f32( - xnn_operator_t subtract_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd_f32( - subtract_op, xnn_operator_type_subtract_nd_f32, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - threadpool); -} - -enum xnn_status xnn_reshape_subtract_nd_qs8( - xnn_operator_t subtract_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd( - subtract_op, xnn_operator_type_subtract_nd_qs8, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - /*log2_element_size=*/XNN_LOG2_SIZEOF_INT8_T, - &subtract_op->params.qs8_add, sizeof(subtract_op->params.qs8_add), - &subtract_op->params2.qs8_add, sizeof(subtract_op->params2.qs8_add), 
- threadpool); -} - -enum xnn_status xnn_reshape_subtract_nd_qu8( - xnn_operator_t subtract_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool) -{ - return reshape_binary_elementwise_nd( - subtract_op, xnn_operator_type_subtract_nd_qu8, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - /*log2_element_size=*/XNN_LOG2_SIZEOF_UINT8_T, - &subtract_op->params.qu8_add, sizeof(subtract_op->params.qu8_add), - &subtract_op->params2.qu8_add, sizeof(subtract_op->params2.qu8_add), - threadpool); -} - -static enum xnn_status setup_binary_elementwise_nd( - xnn_operator_t binary_elementwise_op, - enum xnn_operator_type expected_operator_type, - const void* input1, - const void* input2, - void* output) -{ - if (binary_elementwise_op->type != expected_operator_type) { - xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)", - xnn_operator_type_to_string(expected_operator_type), - xnn_operator_type_to_string(binary_elementwise_op->type)); - return xnn_status_invalid_parameter; - } - - switch (binary_elementwise_op->state) { +enum xnn_status xnn_setup_binary_elementwise_nd(xnn_operator_t op, + const void* input1, + const void* input2, + void* output) { + switch (op->state) { case xnn_run_state_skip: return xnn_status_success; case xnn_run_state_invalid: xnn_log_error( - "failed to setup %s operator: operator has not been reshaped yet", - xnn_operator_type_to_string(binary_elementwise_op->type)); + "failed to setup %s operator: operator has not been reshaped yet", + xnn_operator_type_to_string(op->type)); return xnn_status_invalid_state; case xnn_run_state_needs_setup: // Operator has been reshaped, but not setup, continue with setup. 
@@ -1612,1165 +536,49 @@ static enum xnn_status setup_binary_elementwise_nd( break; } - binary_elementwise_op->context.elementwise_binary.a = input1; - binary_elementwise_op->context.elementwise_binary.b = input2; - binary_elementwise_op->context.elementwise_binary.y = output; + op->context.elementwise_binary.a = input1; + op->context.elementwise_binary.b = input2; + op->context.elementwise_binary.y = output; - if (binary_elementwise_op->context.elementwise_binary.flip_a_b) { - binary_elementwise_op->context.elementwise_binary.a = input2; - binary_elementwise_op->context.elementwise_binary.b = input1; + if (op->context.elementwise_binary.flip_a_b) { + op->context.elementwise_binary.a = input2; + op->context.elementwise_binary.b = input1; } - binary_elementwise_op->state = xnn_run_state_ready; + op->state = xnn_run_state_ready; return xnn_status_success; } -enum xnn_status xnn_setup_add_nd_f16( - xnn_operator_t add_op, - const void* input1, - const void* input2, - void* output) -{ - return setup_binary_elementwise_nd( - add_op, xnn_operator_type_add_nd_f16, - input1, input2, output); -} - -enum xnn_status xnn_setup_add_nd_f32( - xnn_operator_t add_op, - const float* input1, - const float* input2, - float* output) -{ - return setup_binary_elementwise_nd( - add_op, xnn_operator_type_add_nd_f32, - input1, input2, output); -} - -enum xnn_status xnn_setup_add_nd_qs8( - xnn_operator_t add_op, - const int8_t* input1, - const int8_t* input2, - int8_t* output) -{ - return setup_binary_elementwise_nd( - add_op, xnn_operator_type_add_nd_qs8, - input1, input2, output); -} - -enum xnn_status xnn_setup_add_nd_qu8( - xnn_operator_t add_op, - const uint8_t* input1, - const uint8_t* input2, - uint8_t* output) -{ - return setup_binary_elementwise_nd( - add_op, xnn_operator_type_add_nd_qu8, - input1, input2, output); -} - -enum xnn_status xnn_setup_divide_nd_f16( - xnn_operator_t divide_op, - const void* input1, - const void* input2, - void* output) -{ - return 
setup_binary_elementwise_nd( - divide_op, xnn_operator_type_divide_nd_f16, - input1, input2, output); -} - -enum xnn_status xnn_setup_divide_nd_f32( - xnn_operator_t divide_op, - const float* input1, - const float* input2, - float* output) -{ - return setup_binary_elementwise_nd( - divide_op, xnn_operator_type_divide_nd_f32, - input1, input2, output); -} - -enum xnn_status xnn_setup_copysign_nd_f32( - xnn_operator_t copysign_op, - const float* mag, - const float* sign, - float* output) -{ - return setup_binary_elementwise_nd( - copysign_op, xnn_operator_type_copysign_nd_f32, - mag, sign, output); -} - -enum xnn_status xnn_setup_maximum_nd_f16( - xnn_operator_t maximum_op, - const void* input1, - const void* input2, - void* output) -{ - return setup_binary_elementwise_nd( - maximum_op, xnn_operator_type_maximum_nd_f16, - input1, input2, output); -} - -enum xnn_status xnn_setup_maximum_nd_f32( - xnn_operator_t maximum_op, - const float* input1, - const float* input2, - float* output) -{ - return setup_binary_elementwise_nd( - maximum_op, xnn_operator_type_maximum_nd_f32, - input1, input2, output); -} - -enum xnn_status xnn_setup_minimum_nd_f16( - xnn_operator_t minimum_op, - const void* input1, - const void* input2, - void* output) -{ - return setup_binary_elementwise_nd( - minimum_op, xnn_operator_type_minimum_nd_f16, - input1, input2, output); -} - -enum xnn_status xnn_setup_minimum_nd_f32( - xnn_operator_t minimum_op, - const float* input1, - const float* input2, - float* output) -{ - return setup_binary_elementwise_nd( - minimum_op, xnn_operator_type_minimum_nd_f32, - input1, input2, output); -} - -enum xnn_status xnn_setup_multiply_nd_f16( - xnn_operator_t multiply_op, - const void* input1, - const void* input2, - void* output) -{ - return setup_binary_elementwise_nd( - multiply_op, xnn_operator_type_multiply_nd_f16, - input1, input2, output); -} - -enum xnn_status xnn_setup_multiply_nd_f32( - xnn_operator_t multiply_op, - const float* input1, - const float* 
input2, - float* output) -{ - return setup_binary_elementwise_nd( - multiply_op, xnn_operator_type_multiply_nd_f32, - input1, input2, output); -} - -enum xnn_status xnn_setup_multiply_nd_qs8( - xnn_operator_t multiply_op, - const int8_t* input1, - const int8_t* input2, - int8_t* output) -{ - return setup_binary_elementwise_nd( - multiply_op, xnn_operator_type_multiply_nd_qs8, - input1, input2, output); -} - -enum xnn_status xnn_setup_multiply_nd_qu8( - xnn_operator_t multiply_op, - const uint8_t* input1, - const uint8_t* input2, - uint8_t* output) -{ - return setup_binary_elementwise_nd( - multiply_op, xnn_operator_type_multiply_nd_qu8, - input1, input2, output); -} - -enum xnn_status xnn_setup_squared_difference_nd_f16( - xnn_operator_t squared_difference_op, - const void* input1, - const void* input2, - void* output) -{ - return setup_binary_elementwise_nd( - squared_difference_op, xnn_operator_type_squared_difference_nd_f16, - input1, input2, output); -} - -enum xnn_status xnn_setup_squared_difference_nd_f32( - xnn_operator_t squared_difference_op, - const float* input1, - const float* input2, - float* output) -{ - return setup_binary_elementwise_nd( - squared_difference_op, xnn_operator_type_squared_difference_nd_f32, - input1, input2, output); -} - -enum xnn_status xnn_setup_subtract_nd_f16( - xnn_operator_t subtract_op, - const void* input1, - const void* input2, - void* output) -{ - return setup_binary_elementwise_nd( - subtract_op, xnn_operator_type_subtract_nd_f16, - input1, input2, output); -} - - -enum xnn_status xnn_setup_multiply_nd_s32( - xnn_operator_t mul_op, - const int32_t* input1, - const int32_t* input2, - int32_t* output) -{ - return setup_binary_elementwise_nd( - mul_op, xnn_operator_type_multiply_nd_s32, - input1, input2, output); -} - - -enum xnn_status xnn_setup_subtract_nd_f32( - xnn_operator_t subtract_op, - const float* input1, - const float* input2, - float* output) -{ - return setup_binary_elementwise_nd( - subtract_op, 
xnn_operator_type_subtract_nd_f32, - input1, input2, output); -} - -enum xnn_status xnn_setup_subtract_nd_qs8( - xnn_operator_t subtract_op, - const int8_t* input1, - const int8_t* input2, - int8_t* output) -{ - return setup_binary_elementwise_nd( - subtract_op, xnn_operator_type_subtract_nd_qs8, - input1, input2, output); -} - -enum xnn_status xnn_setup_subtract_nd_qu8( - xnn_operator_t subtract_op, - const uint8_t* input1, - const uint8_t* input2, - uint8_t* output) -{ - return setup_binary_elementwise_nd( - subtract_op, xnn_operator_type_subtract_nd_qu8, - input1, input2, output); -} - -static enum xnn_status run_binary_elementwise_nd( - enum xnn_operator_type operator_type, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - const void* input1, - const void* input2, - void* output, - uint32_t log2_element_size, - size_t params_offset, - size_t setup_params_size, - size_t rparams_offset, - size_t setup_reversed_params_size, - const struct xnn_binary_elementwise_subconfig* binary_elementwise_subconfig, - const void* create_params, - const void* create_params2, - size_t create_params_size, - uint32_t flags, - pthreadpool_t threadpool) -{ - struct xnn_operator binary_elementwise_op; - memset(&binary_elementwise_op, 0, sizeof(binary_elementwise_op)); - - init_binary_elementwise_nd( - create_params, - create_params2, - create_params_size, - flags, - operator_type, - binary_elementwise_subconfig, - &binary_elementwise_op); - - const void* setup_params = (void*) ((uintptr_t) &binary_elementwise_op + params_offset); - const void* setup_reversed_params = (void*) ((uintptr_t) &binary_elementwise_op + rparams_offset); - - enum xnn_status status = reshape_binary_elementwise_nd( - &binary_elementwise_op, operator_type, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - log2_element_size, - setup_params, setup_params_size, - setup_reversed_params, setup_reversed_params_size, - threadpool); - - - 
status = setup_binary_elementwise_nd( - &binary_elementwise_op, operator_type, - input1, input2, output); - +enum xnn_status xnn_run_binary_elementwise_nd( + enum xnn_binary_operator type, enum xnn_datatype datatype, + const struct xnn_quantization_params* input1_quantization, + const struct xnn_quantization_params* input2_quantization, + const struct xnn_quantization_params* output_quantization, uint32_t flags, + size_t num_input1_dims, const size_t* input1_shape, size_t num_input2_dims, + const size_t* input2_shape, const void* input1, const void* input2, + void* output, pthreadpool_t threadpool) { + struct xnn_operator op; + memset(&op, 0, sizeof(op)); + + enum xnn_status status = init_binary_elementwise_nd( + &op, type, datatype, input1_quantization, input2_quantization, + output_quantization, flags); if (status != xnn_status_success) { return status; } - return xnn_run_operator(&binary_elementwise_op, threadpool); -} - -static enum xnn_status run_binary_elementwise_nd_f32( - enum xnn_operator_type operator_type, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - const float* input1, - const float* input2, - float* output, - float output_min, - float output_max, - const struct xnn_binary_elementwise_config* config, - uint32_t flags, - pthreadpool_t threadpool) -{ - if (isnan(output_min)) { - xnn_log_error( - "failed to run %s operator with NaN output lower bound: lower bound must be non-NaN", - xnn_operator_type_to_string(operator_type)); - return xnn_status_invalid_parameter; - } - - if (isnan(output_max)) { - xnn_log_error( - "failed to run %s operator with NaN output upper bound: upper bound must be non-NaN", - xnn_operator_type_to_string(operator_type)); - return xnn_status_invalid_parameter; - } - - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be less than or equal to upper bound", - 
xnn_operator_type_to_string(operator_type), output_min, output_max); - return xnn_status_invalid_parameter; - } - - if (config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(operator_type)); - return xnn_status_unsupported_hardware; - } - - union xnn_f32_minmax_params params; - assert(config->init.f32_minmax != NULL); - config->init.f32_minmax(¶ms, output_min, output_max); - - const bool linear_activation = (output_max == INFINITY) && (output_min == -output_max); - const struct xnn_binary_elementwise_subconfig* binary_elementwise_subconfig = &config->minmax; - if (linear_activation && config->linear.op_ukernel != NULL) { - binary_elementwise_subconfig = &config->linear; - } - - return run_binary_elementwise_nd( - operator_type, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - input1, input2, output, - /*log2_element_size=*/XNN_LOG2_SIZEOF_FLOAT, - offsetof(struct xnn_operator, params.f32_minmax), sizeof(params), - offsetof(struct xnn_operator, params2.f32_minmax), sizeof(params), - binary_elementwise_subconfig, - ¶ms, - ¶ms, - sizeof(params), - flags, - threadpool); -} - -enum xnn_status xnn_run_add_nd_f32( - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - const float* input1, - const float* input2, - float* output, - float output_min, - float output_max, - uint32_t flags, - pthreadpool_t threadpool) -{ - return run_binary_elementwise_nd_f32( - xnn_operator_type_add_nd_f32, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - input1, input2, output, - output_min, output_max, - xnn_init_f32_vadd_config(), - flags, - threadpool); -} - -enum xnn_status xnn_run_divide_nd_f32( - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - const float* input1, - const float* input2, - float* output, - float output_min, - float output_max, - 
uint32_t flags, - pthreadpool_t threadpool) -{ - return run_binary_elementwise_nd_f32( - xnn_operator_type_divide_nd_f32, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - input1, input2, output, - output_min, output_max, - xnn_init_f32_vdiv_config(), - flags, - threadpool); -} - -enum xnn_status xnn_run_maximum_nd_f32( - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - const float* input1, - const float* input2, - float* output, - uint32_t flags, - pthreadpool_t threadpool) -{ - const struct xnn_binary_elementwise_config* f32_vmax_config = xnn_init_f32_vmax_config(); - if (f32_vmax_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_maximum_nd_f32)); - return xnn_status_unsupported_hardware; - } - - struct xnn_f32_default_params params; - if (f32_vmax_config->init.f32_default != NULL) { - f32_vmax_config->init.f32_default(¶ms); - } - - const struct xnn_binary_elementwise_subconfig* binary_elementwise_subconfig = &f32_vmax_config->minmax; - - return run_binary_elementwise_nd( - xnn_operator_type_maximum_nd_f32, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - input1, input2, output, - /*log2_element_size=*/XNN_LOG2_SIZEOF_FLOAT, - offsetof(struct xnn_operator, params.f32_minmax), sizeof(params), - offsetof(struct xnn_operator, params2.f32_minmax), sizeof(params), - binary_elementwise_subconfig, - ¶ms, - ¶ms, - sizeof(params), - flags, - threadpool); -} - -enum xnn_status xnn_run_minimum_nd_f32( - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - const float* input1, - const float* input2, - float* output, - uint32_t flags, - pthreadpool_t threadpool) -{ - const struct xnn_binary_elementwise_config* f32_vmin_config = xnn_init_f32_vmin_config(); - if (f32_vmin_config == NULL) { - xnn_log_error("failed to create 
%s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_minimum_nd_f32)); - return xnn_status_unsupported_hardware; - } - - struct xnn_f32_default_params params; - if (f32_vmin_config->init.f32_default != NULL) { - f32_vmin_config->init.f32_default(¶ms); - } - const struct xnn_binary_elementwise_subconfig* binary_elementwise_subconfig = &f32_vmin_config->minmax; - - return run_binary_elementwise_nd( - xnn_operator_type_minimum_nd_f32, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - input1, input2, output, - /*log2_element_size=*/XNN_LOG2_SIZEOF_FLOAT, - offsetof(struct xnn_operator, params.f32_minmax), sizeof(params), - offsetof(struct xnn_operator, params2.f32_minmax), sizeof(params), - binary_elementwise_subconfig, - ¶ms, - ¶ms, - sizeof(params), - flags, - threadpool); -} - -enum xnn_status xnn_run_multiply_nd_f32( - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - const float* input1, - const float* input2, - float* output, - float output_min, - float output_max, - uint32_t flags, - pthreadpool_t threadpool) -{ - return run_binary_elementwise_nd_f32( - xnn_operator_type_multiply_nd_f32, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - input1, input2, output, - output_min, output_max, - xnn_init_f32_vmul_config(), - flags, - threadpool); -} - -enum xnn_status xnn_run_subtract_nd_f32( - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - const float* input1, - const float* input2, - float* output, - float output_min, - float output_max, - uint32_t flags, - pthreadpool_t threadpool) -{ - return run_binary_elementwise_nd_f32( - xnn_operator_type_subtract_nd_f32, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - input1, input2, output, - output_min, output_max, - xnn_init_f32_vsub_config(), - flags, - threadpool); -} - -enum xnn_status 
xnn_run_squared_difference_nd_f32( - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - const float* input1, - const float* input2, - float* output, - uint32_t flags, - pthreadpool_t threadpool) -{ - const struct xnn_binary_elementwise_config* f32_vsqrdiff_config = xnn_init_f32_vsqrdiff_config(); - if (f32_vsqrdiff_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_squared_difference_nd_f32)); - return xnn_status_unsupported_hardware; - } - - struct xnn_f32_default_params params; - if (f32_vsqrdiff_config->init.f32_default != NULL) { - f32_vsqrdiff_config->init.f32_default(¶ms); - } - - const struct xnn_binary_elementwise_subconfig* binary_elementwise_subconfig = &f32_vsqrdiff_config->minmax; - - return run_binary_elementwise_nd( - xnn_operator_type_squared_difference_nd_f32, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - input1, input2, output, - /*log2_element_size=*/XNN_LOG2_SIZEOF_FLOAT, - offsetof(struct xnn_operator, params.f32_minmax), sizeof(params), - offsetof(struct xnn_operator, params2.f32_minmax), sizeof(params), - binary_elementwise_subconfig, - ¶ms, - ¶ms, - sizeof(params), - flags, - threadpool); -} - - -enum xnn_status xnn_run_add_nd_qs8( - size_t num_input1_dims, - const size_t* input1_shape, - int8_t input1_zero_point, - float input1_scale, - size_t num_input2_dims, - const size_t* input2_shape, - int8_t input2_zero_point, - float input2_scale, - const int8_t* input1, - const int8_t* input2, - int8_t* output, - int8_t output_zero_point, - float output_scale, - int8_t output_min, - int8_t output_max, - uint32_t flags, - pthreadpool_t threadpool) -{ - if (input1_scale <= 0.0f || !isnormal(input1_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 1 scale: scale must be finite and positive", - 
xnn_operator_type_to_string(xnn_operator_type_add_nd_qs8), input1_scale); - return xnn_status_invalid_parameter; - } - - if (input2_scale <= 0.0f || !isnormal(input2_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 2 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qs8), input2_scale); - return xnn_status_invalid_parameter; - } - - if (output_scale <= 0.0f || !isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qs8), output_scale); - return xnn_status_invalid_parameter; - } - - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: lower bound must be less than or equal to upper bound", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qs8), output_min, output_max); - return xnn_status_invalid_parameter; - } - - const float input1_output_scale = input1_scale / output_scale; - if (input1_output_scale < 0x1.0p-10f || input1_output_scale >= 0x1.0p+8f) { - xnn_log_error( - "failed to create %s operator with %.7g input1-to-output scale ratio: scale ratio must be in [2**-10, 2**8) range", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qs8), input1_output_scale); - return xnn_status_unsupported_parameter; - } - - const float input2_output_scale = input2_scale / output_scale; - if (input2_output_scale < 0x1.0p-10f || input2_output_scale >= 0x1.0p+8f) { - xnn_log_error( - "failed to create %s operator with %.7g input2-to-output scale ratio: scale ratio must be in [2**-10, 2**8) range", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qs8), input2_output_scale); - return xnn_status_unsupported_parameter; - } - - const struct xnn_binary_elementwise_config* qs8_vadd_config = xnn_init_qs8_vadd_config(); - if (qs8_vadd_config == NULL) { - xnn_log_error("failed to create %s operator: 
unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qs8)); - return xnn_status_unsupported_hardware; - } - struct xnn_qs8_add_minmax_params params; - struct xnn_qs8_add_minmax_params params2; - assert(qs8_vadd_config->init.qs8_add != NULL); - qs8_vadd_config->init.qs8_add( - ¶ms, input1_zero_point, input2_zero_point, output_zero_point, - input1_output_scale, input2_output_scale, output_min, output_max); - qs8_vadd_config->init.qs8_add( - ¶ms2, input2_zero_point, input1_zero_point, output_zero_point, - input2_output_scale, input1_output_scale, output_min, output_max); - - - return run_binary_elementwise_nd( - xnn_operator_type_add_nd_qs8, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - input1, input2, output, - /*log2_element_size=*/XNN_LOG2_SIZEOF_INT8_T, - offsetof(struct xnn_operator, params.qs8_add), sizeof(params), - offsetof(struct xnn_operator, params2.qs8_add), sizeof(params2), - &qs8_vadd_config->minmax, - ¶ms, - ¶ms2, - sizeof(params), - flags, - threadpool); -} - -enum xnn_status xnn_run_multiply_nd_qs8( - size_t num_input1_dims, - const size_t* input1_shape, - int8_t input1_zero_point, - float input1_scale, - size_t num_input2_dims, - const size_t* input2_shape, - int8_t input2_zero_point, - float input2_scale, - const int8_t* input1, - const int8_t* input2, - int8_t* output, - int8_t output_zero_point, - float output_scale, - int8_t output_min, - int8_t output_max, - uint32_t flags, - pthreadpool_t threadpool) -{ - if (input1_scale <= 0.0f || !isnormal(input1_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 1 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qs8), input1_scale); - return xnn_status_invalid_parameter; - } - - if (input2_scale <= 0.0f || !isnormal(input2_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 2 scale: scale must be finite and positive", - 
xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qs8), input2_scale); - return xnn_status_invalid_parameter; - } - - if (output_scale <= 0.0f || !isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qs8), output_scale); - return xnn_status_invalid_parameter; - } - - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: lower bound must be less than or equal to upper bound", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qs8), output_min, output_max); - return xnn_status_invalid_parameter; - } - - const float product_scale = input1_scale * input2_scale; - const float product_output_scale = product_scale / output_scale; - - if (product_output_scale < 0x1.0p-16f || product_output_scale >= 0x1.0p+8f) { - xnn_log_error( - "failed to create %s operator with %.7g product-to-output scale ratio: scale ratio must be in [2**-16, 2**8) range", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qs8), product_output_scale); - return xnn_status_unsupported_parameter; - } - - const struct xnn_binary_elementwise_config* qs8_vmul_config = xnn_init_qs8_vmul_config(); - if (qs8_vmul_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qs8)); - return xnn_status_unsupported_hardware; - } - union xnn_qs8_mul_minmax_params params; - union xnn_qs8_mul_minmax_params params2; - - assert(qs8_vmul_config->init.qs8_mul != NULL); - qs8_vmul_config->init.qs8_mul( - ¶ms, input1_zero_point, input2_zero_point, output_zero_point, - product_output_scale, output_min, output_max); - qs8_vmul_config->init.qs8_mul( - ¶ms2, input2_zero_point, input1_zero_point, output_zero_point, - product_output_scale, output_min, output_max); - - return 
run_binary_elementwise_nd( - xnn_operator_type_multiply_nd_qs8, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - input1, input2, output, - /*log2_element_size=*/XNN_LOG2_SIZEOF_INT8_T, - offsetof(struct xnn_operator, params.qs8_mul), sizeof(params), - offsetof(struct xnn_operator, params2.qs8_mul), sizeof(params2), - &qs8_vmul_config->minmax, - ¶ms, - ¶ms2, - sizeof(params), - flags, - threadpool); -} - -enum xnn_status xnn_run_subtract_nd_qs8( - size_t num_input1_dims, - const size_t* input1_shape, - int8_t input1_zero_point, - float input1_scale, - size_t num_input2_dims, - const size_t* input2_shape, - int8_t input2_zero_point, - float input2_scale, - const int8_t* input1, - const int8_t* input2, - int8_t* output, - int8_t output_zero_point, - float output_scale, - int8_t output_min, - int8_t output_max, - uint32_t flags, - pthreadpool_t threadpool) -{ - if (input1_scale <= 0.0f || !isnormal(input1_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 1 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qs8), input1_scale); - return xnn_status_invalid_parameter; - } - - if (input2_scale <= 0.0f || !isnormal(input2_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 2 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qs8), input2_scale); - return xnn_status_invalid_parameter; - } - - if (output_scale <= 0.0f || !isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qs8), output_scale); - return xnn_status_invalid_parameter; - } - - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: lower bound must be less than or equal to upper bound", - 
xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qs8), output_min, output_max); - return xnn_status_invalid_parameter; - } - - const float input1_output_scale = input1_scale / output_scale; - if (input1_output_scale < 0x1.0p-10f || input1_output_scale >= 0x1.0p+8f) { - xnn_log_error( - "failed to create %s operator with %.7g input1-to-output scale ratio: scale ratio must be in [2**-10, 2**8) range", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qs8), input1_output_scale); - return xnn_status_unsupported_parameter; - } - - const float input2_output_scale = input2_scale / output_scale; - if (input2_output_scale < 0x1.0p-10f || input2_output_scale >= 0x1.0p+8f) { - xnn_log_error( - "failed to create %s operator with %.7g input2-to-output scale ratio: scale ratio must be in [2**-10, 2**8) range", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qs8), input2_output_scale); - return xnn_status_unsupported_parameter; - } - - const struct xnn_binary_elementwise_config* qs8_vadd_config = xnn_init_qs8_vadd_config(); - if (qs8_vadd_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qs8)); - return xnn_status_unsupported_hardware; - } - struct xnn_qs8_add_minmax_params params; - struct xnn_qs8_add_minmax_params params2; - assert(qs8_vadd_config->init.qs8_add != NULL); - qs8_vadd_config->init.qs8_add( - ¶ms, input1_zero_point, input2_zero_point, output_zero_point, - input1_output_scale, -input2_output_scale, output_min, output_max); - qs8_vadd_config->init.qs8_add( - ¶ms2, input2_zero_point, input1_zero_point, output_zero_point, - -input2_output_scale, input1_output_scale, output_min, output_max); - - return run_binary_elementwise_nd( - xnn_operator_type_subtract_nd_qs8, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - input1, input2, output, - /*log2_element_size=*/XNN_LOG2_SIZEOF_INT8_T, - offsetof(struct 
xnn_operator, params.qs8_add), sizeof(params), - offsetof(struct xnn_operator, params2.qs8_add), sizeof(params2), - &qs8_vadd_config->minmax, - ¶ms, - ¶ms2, - sizeof(params), - flags, - threadpool); -} - -enum xnn_status xnn_run_add_nd_qu8( - size_t num_input1_dims, - const size_t* input1_shape, - uint8_t input1_zero_point, - float input1_scale, - size_t num_input2_dims, - const size_t* input2_shape, - uint8_t input2_zero_point, - float input2_scale, - const uint8_t* input1, - const uint8_t* input2, - uint8_t* output, - uint8_t output_zero_point, - float output_scale, - uint8_t output_min, - uint8_t output_max, - uint32_t flags, - pthreadpool_t threadpool) -{ - if (input1_scale <= 0.0f || !isnormal(input1_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 1 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qu8), input1_scale); - return xnn_status_invalid_parameter; - } - - if (input2_scale <= 0.0f || !isnormal(input2_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 2 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qu8), input2_scale); - return xnn_status_invalid_parameter; - } - - if (output_scale <= 0.0f || !isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qu8), output_scale); - return xnn_status_invalid_parameter; - } - - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: lower bound must be less than or equal to upper bound", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qu8), output_min, output_max); - return xnn_status_invalid_parameter; - } - - const float input1_output_scale = input1_scale / output_scale; - if (input1_output_scale < 0x1.0p-10f || input1_output_scale >= 0x1.0p+8f) { - 
xnn_log_error( - "failed to create %s operator with %.7g input1-to-output scale ratio: scale ratio must be in [2**-10, 2**8) range", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qu8), input1_output_scale); - return xnn_status_unsupported_parameter; - } - - const float input2_output_scale = input2_scale / output_scale; - if (input2_output_scale < 0x1.0p-10f || input2_output_scale >= 0x1.0p+8f) { - xnn_log_error( - "failed to create %s operator with %.7g input2-to-output scale ratio: scale ratio must be in [2**-10, 2**8) range", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qu8), input2_output_scale); - return xnn_status_unsupported_parameter; - } - - const struct xnn_binary_elementwise_config* qu8_vadd_config = xnn_init_qu8_vadd_config(); - if (qu8_vadd_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_add_nd_qu8)); - return xnn_status_unsupported_hardware; - } - struct xnn_qu8_add_minmax_params params; - struct xnn_qu8_add_minmax_params params2; - assert(qu8_vadd_config->init.qu8_add != NULL); - qu8_vadd_config->init.qu8_add( - ¶ms, input1_zero_point, input2_zero_point, output_zero_point, - input1_output_scale, input2_output_scale, output_min, output_max); - qu8_vadd_config->init.qu8_add( - ¶ms2, input2_zero_point, input1_zero_point, output_zero_point, - input2_output_scale, input1_output_scale, output_min, output_max); - - return run_binary_elementwise_nd( - xnn_operator_type_add_nd_qu8, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - input1, input2, output, - /*log2_element_size=*/XNN_LOG2_SIZEOF_UINT8_T, - offsetof(struct xnn_operator, params.qu8_add), sizeof(params), - offsetof(struct xnn_operator, params2.qu8_add), sizeof(params2), - &qu8_vadd_config->minmax, - ¶ms, - ¶ms2, - sizeof(params), - flags, - threadpool); -} - -enum xnn_status xnn_run_multiply_nd_qu8( - size_t num_input1_dims, - const size_t* input1_shape, - 
uint8_t input1_zero_point, - float input1_scale, - size_t num_input2_dims, - const size_t* input2_shape, - uint8_t input2_zero_point, - float input2_scale, - const uint8_t* input1, - const uint8_t* input2, - uint8_t* output, - uint8_t output_zero_point, - float output_scale, - uint8_t output_min, - uint8_t output_max, - uint32_t flags, - pthreadpool_t threadpool) -{ - if (input1_scale <= 0.0f || !isnormal(input1_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 1 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qu8), input1_scale); - return xnn_status_invalid_parameter; - } - - if (input2_scale <= 0.0f || !isnormal(input2_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 2 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qu8), input2_scale); - return xnn_status_invalid_parameter; - } - - if (output_scale <= 0.0f || !isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qu8), output_scale); - return xnn_status_invalid_parameter; - } - - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: lower bound must be less than or equal to upper bound", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qu8), output_min, output_max); - return xnn_status_invalid_parameter; - } - - const float product_scale = input1_scale * input2_scale; - const float product_output_scale = product_scale / output_scale; - if (product_output_scale < 0x1.0p-16f || product_output_scale >= 0x1.0p+8f) { - xnn_log_error( - "failed to create %s operator with %.7g product-to-output scale ratio: scale ratio must be in [2**-16, 2**8) range", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qu8), product_output_scale); - 
return xnn_status_unsupported_parameter; - } - - const struct xnn_binary_elementwise_config* qu8_vmul_config = xnn_init_qu8_vmul_config(); - if (qu8_vmul_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_qu8)); - return xnn_status_unsupported_hardware; - } - - union xnn_qu8_mul_minmax_params params; - union xnn_qu8_mul_minmax_params params2; - assert(qu8_vmul_config->init.qu8_mul != NULL); - qu8_vmul_config->init.qu8_mul( - ¶ms, input1_zero_point, input2_zero_point, output_zero_point, - product_output_scale, output_min, output_max); - qu8_vmul_config->init.qu8_mul( - ¶ms2, input2_zero_point, input1_zero_point, output_zero_point, - product_output_scale, output_min, output_max); - - return run_binary_elementwise_nd( - xnn_operator_type_multiply_nd_qu8, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - input1, input2, output, - /*log2_element_size=*/XNN_LOG2_SIZEOF_UINT8_T, - offsetof(struct xnn_operator, params.qu8_mul), sizeof(params), - offsetof(struct xnn_operator, params2.qu8_mul), sizeof(params2), - &qu8_vmul_config->minmax, - ¶ms, - ¶ms2, - sizeof(params), - flags, - threadpool); -} - -enum xnn_status xnn_run_subtract_nd_qu8( - size_t num_input1_dims, - const size_t* input1_shape, - uint8_t input1_zero_point, - float input1_scale, - size_t num_input2_dims, - const size_t* input2_shape, - uint8_t input2_zero_point, - float input2_scale, - const uint8_t* input1, - const uint8_t* input2, - uint8_t* output, - uint8_t output_zero_point, - float output_scale, - uint8_t output_min, - uint8_t output_max, - uint32_t flags, - pthreadpool_t threadpool) -{ - if (input1_scale <= 0.0f || !isnormal(input1_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 1 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qu8), input1_scale); - return xnn_status_invalid_parameter; - } - - 
if (input2_scale <= 0.0f || !isnormal(input2_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input 2 scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qu8), input2_scale); - return xnn_status_invalid_parameter; - } - - if (output_scale <= 0.0f || !isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale: scale must be finite and positive", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qu8), output_scale); - return xnn_status_invalid_parameter; - } - - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: lower bound must be less than or equal to upper bound", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qu8), output_min, output_max); - return xnn_status_invalid_parameter; - } - - const float input1_output_scale = input1_scale / output_scale; - if (input1_output_scale < 0x1.0p-10f || input1_output_scale >= 0x1.0p+8f) { - xnn_log_error( - "failed to create %s operator with %.7g input1-to-output scale ratio: scale ratio must be in [2**-10, 2**8) range", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qu8), input1_output_scale); - return xnn_status_unsupported_parameter; - } - - const float input2_output_scale = input2_scale / output_scale; - if (input2_output_scale < 0x1.0p-10f || input2_output_scale >= 0x1.0p+8f) { - xnn_log_error( - "failed to create %s operator with %.7g input2-to-output scale ratio: scale ratio must be in [2**-10, 2**8) range", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qu8), input2_output_scale); - return xnn_status_unsupported_parameter; + status = xnn_reshape_binary_elementwise_nd(&op, num_input1_dims, input1_shape, + num_input2_dims, input2_shape, + threadpool); + if (status != xnn_status_success) { + return status; } - const struct xnn_binary_elementwise_config* qu8_vadd_config = 
xnn_init_qu8_vadd_config(); - if (qu8_vadd_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_subtract_nd_qu8)); - return xnn_status_unsupported_hardware; + status = xnn_setup_binary_elementwise_nd(&op, input1, input2, output); + if (status != xnn_status_success) { + return status; } - struct xnn_qu8_add_minmax_params params; - struct xnn_qu8_add_minmax_params params2; - assert(qu8_vadd_config->init.qu8_add != NULL); - qu8_vadd_config->init.qu8_add( - ¶ms, input1_zero_point, input2_zero_point, output_zero_point, - input1_output_scale, -input2_output_scale, output_min, output_max); - qu8_vadd_config->init.qu8_add( - ¶ms2, input2_zero_point, input1_zero_point, output_zero_point, - -input2_output_scale, input1_output_scale, output_min, output_max); - - return run_binary_elementwise_nd( - xnn_operator_type_subtract_nd_qu8, - num_input1_dims, input1_shape, - num_input2_dims, input2_shape, - input1, input2, output, - /*log2_element_size=*/XNN_LOG2_SIZEOF_UINT8_T, - offsetof(struct xnn_operator, params.qu8_add), sizeof(params), - offsetof(struct xnn_operator, params2.qu8_add), sizeof(params2), - &qu8_vadd_config->minmax, - ¶ms, - ¶ms2, - sizeof(params), - flags, - threadpool); + return xnn_run_operator(&op, threadpool); } diff --git a/src/operators/softmax-nc.c b/src/operators/softmax-nc.c index 52a5f932652..5fdcaaee100 100644 --- a/src/operators/softmax-nc.c +++ b/src/operators/softmax-nc.c @@ -296,7 +296,7 @@ enum xnn_status xnn_create_softmax_nc_f16( if (vmul_config == NULL) { xnn_log_error( "failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_f16)); + xnn_operator_type_to_string(xnn_operator_type_softmax_nc_f16)); return xnn_status_unsupported_hardware; } @@ -334,7 +334,7 @@ enum xnn_status xnn_create_softmax_nc_f32( if (vmul_config == NULL) { xnn_log_error( "failed to create %s operator: 
unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_multiply_nd_f32)); + xnn_operator_type_to_string(xnn_operator_type_softmax_nc_f32)); return xnn_status_unsupported_hardware; } @@ -528,10 +528,9 @@ enum xnn_status xnn_reshape_softmax_nc_f16( } const struct xnn_binary_elementwise_config* f16_vmul_config = softmax_op->vmul_config; - union xnn_f16_minmax_params minmax_params; - if (f16_vmul_config->init.f16_minmax != NULL) { - f16_vmul_config->init.f16_minmax(&minmax_params, xnn_float16_from_float(-INFINITY), - xnn_float16_from_float(INFINITY)); + union xnn_binary_uparams mul_params; + if (f16_vmul_config->init != NULL) { + f16_vmul_config->init(&mul_params, NULL, NULL, NULL); } return reshape_softmax_nc_floating_point( softmax_op, xnn_operator_type_softmax_nc_f16, @@ -542,7 +541,7 @@ enum xnn_status xnn_reshape_softmax_nc_f16( (xnn_compute_reciprocal_fn) compute_reciprocal_f16, &rmax_params, sizeof(rmax_params), &expminus_params, sizeof(expminus_params), - &minmax_params, sizeof(minmax_params)); + &mul_params, sizeof(mul_params)); } enum xnn_status xnn_reshape_softmax_nc_f32( @@ -564,9 +563,9 @@ enum xnn_status xnn_reshape_softmax_nc_f32( if (softmax_op->raddstoreexpminusmax_config->init.f32 != NULL) { softmax_op->raddstoreexpminusmax_config->init.f32(&expminus_params); } - union xnn_f32_minmax_params minmax_params; - if (f32_vmul_config->init.f32_minmax != NULL) { - f32_vmul_config->init.f32_minmax(&minmax_params, -INFINITY, INFINITY); + union xnn_binary_uparams mul_params; + if (f32_vmul_config->init != NULL) { + f32_vmul_config->init(&mul_params, NULL, NULL, NULL); } return reshape_softmax_nc_floating_point( softmax_op, xnn_operator_type_softmax_nc_f32, @@ -577,5 +576,5 @@ enum xnn_status xnn_reshape_softmax_nc_f32( (xnn_compute_reciprocal_fn) compute_reciprocal_f32, &rmax_params, sizeof(rmax_params), &expminus_params, sizeof(expminus_params), - &minmax_params, sizeof(minmax_params)); + &mul_params, sizeof(mul_params)); } diff 
--git a/src/subgraph.c b/src/subgraph.c index 54c74529207..f7dfd9a6603 100644 --- a/src/subgraph.c +++ b/src/subgraph.c @@ -1400,3 +1400,51 @@ enum xnn_status xnn_delete_subgraph( } return xnn_status_success; } + +enum xnn_node_type xnn_binary_operator_to_node_type(enum xnn_binary_operator op) +{ + switch (op) { + case xnn_binary_add: + return xnn_node_type_add2; + case xnn_binary_divide: + return xnn_node_type_divide; + case xnn_binary_multiply: + return xnn_node_type_multiply2; + case xnn_binary_subtract: + return xnn_node_type_subtract; + case xnn_binary_copysign: + return xnn_node_type_copysign; + case xnn_binary_squared_difference: + return xnn_node_type_squared_difference; + case xnn_binary_minimum: + return xnn_node_type_minimum2; + case xnn_binary_maximum: + return xnn_node_type_maximum2; + default: + return xnn_node_type_invalid; + } +} + +enum xnn_binary_operator xnn_node_type_to_binary_operator(enum xnn_node_type op) +{ + switch (op) { + case xnn_node_type_add2: + return xnn_binary_add; + case xnn_node_type_divide: + return xnn_binary_divide; + case xnn_node_type_multiply2: + return xnn_binary_multiply; + case xnn_node_type_subtract: + return xnn_binary_subtract; + case xnn_node_type_copysign: + return xnn_binary_copysign; + case xnn_node_type_squared_difference: + return xnn_binary_squared_difference; + case xnn_node_type_minimum2: + return xnn_binary_minimum; + case xnn_node_type_maximum2: + return xnn_binary_maximum; + default: + return xnn_binary_invalid; + } +} diff --git a/src/subgraph/add2.c b/src/subgraph/add2.c deleted file mode 100644 index 847e9ac4922..00000000000 --- a/src/subgraph/add2.c +++ /dev/null @@ -1,378 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/requantization.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_add_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 2); - const uint32_t input1_id = node->inputs[0]; - assert(input1_id != XNN_INVALID_VALUE_ID); - assert(input1_id < num_values); - const uint32_t input2_id = node->inputs[1]; - assert(input2_id != XNN_INVALID_VALUE_ID); - assert(input2_id < num_values); - - assert(node->num_outputs == 1); - const uint32_t output_id = node->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - enum xnn_status status; - const struct xnn_value *input1_value = &values[input1_id]; - switch (input1_value->datatype) { - case xnn_datatype_fp16: - status = xnn_create_add_nd_f16( - node->activation.output_min, - node->activation.output_max, - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp32: - status = xnn_create_add_nd_f32( - node->activation.output_min, - node->activation.output_max, - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_qint8: - { - const float output_scale = values[output_id].quantization.scale; - const int32_t output_zero_point = values[output_id].quantization.zero_point; - const int8_t output_min = xnn_qs8_quantize(node->activation.output_min, output_scale, output_zero_point); - const int8_t output_max = xnn_qs8_quantize(node->activation.output_max, output_scale, output_zero_point); - status = 
xnn_create_add_nd_qs8( - (int8_t) values[input1_id].quantization.zero_point, - values[input1_id].quantization.scale, - (int8_t) values[input2_id].quantization.zero_point, - values[input2_id].quantization.scale, - (int8_t) output_zero_point, - output_scale, output_min, output_max, node->flags, - &opdata->operator_objects[0]); - break; - } - case xnn_datatype_quint8: - { - const float output_scale = values[output_id].quantization.scale; - const int32_t output_zero_point = values[output_id].quantization.zero_point; - const uint8_t output_min = xnn_qu8_quantize(node->activation.output_min, output_scale, output_zero_point); - const uint8_t output_max = xnn_qu8_quantize(node->activation.output_max, output_scale, output_zero_point); - status = xnn_create_add_nd_qu8( - (uint8_t) values[input1_id].quantization.zero_point, - values[input1_id].quantization.scale, - (uint8_t) values[input2_id].quantization.zero_point, - values[input2_id].quantization.scale, - (uint8_t) output_zero_point, - output_scale, output_min, output_max, node->flags, - &opdata->operator_objects[0]); - break; - } - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_add_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input1_id = opdata->inputs[0]; - assert(input1_id < num_values); - const uint32_t input2_id = opdata->inputs[1]; - assert(input2_id < num_values); - const uint32_t output_id = opdata->outputs[0]; - assert(output_id < num_values); - - opdata->shape1.num_dims = values[input1_id].shape.num_dims; - opdata->shape2.num_dims = values[input2_id].shape.num_dims; - if (values[output_id].layout == xnn_layout_type_nchw) { - assert(values[input1_id].layout == xnn_layout_type_nchw); - assert(values[input2_id].layout == xnn_layout_type_nchw); - opdata->shape1.dim[0] = values[input1_id].shape.dim[0]; - opdata->shape1.dim[1] = 
values[input1_id].shape.dim[values[input1_id].shape.num_dims - 1]; - if (values[input1_id].shape.num_dims > 2) { - memcpy(&opdata->shape1.dim[2], &values[input1_id].shape.dim[1], (values[input1_id].shape.num_dims - 2) * sizeof(size_t)); - } - opdata->shape2.dim[0] = values[input2_id].shape.dim[0]; - opdata->shape2.dim[1] = values[input2_id].shape.dim[values[input2_id].shape.num_dims - 1]; - if (values[input1_id].shape.num_dims > 2) { - memcpy(&opdata->shape2.dim[2], &values[input2_id].shape.dim[1], (values[input2_id].shape.num_dims - 2) * sizeof(size_t)); - } - } else { - assert(values[output_id].layout == xnn_layout_type_nhwc); - assert(values[input1_id].layout == xnn_layout_type_nhwc); - assert(values[input2_id].layout == xnn_layout_type_nhwc); - memcpy(opdata->shape1.dim, values[input1_id].shape.dim, values[input1_id].shape.num_dims * sizeof(size_t)); - memcpy(opdata->shape2.dim, values[input2_id].shape.dim, values[input2_id].shape.num_dims * sizeof(size_t)); - } - - // Handle scalars. Although the output shape is dimensionless, the reshape - // function must be passed a valid shape to prevent skipping the op. 
- if (opdata->shape1.num_dims == 0) { - opdata->shape1.num_dims = 1; - opdata->shape1.dim[0] = 1; - } - if (opdata->shape2.num_dims == 0) { - opdata->shape2.num_dims = 1; - opdata->shape2.dim[0] = 1; - } - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_add_nd_f32: - status = xnn_reshape_add_nd_f32( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - case xnn_operator_type_add_nd_f16: - status = xnn_reshape_add_nd_f16( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - case xnn_operator_type_add_nd_qs8: - status = xnn_reshape_add_nd_qs8( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - case xnn_operator_type_add_nd_qu8: - status = xnn_reshape_add_nd_qu8( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_binary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_add_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input1_id = opdata->inputs[0]; - assert(input1_id != XNN_INVALID_VALUE_ID); - assert(input1_id < num_values); - - const uint32_t input2_id = opdata->inputs[1]; - assert(input2_id != XNN_INVALID_VALUE_ID); - assert(input2_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - 
assert(output_id < num_values); - - const struct xnn_value* input1_value = values + input1_id; - const void* input1_data = input1_value->data; - assert(input1_data != NULL); - - const struct xnn_value* input2_value = values + input2_id; - const void* input2_data = input2_value->data; - assert(input2_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_add_nd_f32: - return xnn_setup_add_nd_f32( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - case xnn_operator_type_add_nd_f16: - return xnn_setup_add_nd_f16( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - case xnn_operator_type_add_nd_qs8: - return xnn_setup_add_nd_qs8( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - case xnn_operator_type_add_nd_qu8: - return xnn_setup_add_nd_qu8( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_add2( - xnn_subgraph_t subgraph, - float output_min, - float output_max, - uint32_t input1_id, - uint32_t input2_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_add2)) != xnn_status_success) { - return status; - } - - status = xnn_subgraph_check_output_min_max(xnn_node_type_add2, output_min, output_max); - if (status != xnn_status_success) { - return status; - } - - if ((status = xnn_subgraph_check_nth_input_node_id(xnn_node_type_add2, input1_id, subgraph->num_values, 1)) != - xnn_status_success) { - return status; - } - - const struct xnn_value* input1_value = &subgraph->values[input1_id]; - status = xnn_subgraph_check_nth_input_type_dense(xnn_node_type_add2, input1_id, input1_value, 1); - if (status != xnn_status_success) { - return status; - } - - switch 
(input1_value->datatype) { - case xnn_datatype_fp32: - case xnn_datatype_fp16: - case xnn_datatype_qint8: - case xnn_datatype_quint8: - break; - default: - xnn_log_error( - "failed to define %s operator with the first input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_add2), input1_id, - xnn_datatype_to_string(input1_value->datatype), input1_value->datatype); - return xnn_status_invalid_parameter; - } - - if ((status = xnn_subgraph_check_nth_input_node_id(xnn_node_type_add2, input2_id, subgraph->num_values, 2)) != - xnn_status_success) { - return status; - } - - const struct xnn_value* input2_value = &subgraph->values[input2_id]; - status = xnn_subgraph_check_nth_input_type_dense(xnn_node_type_add2, input2_id, input2_value, 2); - if (status != xnn_status_success) { - return status; - } - - switch (input2_value->datatype) { - case xnn_datatype_fp32: - case xnn_datatype_fp16: - case xnn_datatype_qint8: - case xnn_datatype_quint8: - break; - default: - xnn_log_error( - "failed to define %s operator with the second input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_add2), input2_id, - xnn_datatype_to_string(input2_value->datatype), input2_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_add2, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_add2, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - case xnn_datatype_fp16: - compute_type = xnn_compute_type_fp16; - break; - case xnn_datatype_qint8: - compute_type = 
xnn_compute_type_qs8; - break; - case xnn_datatype_quint8: - compute_type = xnn_compute_type_qu8; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_add2), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_datatype_matches_two_inputs( - xnn_node_type_add2, input1_id, input1_value, input2_id, input2_value, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_add2; - node->compute_type = compute_type; - node->activation.output_min = output_min; - node->activation.output_max = output_max; - node->num_inputs = 2; - node->inputs[0] = input1_id; - node->inputs[1] = input2_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_add_operator; - node->reshape = reshape_add_operator; - node->setup = setup_add_operator; - - if (output_min != -INFINITY && output_max != INFINITY) { - xnn_insert_clamp_node(subgraph, output_min, output_max, node); - } - return xnn_status_success; -} diff --git a/src/subgraph/copysign.c b/src/subgraph/binary.c similarity index 63% rename from src/subgraph/copysign.c rename to src/subgraph/binary.c index 2bcc0624093..7d5467442e8 100644 --- a/src/subgraph/copysign.c +++ b/src/subgraph/binary.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -20,7 +21,7 @@ #include "xnnpack/subgraph.h" #include "pthreadpool.h" -static enum xnn_status create_copysign_operator( +static enum xnn_status create_binary_operator( const struct xnn_node* node, const struct xnn_value* values, size_t num_values, @@ -28,26 +29,35 @@ static enum xnn_status create_copysign_operator( struct 
xnn_code_cache* code_cache, xnn_weights_cache_t weights_cache) { - assert(node->num_inputs == 2); - assert(node->num_outputs == 1); + const uint32_t input1_id = opdata->inputs[0]; + assert(input1_id < num_values); + const uint32_t input2_id = opdata->inputs[1]; + assert(input2_id < num_values); + const uint32_t output_id = opdata->outputs[0]; + assert(output_id < num_values); - enum xnn_status status; - const uint32_t input_id = opdata->inputs[0]; - assert(input_id < num_values); - const struct xnn_value *input_value = &values[input_id]; - switch (input_value->datatype) { - case xnn_datatype_fp32: - status = xnn_create_copysign_nd_f32( - node->flags, - &opdata->operator_objects[0]); - break; - default: - XNN_UNREACHABLE; - } - return status; + enum xnn_datatype datatype = values[output_id].datatype; + struct xnn_quantization_params a_quantization = { + .scale = values[input1_id].quantization.scale, + .zero_point = values[input1_id].quantization.zero_point, + }; + struct xnn_quantization_params b_quantization = { + .scale = values[input2_id].quantization.scale, + .zero_point = values[input2_id].quantization.zero_point, + }; + struct xnn_quantization_params output_quantization = { + .scale = values[output_id].quantization.scale, + .zero_point = values[output_id].quantization.zero_point, + }; + + return xnn_create_binary_elementwise_nd( + xnn_node_type_to_binary_operator(node->type), + datatype, &a_quantization, &b_quantization, &output_quantization, + node->flags, + &opdata->operator_objects[0]); } -static enum xnn_status reshape_copysign_operator( +static enum xnn_status reshape_binary_operator( struct xnn_operator_data* opdata, struct xnn_value* values, size_t num_values, @@ -94,27 +104,20 @@ static enum xnn_status reshape_copysign_operator( opdata->shape2.dim[0] = 1; } const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - switch (opdata->operator_objects[0]->type) { - case 
xnn_operator_type_copysign_nd_f32: - status = xnn_reshape_copysign_nd_f32( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - default: - XNN_UNREACHABLE; - } + enum xnn_status status = xnn_reshape_binary_elementwise_nd( + opdata->operator_objects[0], + opdata->shape1.num_dims, + opdata->shape1.dim, + opdata->shape2.num_dims, + opdata->shape2.dim, + threadpool); if (status != xnn_status_success) { return status; } return resize_binary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); } -static enum xnn_status setup_copysign_operator( +static enum xnn_status setup_binary_operator( const struct xnn_operator_data* opdata, const struct xnn_value* values, size_t num_values, @@ -144,102 +147,97 @@ static enum xnn_status setup_copysign_operator( void* output_data = output_value->data; assert(output_data != NULL); - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_copysign_nd_f32: - return xnn_setup_copysign_nd_f32( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - default: - XNN_UNREACHABLE; - } + return xnn_setup_binary_elementwise_nd( + opdata->operator_objects[0], + input1_data, input2_data, output_data); } -enum xnn_status xnn_define_copysign( +enum xnn_status xnn_define_binary( xnn_subgraph_t subgraph, + enum xnn_binary_operator type, + const struct xnn_binary_params* params, uint32_t input1_id, uint32_t input2_id, uint32_t output_id, uint32_t flags) { + enum xnn_node_type node_type = xnn_binary_operator_to_node_type(type); + enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_copysign)) != xnn_status_success) { + if ((status = xnn_subgraph_check_xnnpack_initialized(node_type)) != xnn_status_success) { return status; } - if ((status = xnn_subgraph_check_nth_input_node_id(xnn_node_type_copysign, input1_id, subgraph->num_values, 1)) != + if 
((status = xnn_subgraph_check_nth_input_node_id(node_type, input1_id, subgraph->num_values, 1)) != xnn_status_success) { return status; } const struct xnn_value* input1_value = &subgraph->values[input1_id]; - status = xnn_subgraph_check_nth_input_type_dense(xnn_node_type_copysign, input1_id, input1_value, 1); + status = xnn_subgraph_check_nth_input_type_dense(node_type, input1_id, input1_value, 1); if (status != xnn_status_success) { return status; } - switch (input1_value->datatype) { - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with the first input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_copysign), input1_id, - xnn_datatype_to_string(input1_value->datatype), input1_value->datatype); - return xnn_status_invalid_parameter; - } - - if ((status = xnn_subgraph_check_nth_input_node_id( - xnn_node_type_copysign, input2_id, subgraph->num_values, 2)) != xnn_status_success) { + if ((status = xnn_subgraph_check_nth_input_node_id(node_type, input2_id, subgraph->num_values, 2)) != + xnn_status_success) { return status; } const struct xnn_value* input2_value = &subgraph->values[input2_id]; - status = xnn_subgraph_check_nth_input_type_dense(xnn_node_type_copysign, input2_id, input2_value, 2); + status = xnn_subgraph_check_nth_input_type_dense(node_type, input2_id, input2_value, 2); if (status != xnn_status_success) { return status; } - switch (input2_value->datatype) { - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with the second input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_copysign), input2_id, - xnn_datatype_to_string(input2_value->datatype), input2_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_copysign, output_id, subgraph->num_values); + status = xnn_subgraph_check_output_node_id(node_type, 
output_id, subgraph->num_values); if (status != xnn_status_success) { return status; } const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_copysign, output_id, output_value); + status = xnn_subgraph_check_output_type_dense(node_type, output_id, output_value); if (status != xnn_status_success) { return status; } enum xnn_compute_type compute_type = xnn_compute_type_invalid; switch (output_value->datatype) { + case xnn_datatype_int32: + compute_type = xnn_compute_type_s32; + break; case xnn_datatype_fp32: compute_type = xnn_compute_type_fp32; break; + case xnn_datatype_fp16: + compute_type = xnn_compute_type_fp16; + break; + case xnn_datatype_qint8: + compute_type = xnn_compute_type_qs8; + break; + case xnn_datatype_quint8: + compute_type = xnn_compute_type_qu8; + break; default: xnn_log_error( "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_copysign), output_id, + xnn_node_type_to_string(node_type), output_id, xnn_datatype_to_string(output_value->datatype), output_value->datatype); return xnn_status_invalid_parameter; } + status = xnn_subgraph_check_datatype_matches_two_inputs( + node_type, input1_id, input1_value, input2_id, input2_value, output_id, output_value); + if (status != xnn_status_success) { + return status; + } + struct xnn_node* node = xnn_subgraph_new_node(subgraph); if (node == NULL) { return xnn_status_out_of_memory; } - node->type = xnn_node_type_copysign; + node->type = node_type; node->compute_type = compute_type; node->num_inputs = 2; node->inputs[0] = input1_id; @@ -248,9 +246,15 @@ enum xnn_status xnn_define_copysign( node->outputs[0] = output_id; node->flags = flags; - node->create = create_copysign_operator; - node->reshape = reshape_copysign_operator; - node->setup = setup_copysign_operator; + node->create = create_binary_operator; + node->reshape = reshape_binary_operator; + 
node->setup = setup_binary_operator; + + if (params) { + if (params->output_min != -INFINITY || params->output_max != INFINITY) { + xnn_insert_clamp_node(subgraph, params->output_min, params->output_max, node); + } + } return xnn_status_success; } diff --git a/src/subgraph/deprecated.c b/src/subgraph/deprecated.c new file mode 100644 index 00000000000..a9bdc0d942c --- /dev/null +++ b/src/subgraph/deprecated.c @@ -0,0 +1,78 @@ +#include +#include + +#include "xnnpack.h" + +enum xnn_status xnn_define_add2(xnn_subgraph_t subgraph, float output_min, + float output_max, uint32_t input1_id, + uint32_t input2_id, uint32_t output_id, + uint32_t flags) { + struct xnn_binary_params params; + params.output_min = output_min; + params.output_max = output_max; + return xnn_define_binary(subgraph, xnn_binary_add, ¶ms, input1_id, + input2_id, output_id, flags); +} + +enum xnn_status xnn_define_subtract(xnn_subgraph_t subgraph, float output_min, + float output_max, uint32_t input1_id, + uint32_t input2_id, uint32_t output_id, + uint32_t flags) { + struct xnn_binary_params params; + params.output_min = output_min; + params.output_max = output_max; + return xnn_define_binary(subgraph, xnn_binary_subtract, ¶ms, input1_id, + input2_id, output_id, flags); +} + +enum xnn_status xnn_define_multiply2(xnn_subgraph_t subgraph, float output_min, + float output_max, uint32_t input1_id, + uint32_t input2_id, uint32_t output_id, + uint32_t flags) { + struct xnn_binary_params params; + params.output_min = output_min; + params.output_max = output_max; + return xnn_define_binary(subgraph, xnn_binary_multiply, ¶ms, input1_id, + input2_id, output_id, flags); +} + +enum xnn_status xnn_define_divide(xnn_subgraph_t subgraph, float output_min, + float output_max, uint32_t input1_id, + uint32_t input2_id, uint32_t output_id, + uint32_t flags) { + struct xnn_binary_params params; + params.output_min = output_min; + params.output_max = output_max; + return xnn_define_binary(subgraph, xnn_binary_divide, ¶ms, 
input1_id, + input2_id, output_id, flags); +} + +enum xnn_status xnn_define_maximum2(xnn_subgraph_t subgraph, uint32_t input1_id, + uint32_t input2_id, uint32_t output_id, + uint32_t flags) { + return xnn_define_binary(subgraph, xnn_binary_maximum, NULL, input1_id, + input2_id, output_id, flags); +} + +enum xnn_status xnn_define_minimum2(xnn_subgraph_t subgraph, uint32_t input1_id, + uint32_t input2_id, uint32_t output_id, + uint32_t flags) { + return xnn_define_binary(subgraph, xnn_binary_minimum, NULL, input1_id, + input2_id, output_id, flags); +} + +enum xnn_status xnn_define_squared_difference(xnn_subgraph_t subgraph, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags) { + return xnn_define_binary(subgraph, xnn_binary_squared_difference, NULL, + input1_id, input2_id, output_id, flags); +} + +enum xnn_status xnn_define_copysign(xnn_subgraph_t subgraph, uint32_t input1_id, + uint32_t input2_id, uint32_t output_id, + uint32_t flags) { + return xnn_define_binary(subgraph, xnn_binary_copysign, NULL, input1_id, + input2_id, output_id, flags); +} diff --git a/src/subgraph/divide.c b/src/subgraph/divide.c deleted file mode 100644 index 1c6ffbc9736..00000000000 --- a/src/subgraph/divide.c +++ /dev/null @@ -1,296 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include -#include - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_divide_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 2); - assert(node->num_outputs == 1); - - enum xnn_status status; - const uint32_t input1_id = opdata->inputs[0]; - assert(input1_id < num_values); - const struct xnn_value *input1_value = &values[input1_id]; - switch (input1_value->datatype) { - case xnn_datatype_fp16: - status = xnn_create_divide_nd_f16( - node->activation.output_min, - node->activation.output_max, - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp32: - status = xnn_create_divide_nd_f32( - node->activation.output_min, - node->activation.output_max, - node->flags, - &opdata->operator_objects[0]); - break; - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_divide_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input1_id = opdata->inputs[0]; - assert(input1_id < num_values); - const uint32_t input2_id = opdata->inputs[1]; - assert(input2_id < num_values); - const uint32_t output_id = opdata->outputs[0]; - assert(output_id < num_values); - - opdata->shape1.num_dims = values[input1_id].shape.num_dims; - opdata->shape2.num_dims = values[input2_id].shape.num_dims; - if (values[output_id].layout == xnn_layout_type_nchw) { - assert(values[input1_id].layout == xnn_layout_type_nchw); - 
assert(values[input2_id].layout == xnn_layout_type_nchw); - opdata->shape1.dim[0] = values[input1_id].shape.dim[0]; - opdata->shape1.dim[1] = values[input1_id].shape.dim[values[input1_id].shape.num_dims - 1]; - if (values[input1_id].shape.num_dims > 2) { - memcpy(&opdata->shape1.dim[2], &values[input1_id].shape.dim[1], (values[input1_id].shape.num_dims - 2) * sizeof(size_t)); - } - opdata->shape2.dim[0] = values[input2_id].shape.dim[0]; - opdata->shape2.dim[1] = values[input2_id].shape.dim[values[input2_id].shape.num_dims - 1]; - if (values[input1_id].shape.num_dims > 2) { - memcpy(&opdata->shape2.dim[2], &values[input2_id].shape.dim[1], (values[input2_id].shape.num_dims - 2) * sizeof(size_t)); - } - } else { - assert(values[output_id].layout == xnn_layout_type_nhwc); - assert(values[input1_id].layout == xnn_layout_type_nhwc); - assert(values[input2_id].layout == xnn_layout_type_nhwc); - memcpy(opdata->shape1.dim, values[input1_id].shape.dim, values[input1_id].shape.num_dims * sizeof(size_t)); - memcpy(opdata->shape2.dim, values[input2_id].shape.dim, values[input2_id].shape.num_dims * sizeof(size_t)); - } - - // Handle scalars. Although the output shape is dimensionless, the reshape - // function must be passed a valid shape to prevent skipping the op. 
- if (opdata->shape1.num_dims == 0) { - opdata->shape1.num_dims = 1; - opdata->shape1.dim[0] = 1; - } - if (opdata->shape2.num_dims == 0) { - opdata->shape2.num_dims = 1; - opdata->shape2.dim[0] = 1; - } - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_divide_nd_f16: - status = xnn_reshape_divide_nd_f16( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - case xnn_operator_type_divide_nd_f32: - status = xnn_reshape_divide_nd_f32( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_binary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_divide_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input1_id = opdata->inputs[0]; - assert(input1_id != XNN_INVALID_VALUE_ID); - assert(input1_id < num_values); - - const uint32_t input2_id = opdata->inputs[1]; - assert(input2_id != XNN_INVALID_VALUE_ID); - assert(input2_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input1_value = values + input1_id; - const void* input1_data = input1_value->data; - assert(input1_data != NULL); - - const struct xnn_value* input2_value = values + input2_id; - const void* input2_data = input2_value->data; - assert(input2_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - 
assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_divide_nd_f16: - return xnn_setup_divide_nd_f16( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - case xnn_operator_type_divide_nd_f32: - return xnn_setup_divide_nd_f32( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_divide( - xnn_subgraph_t subgraph, - float output_min, - float output_max, - uint32_t input1_id, - uint32_t input2_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_divide)) != xnn_status_success) { - return status; - } - - status = xnn_subgraph_check_output_min_max(xnn_node_type_divide, output_min, output_max); - if (status != xnn_status_success) { - return status; - } - - if ((status = xnn_subgraph_check_nth_input_node_id(xnn_node_type_divide, input1_id, subgraph->num_values, 1)) != - xnn_status_success) { - return status; - } - - const struct xnn_value* input1_value = &subgraph->values[input1_id]; - status = xnn_subgraph_check_nth_input_type_dense(xnn_node_type_divide, input1_id, input1_value, 1); - if (status != xnn_status_success) { - return status; - } - - switch (input1_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with the first input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_divide), input1_id, - xnn_datatype_to_string(input1_value->datatype), input1_value->datatype); - return xnn_status_invalid_parameter; - } - - if ((status = xnn_subgraph_check_nth_input_node_id( - xnn_node_type_divide, input2_id, subgraph->num_values, 2)) != xnn_status_success) { - return status; - } - - const struct xnn_value* input2_value = &subgraph->values[input2_id]; - status = 
xnn_subgraph_check_nth_input_type_dense(xnn_node_type_divide, input2_id, input2_value, 2); - if (status != xnn_status_success) { - return status; - } - - switch (input2_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with the second input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_divide), input2_id, - xnn_datatype_to_string(input2_value->datatype), input2_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_divide, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_divide, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp16: - compute_type = xnn_compute_type_fp16; - break; - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_divide), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_divide; - node->compute_type = compute_type; - node->activation.output_min = output_min; - node->activation.output_max = output_max; - node->num_inputs = 2; - node->inputs[0] = input1_id; - node->inputs[1] = input2_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = 
create_divide_operator; - node->reshape = reshape_divide_operator; - node->setup = setup_divide_operator; - - if (output_min != -INFINITY && output_max != INFINITY) { - xnn_insert_clamp_node(subgraph, output_min, output_max, node); - } - return xnn_status_success; -} diff --git a/src/subgraph/maximum2.c b/src/subgraph/maximum2.c deleted file mode 100644 index 293eff33e50..00000000000 --- a/src/subgraph/maximum2.c +++ /dev/null @@ -1,281 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_maximum_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 2); - assert(node->num_outputs == 1); - - enum xnn_status status; - const uint32_t input1_id = node->inputs[0]; - assert(input1_id < num_values); - const struct xnn_value *input1_value = &values[input1_id]; - switch (input1_value->datatype) { - case xnn_datatype_fp16: - status = xnn_create_maximum_nd_f16( - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp32: - status = xnn_create_maximum_nd_f32( - node->flags, - &opdata->operator_objects[0]); - break; - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_maximum_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input1_id = 
opdata->inputs[0]; - assert(input1_id < num_values); - const uint32_t input2_id = opdata->inputs[1]; - assert(input2_id < num_values); - const uint32_t output_id = opdata->outputs[0]; - assert(output_id < num_values); - - opdata->shape1.num_dims = values[input1_id].shape.num_dims; - opdata->shape2.num_dims = values[input2_id].shape.num_dims; - if (values[output_id].layout == xnn_layout_type_nchw) { - assert(values[input1_id].layout == xnn_layout_type_nchw); - assert(values[input2_id].layout == xnn_layout_type_nchw); - opdata->shape1.dim[0] = values[input1_id].shape.dim[0]; - opdata->shape1.dim[1] = values[input1_id].shape.dim[values[input1_id].shape.num_dims - 1]; - if (values[input1_id].shape.num_dims > 2) { - memcpy(&opdata->shape1.dim[2], &values[input1_id].shape.dim[1], (values[input1_id].shape.num_dims - 2) * sizeof(size_t)); - } - opdata->shape2.dim[0] = values[input2_id].shape.dim[0]; - opdata->shape2.dim[1] = values[input2_id].shape.dim[values[input2_id].shape.num_dims - 1]; - if (values[input1_id].shape.num_dims > 2) { - memcpy(&opdata->shape2.dim[2], &values[input2_id].shape.dim[1], (values[input2_id].shape.num_dims - 2) * sizeof(size_t)); - } - } else { - assert(values[output_id].layout == xnn_layout_type_nhwc); - assert(values[input1_id].layout == xnn_layout_type_nhwc); - assert(values[input2_id].layout == xnn_layout_type_nhwc); - memcpy(opdata->shape1.dim, values[input1_id].shape.dim, values[input1_id].shape.num_dims * sizeof(size_t)); - memcpy(opdata->shape2.dim, values[input2_id].shape.dim, values[input2_id].shape.num_dims * sizeof(size_t)); - } - - // Handle scalars. Although the output shape is dimensionless, the reshape - // function must be passed a valid shape to prevent skipping the op. 
- if (opdata->shape1.num_dims == 0) { - opdata->shape1.num_dims = 1; - opdata->shape1.dim[0] = 1; - } - if (opdata->shape2.num_dims == 0) { - opdata->shape2.num_dims = 1; - opdata->shape2.dim[0] = 1; - } - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_maximum_nd_f16: - status = xnn_reshape_maximum_nd_f16( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - case xnn_operator_type_maximum_nd_f32: - status = xnn_reshape_maximum_nd_f32( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_binary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_maximum_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input1_id = opdata->inputs[0]; - assert(input1_id != XNN_INVALID_VALUE_ID); - assert(input1_id < num_values); - - const uint32_t input2_id = opdata->inputs[1]; - assert(input2_id != XNN_INVALID_VALUE_ID); - assert(input2_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input1_value = values + input1_id; - const void* input1_data = input1_value->data; - assert(input1_data != NULL); - - const struct xnn_value* input2_value = values + input2_id; - const void* input2_data = input2_value->data; - assert(input2_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - 
assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_maximum_nd_f16: - return xnn_setup_maximum_nd_f16( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - case xnn_operator_type_maximum_nd_f32: - return xnn_setup_maximum_nd_f32( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_maximum2( - xnn_subgraph_t subgraph, - uint32_t input1_id, - uint32_t input2_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_maximum2)) != xnn_status_success) { - return status; - } - - if ((status = xnn_subgraph_check_nth_input_node_id( - xnn_node_type_maximum2, input1_id, subgraph->num_values, 1)) != xnn_status_success) { - return status; - } - - const struct xnn_value* input1_value = &subgraph->values[input1_id]; - status = xnn_subgraph_check_nth_input_type_dense(xnn_node_type_maximum2, input1_id, input1_value, 1); - if (status != xnn_status_success) { - return status; - } - - switch (input1_value->datatype) { - case xnn_datatype_fp16: - break; - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with the first input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_maximum2), input1_id, - xnn_datatype_to_string(input1_value->datatype), input1_value->datatype); - return xnn_status_invalid_parameter; - } - - if ((status = xnn_subgraph_check_nth_input_node_id( - xnn_node_type_maximum2, input2_id, subgraph->num_values, 2)) != xnn_status_success) { - return status; - } - - const struct xnn_value* input2_value = &subgraph->values[input2_id]; - status = xnn_subgraph_check_nth_input_type_dense(xnn_node_type_maximum2, input2_id, input2_value, 2); - if (status != xnn_status_success) { - return status; - } - - switch (input2_value->datatype) { - case 
xnn_datatype_fp16: - break; - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with the second input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_maximum2), input2_id, - xnn_datatype_to_string(input2_value->datatype), input2_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_maximum2, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_maximum2, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp16: - compute_type = xnn_compute_type_fp16; - break; - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_maximum2), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_maximum2; - node->compute_type = compute_type; - node->num_inputs = 2; - node->inputs[0] = input1_id; - node->inputs[1] = input2_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_maximum_operator; - node->reshape = reshape_maximum_operator; - node->setup = setup_maximum_operator; - - return xnn_status_success; -} diff --git a/src/subgraph/minimum2.c b/src/subgraph/minimum2.c deleted file mode 100644 index e05167ba722..00000000000 --- 
a/src/subgraph/minimum2.c +++ /dev/null @@ -1,281 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_minimum_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 2); - assert(node->num_outputs == 1); - - enum xnn_status status; - const uint32_t input1_id = node->inputs[0]; - assert(input1_id < num_values); - const struct xnn_value *input1_value = &values[input1_id]; - switch (input1_value->datatype) { - case xnn_datatype_fp16: - status = xnn_create_minimum_nd_f16( - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp32: - status = xnn_create_minimum_nd_f32( - node->flags, - &opdata->operator_objects[0]); - break; - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_minimum_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input1_id = opdata->inputs[0]; - assert(input1_id < num_values); - const uint32_t input2_id = opdata->inputs[1]; - assert(input2_id < num_values); - const uint32_t output_id = opdata->outputs[0]; - assert(output_id < num_values); - - opdata->shape1.num_dims = values[input1_id].shape.num_dims; - opdata->shape2.num_dims = values[input2_id].shape.num_dims; - if (values[output_id].layout == 
xnn_layout_type_nchw) { - assert(values[input1_id].layout == xnn_layout_type_nchw); - assert(values[input2_id].layout == xnn_layout_type_nchw); - opdata->shape1.dim[0] = values[input1_id].shape.dim[0]; - opdata->shape1.dim[1] = values[input1_id].shape.dim[values[input1_id].shape.num_dims - 1]; - if (values[input1_id].shape.num_dims > 2) { - memcpy(&opdata->shape1.dim[2], &values[input1_id].shape.dim[1], (values[input1_id].shape.num_dims - 2) * sizeof(size_t)); - } - opdata->shape2.dim[0] = values[input2_id].shape.dim[0]; - opdata->shape2.dim[1] = values[input2_id].shape.dim[values[input2_id].shape.num_dims - 1]; - if (values[input1_id].shape.num_dims > 2) { - memcpy(&opdata->shape2.dim[2], &values[input2_id].shape.dim[1], (values[input2_id].shape.num_dims - 2) * sizeof(size_t)); - } - } else { - assert(values[output_id].layout == xnn_layout_type_nhwc); - assert(values[input1_id].layout == xnn_layout_type_nhwc); - assert(values[input2_id].layout == xnn_layout_type_nhwc); - memcpy(opdata->shape1.dim, values[input1_id].shape.dim, values[input1_id].shape.num_dims * sizeof(size_t)); - memcpy(opdata->shape2.dim, values[input2_id].shape.dim, values[input2_id].shape.num_dims * sizeof(size_t)); - } - - // Handle scalars. Although the output shape is dimensionless, the reshape - // function must be passed a valid shape to prevent skipping the op. 
- if (opdata->shape1.num_dims == 0) { - opdata->shape1.num_dims = 1; - opdata->shape1.dim[0] = 1; - } - if (opdata->shape2.num_dims == 0) { - opdata->shape2.num_dims = 1; - opdata->shape2.dim[0] = 1; - } - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_minimum_nd_f16: - status = xnn_reshape_minimum_nd_f16( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - case xnn_operator_type_minimum_nd_f32: - status = xnn_reshape_minimum_nd_f32( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_binary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_minimum_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input1_id = opdata->inputs[0]; - assert(input1_id != XNN_INVALID_VALUE_ID); - assert(input1_id < num_values); - - const uint32_t input2_id = opdata->inputs[1]; - assert(input2_id != XNN_INVALID_VALUE_ID); - assert(input2_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input1_value = values + input1_id; - const void* input1_data = input1_value->data; - assert(input1_data != NULL); - - const struct xnn_value* input2_value = values + input2_id; - const void* input2_data = input2_value->data; - assert(input2_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - 
assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_minimum_nd_f16: - return xnn_setup_minimum_nd_f16( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - case xnn_operator_type_minimum_nd_f32: - return xnn_setup_minimum_nd_f32( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_minimum2( - xnn_subgraph_t subgraph, - uint32_t input1_id, - uint32_t input2_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_minimum2)) != xnn_status_success) { - return status; - } - - if ((status = xnn_subgraph_check_nth_input_node_id( - xnn_node_type_minimum2, input1_id, subgraph->num_values, 1)) != xnn_status_success) { - return status; - } - - const struct xnn_value* input1_value = &subgraph->values[input1_id]; - status = xnn_subgraph_check_nth_input_type_dense(xnn_node_type_minimum2, input1_id, input1_value, 1); - if (status != xnn_status_success) { - return status; - } - - switch (input1_value->datatype) { - case xnn_datatype_fp16: - break; - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with the first input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_minimum2), input1_id, - xnn_datatype_to_string(input1_value->datatype), input1_value->datatype); - return xnn_status_invalid_parameter; - } - - if ((status = xnn_subgraph_check_nth_input_node_id( - xnn_node_type_minimum2, input2_id, subgraph->num_values, 2)) != xnn_status_success) { - return status; - } - - const struct xnn_value* input2_value = &subgraph->values[input2_id]; - status = xnn_subgraph_check_nth_input_type_dense(xnn_node_type_minimum2, input2_id, input2_value, 2); - if (status != xnn_status_success) { - return status; - } - - switch (input2_value->datatype) { - case 
xnn_datatype_fp16: - break; - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with the second input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_minimum2), input2_id, - xnn_datatype_to_string(input2_value->datatype), input2_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_minimum2, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_minimum2, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp16: - compute_type = xnn_compute_type_fp16; - break; - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_minimum2), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_minimum2; - node->compute_type = compute_type; - node->num_inputs = 2; - node->inputs[0] = input1_id; - node->inputs[1] = input2_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_minimum_operator; - node->reshape = reshape_minimum_operator; - node->setup = setup_minimum_operator; - - return xnn_status_success; -} diff --git a/src/subgraph/multiply2.c b/src/subgraph/multiply2.c deleted file mode 100644 index 7e3d80ca74c..00000000000 --- 
a/src/subgraph/multiply2.c +++ /dev/null @@ -1,446 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include - - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/requantization.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_multiply_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 2); - const uint32_t input1_id = node->inputs[0]; - assert(input1_id != XNN_INVALID_VALUE_ID); - assert(input1_id < num_values); - const uint32_t input2_id = node->inputs[1]; - assert(input2_id != XNN_INVALID_VALUE_ID); - assert(input2_id < num_values); - - assert(node->num_outputs == 1); - const uint32_t output_id = node->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - enum xnn_status status; - const struct xnn_value *input1_value = &values[input1_id]; - switch (input1_value->datatype) { - case xnn_datatype_fp16: - status = xnn_create_multiply_nd_f16( - node->activation.output_min, - node->activation.output_max, - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp32: - status = xnn_create_multiply_nd_f32( - node->activation.output_min, - node->activation.output_max, - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_int32: - status = xnn_create_multiply_nd_s32(node->flags, &opdata->operator_objects[0]); - break; - case xnn_datatype_qint8: - { - const float output_scale = 
values[output_id].quantization.scale; - const int32_t output_zero_point = values[output_id].quantization.zero_point; - const int8_t output_min = xnn_qs8_quantize(node->activation.output_min, output_scale, output_zero_point); - const int8_t output_max = xnn_qs8_quantize(node->activation.output_max, output_scale, output_zero_point); - status = xnn_create_multiply_nd_qs8( - (int8_t) values[input1_id].quantization.zero_point, - values[input1_id].quantization.scale, - (int8_t) values[input2_id].quantization.zero_point, - values[input2_id].quantization.scale, - (int8_t) output_zero_point, - output_scale, output_min, output_max, node->flags, - &opdata->operator_objects[0]); - break; - } - case xnn_datatype_quint8: - { - const float output_scale = values[output_id].quantization.scale; - const int32_t output_zero_point = values[output_id].quantization.zero_point; - const uint8_t output_min = xnn_qu8_quantize(node->activation.output_min, output_scale, output_zero_point); - const uint8_t output_max = xnn_qu8_quantize(node->activation.output_max, output_scale, output_zero_point); - status = xnn_create_multiply_nd_qu8( - (uint8_t) values[input1_id].quantization.zero_point, - values[input1_id].quantization.scale, - (uint8_t) values[input2_id].quantization.zero_point, - values[input2_id].quantization.scale, - (uint8_t) output_zero_point, - output_scale, output_min, output_max, node->flags, - &opdata->operator_objects[0]); - break; - } - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_multiply_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input1_id = opdata->inputs[0]; - assert(input1_id < num_values); - const uint32_t input2_id = opdata->inputs[1]; - assert(input2_id < num_values); - const uint32_t output_id = opdata->outputs[0]; - assert(output_id < num_values); - - opdata->shape1.num_dims = values[input1_id].shape.num_dims; - 
opdata->shape2.num_dims = values[input2_id].shape.num_dims; - if (values[output_id].layout == xnn_layout_type_nchw) { - assert(values[input1_id].layout == xnn_layout_type_nchw); - assert(values[input2_id].layout == xnn_layout_type_nchw); - opdata->shape1.dim[0] = values[input1_id].shape.dim[0]; - opdata->shape1.dim[1] = values[input1_id].shape.dim[values[input1_id].shape.num_dims - 1]; - if (values[input1_id].shape.num_dims > 2) { - memcpy(&opdata->shape1.dim[2], &values[input1_id].shape.dim[1], (values[input1_id].shape.num_dims - 2) * sizeof(size_t)); - } - opdata->shape2.dim[0] = values[input2_id].shape.dim[0]; - opdata->shape2.dim[1] = values[input2_id].shape.dim[values[input2_id].shape.num_dims - 1]; - if (values[input1_id].shape.num_dims > 2) { - memcpy(&opdata->shape2.dim[2], &values[input2_id].shape.dim[1], (values[input2_id].shape.num_dims - 2) * sizeof(size_t)); - } - } else { - assert(values[output_id].layout == xnn_layout_type_nhwc); - assert(values[input1_id].layout == xnn_layout_type_nhwc); - assert(values[input2_id].layout == xnn_layout_type_nhwc); - memcpy(opdata->shape1.dim, values[input1_id].shape.dim, values[input1_id].shape.num_dims * sizeof(size_t)); - memcpy(opdata->shape2.dim, values[input2_id].shape.dim, values[input2_id].shape.num_dims * sizeof(size_t)); - } - - // Handle scalars. Although the output shape is dimensionless, the reshape - // function must be passed a valid shape to prevent skipping the op. 
- if (opdata->shape1.num_dims == 0) { - opdata->shape1.num_dims = 1; - opdata->shape1.dim[0] = 1; - } - if (opdata->shape2.num_dims == 0) { - opdata->shape2.num_dims = 1; - opdata->shape2.dim[0] = 1; - } - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_multiply_nd_f16: - status = xnn_reshape_multiply_nd_f16( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - case xnn_operator_type_multiply_nd_f32: - status = xnn_reshape_multiply_nd_f32( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - case xnn_operator_type_multiply_nd_s32: - status = xnn_reshape_multiply_nd_s32( - opdata->operator_objects[0], opdata->shape1.num_dims, opdata->shape1.dim, opdata->shape2.num_dims, - opdata->shape2.dim, threadpool); - break; - case xnn_operator_type_multiply_nd_qs8: - status = xnn_reshape_multiply_nd_qs8( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - case xnn_operator_type_multiply_nd_qu8: - status = xnn_reshape_multiply_nd_qu8( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_binary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_multiply_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input1_id = opdata->inputs[0]; - assert(input1_id != 
XNN_INVALID_VALUE_ID); - assert(input1_id < num_values); - - const uint32_t input2_id = opdata->inputs[1]; - assert(input2_id != XNN_INVALID_VALUE_ID); - assert(input2_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input1_value = values + input1_id; - const void* input1_data = input1_value->data; - assert(input1_data != NULL); - - const struct xnn_value* input2_value = values + input2_id; - const void* input2_data = input2_value->data; - assert(input2_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_multiply_nd_f16: - return xnn_setup_multiply_nd_f16( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - break; - case xnn_operator_type_multiply_nd_f32: - return xnn_setup_multiply_nd_f32( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - break; - case xnn_operator_type_multiply_nd_s32: - return xnn_setup_multiply_nd_s32(opdata->operator_objects[0], input1_data, input2_data, output_data); - break; - case xnn_operator_type_multiply_nd_qs8: - return xnn_setup_multiply_nd_qs8( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - break; - case xnn_operator_type_multiply_nd_qu8: - return xnn_setup_multiply_nd_qu8( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - break; - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status define_multiply2( - xnn_subgraph_t subgraph, - float output_min, - float output_max, - uint32_t input1_id, - uint32_t input2_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_multiply2)) != xnn_status_success) { - return status; - } - - if ((status = 
xnn_subgraph_check_nth_input_node_id( - xnn_node_type_multiply2, input1_id, subgraph->num_values, 1)) != xnn_status_success) { - return status; - } - - const struct xnn_value* input1_value = &subgraph->values[input1_id]; - status = xnn_subgraph_check_nth_input_type_dense(xnn_node_type_multiply2, input1_id, input1_value, 1); - if (status != xnn_status_success) { - return status; - } - - if ((status = xnn_subgraph_check_nth_input_node_id( - xnn_node_type_multiply2, input2_id, subgraph->num_values, 2)) != xnn_status_success) { - return status; - } - - const struct xnn_value* input2_value = &subgraph->values[input2_id]; - status = xnn_subgraph_check_nth_input_type_dense(xnn_node_type_multiply2, input2_id, input2_value, 2); - if (status != xnn_status_success) { - return status; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_multiply2, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_multiply2, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp16: - compute_type = xnn_compute_type_fp16; - break; - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - case xnn_datatype_int32: - compute_type = xnn_compute_type_s32; - break; - case xnn_datatype_qint8: - compute_type = xnn_compute_type_qs8; - break; - case xnn_datatype_quint8: - compute_type = xnn_compute_type_qu8; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_multiply2), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - - status = 
xnn_subgraph_check_datatype_matches_two_inputs( - xnn_node_type_multiply2, input1_id, input1_value, input2_id, input2_value, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_multiply2; - node->compute_type = compute_type; - node->activation.output_min = output_min; - node->activation.output_max = output_max; - node->num_inputs = 2; - node->inputs[0] = input1_id; - node->inputs[1] = input2_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_multiply_operator; - node->reshape = reshape_multiply_operator; - node->setup = setup_multiply_operator; - - if (output_min != -INFINITY && output_max != INFINITY) { - xnn_insert_clamp_node(subgraph, output_min, output_max, node); - } - return xnn_status_success; -} - -enum xnn_status xnn_define_multiply2( - xnn_subgraph_t subgraph, - float output_min, - float output_max, - uint32_t input1_id, - uint32_t input2_id, - uint32_t output_id, - uint32_t flags) { - enum xnn_status status = xnn_subgraph_check_output_min_max(xnn_node_type_multiply2, output_min, output_max); - if (status != xnn_status_success) { - return status; - } - const struct xnn_value* input1_value = &subgraph->values[input1_id]; - switch (input1_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - case xnn_datatype_qint8: - case xnn_datatype_quint8: - break; - default: - xnn_log_error( - "failed to define %s operator with the first input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_multiply2), input1_id, - xnn_datatype_to_string(input1_value->datatype), input1_value->datatype); - return xnn_status_invalid_parameter; - } - const struct xnn_value* input2_value = &subgraph->values[input2_id]; - switch (input2_value->datatype) { - case xnn_datatype_fp16: - 
case xnn_datatype_fp32: - case xnn_datatype_qint8: - case xnn_datatype_quint8: - break; - default: - xnn_log_error( - "failed to define %s operator with the second input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_multiply2), input2_id, - xnn_datatype_to_string(input2_value->datatype), input2_value->datatype); - return xnn_status_invalid_parameter; - } - return define_multiply2(subgraph, output_min, output_max, input1_id, input2_id, output_id, flags); -} - -enum xnn_status xnn_define_multiply2_v2( - xnn_subgraph_t subgraph, - uint32_t input1_id, - uint32_t input2_id, - uint32_t output_id, - uint32_t flags) { - const struct xnn_value* input1_value = &subgraph->values[input1_id]; - switch (input1_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - case xnn_datatype_int32: - case xnn_datatype_qint8: - case xnn_datatype_quint8: - break; - default: - xnn_log_error( - "failed to define %s operator with the first input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_multiply2), input1_id, - xnn_datatype_to_string(input1_value->datatype), input1_value->datatype); - return xnn_status_invalid_parameter; - } - const struct xnn_value* input2_value = &subgraph->values[input2_id]; - switch (input2_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - case xnn_datatype_int32: - case xnn_datatype_qint8: - case xnn_datatype_quint8: - break; - default: - xnn_log_error( - "failed to define %s operator with the second input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_multiply2), input2_id, - xnn_datatype_to_string(input2_value->datatype), input2_value->datatype); - return xnn_status_invalid_parameter; - } - return define_multiply2(subgraph, -INFINITY, INFINITY, input1_id, input2_id, output_id, flags); -} diff --git a/src/subgraph/squared-difference.c b/src/subgraph/squared-difference.c deleted file 
mode 100644 index 80c0978890d..00000000000 --- a/src/subgraph/squared-difference.c +++ /dev/null @@ -1,280 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_squared_difference_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 2); - assert(node->num_outputs == 1); - - enum xnn_status status; - const uint32_t input_id = node->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - const struct xnn_value *input_value = &values[input_id]; - switch (input_value->datatype) { - case xnn_datatype_fp16: - status = xnn_create_squared_difference_nd_f16( - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp32: - status = xnn_create_squared_difference_nd_f32( - node->flags, - &opdata->operator_objects[0]); - break; - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_squared_difference_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input1_id = opdata->inputs[0]; - assert(input1_id < num_values); - const uint32_t input2_id = opdata->inputs[1]; - assert(input2_id < num_values); - const uint32_t output_id = opdata->outputs[0]; - assert(output_id < num_values); - - opdata->shape1.num_dims = 
values[input1_id].shape.num_dims; - opdata->shape2.num_dims = values[input2_id].shape.num_dims; - if (values[output_id].layout == xnn_layout_type_nchw) { - assert(values[input1_id].layout == xnn_layout_type_nchw); - assert(values[input2_id].layout == xnn_layout_type_nchw); - opdata->shape1.dim[0] = values[input1_id].shape.dim[0]; - opdata->shape1.dim[1] = values[input1_id].shape.dim[values[input1_id].shape.num_dims - 1]; - if (values[input1_id].shape.num_dims > 2) { - memcpy(&opdata->shape1.dim[2], &values[input1_id].shape.dim[1], (values[input1_id].shape.num_dims - 2) * sizeof(size_t)); - } - opdata->shape2.dim[0] = values[input2_id].shape.dim[0]; - opdata->shape2.dim[1] = values[input2_id].shape.dim[values[input2_id].shape.num_dims - 1]; - if (values[input1_id].shape.num_dims > 2) { - memcpy(&opdata->shape2.dim[2], &values[input2_id].shape.dim[1], (values[input2_id].shape.num_dims - 2) * sizeof(size_t)); - } - } else { - assert(values[output_id].layout == xnn_layout_type_nhwc); - assert(values[input1_id].layout == xnn_layout_type_nhwc); - assert(values[input2_id].layout == xnn_layout_type_nhwc); - memcpy(opdata->shape1.dim, values[input1_id].shape.dim, values[input1_id].shape.num_dims * sizeof(size_t)); - memcpy(opdata->shape2.dim, values[input2_id].shape.dim, values[input2_id].shape.num_dims * sizeof(size_t)); - } - - // Handle scalars. Although the output shape is dimensionless, the reshape - // function must be passed a valid shape to prevent skipping the op. 
- if (opdata->shape1.num_dims == 0) { - opdata->shape1.num_dims = 1; - opdata->shape1.dim[0] = 1; - } - if (opdata->shape2.num_dims == 0) { - opdata->shape2.num_dims = 1; - opdata->shape2.dim[0] = 1; - } - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_squared_difference_nd_f16: - status = xnn_reshape_squared_difference_nd_f16( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - case xnn_operator_type_squared_difference_nd_f32: - status = xnn_reshape_squared_difference_nd_f32( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_binary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_squared_difference_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input1_id = opdata->inputs[0]; - assert(input1_id != XNN_INVALID_VALUE_ID); - assert(input1_id < num_values); - - const uint32_t input2_id = opdata->inputs[1]; - assert(input2_id != XNN_INVALID_VALUE_ID); - assert(input2_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input1_value = values + input1_id; - const void* input1_data = input1_value->data; - assert(input1_data != NULL); - - const struct xnn_value* input2_value = values + input2_id; - const void* input2_data = input2_value->data; - assert(input2_data != NULL); - - const struct xnn_value* output_value = values + 
output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_squared_difference_nd_f16: - return xnn_setup_squared_difference_nd_f16( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - case xnn_operator_type_squared_difference_nd_f32: - return xnn_setup_squared_difference_nd_f32( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_squared_difference( - xnn_subgraph_t subgraph, - uint32_t input1_id, - uint32_t input2_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_squared_difference)) != xnn_status_success) { - return status; - } - - if ((status = xnn_subgraph_check_nth_input_node_id( - xnn_node_type_squared_difference, input1_id, subgraph->num_values, 1)) != xnn_status_success) { - return status; - } - - const struct xnn_value* input1_value = &subgraph->values[input1_id]; - status = xnn_subgraph_check_nth_input_type_dense(xnn_node_type_squared_difference, input1_id, input1_value, 1); - if (status != xnn_status_success) { - return status; - } - - switch (input1_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with first input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_squared_difference), input1_id, - xnn_datatype_to_string(input1_value->datatype), input1_value->datatype); - return xnn_status_invalid_parameter; - } - - if ((status = xnn_subgraph_check_nth_input_node_id( - xnn_node_type_squared_difference, input2_id, subgraph->num_values, 2)) != xnn_status_success) { - return status; - } - - const struct xnn_value* input2_value = &subgraph->values[input2_id]; - status = 
xnn_subgraph_check_nth_input_type_dense(xnn_node_type_squared_difference, input2_id, input2_value, 2); - if (status != xnn_status_success) { - return status; - } - - switch (input2_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with second input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_squared_difference), input2_id, - xnn_datatype_to_string(input2_value->datatype), input2_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_squared_difference, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_squared_difference, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp16: - compute_type = xnn_compute_type_fp16; - break; - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_squared_difference), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_squared_difference; - node->compute_type = compute_type; - node->num_inputs = 2; - node->inputs[0] = input1_id; - node->inputs[1] = input2_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_squared_difference_operator; - 
node->reshape = reshape_squared_difference_operator; - node->setup = setup_squared_difference_operator; - - return xnn_status_success; -} diff --git a/src/subgraph/subtract.c b/src/subgraph/subtract.c deleted file mode 100644 index f6aa4048b9c..00000000000 --- a/src/subgraph/subtract.c +++ /dev/null @@ -1,379 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/requantization.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_subtract_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 2); - const uint32_t input1_id = node->inputs[0]; - assert(input1_id != XNN_INVALID_VALUE_ID); - assert(input1_id < num_values); - const uint32_t input2_id = node->inputs[1]; - assert(input2_id != XNN_INVALID_VALUE_ID); - assert(input2_id < num_values); - - assert(node->num_outputs == 1); - const uint32_t output_id = node->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - enum xnn_status status; - const struct xnn_value *input1_value = &values[input1_id]; - switch (input1_value->datatype) { - case xnn_datatype_fp16: - status = xnn_create_subtract_nd_f16( - node->activation.output_min, - node->activation.output_max, - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp32: - status = xnn_create_subtract_nd_f32( - node->activation.output_min, - 
node->activation.output_max, - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_qint8: - { - const float output_scale = values[output_id].quantization.scale; - const int32_t output_zero_point = values[output_id].quantization.zero_point; - const int8_t output_min = xnn_qs8_quantize(node->activation.output_min, output_scale, output_zero_point); - const int8_t output_max = xnn_qs8_quantize(node->activation.output_max, output_scale, output_zero_point); - status = xnn_create_subtract_nd_qs8( - (int8_t) values[input1_id].quantization.zero_point, - values[input1_id].quantization.scale, - (int8_t) values[input2_id].quantization.zero_point, - values[input2_id].quantization.scale, - (int8_t) output_zero_point, - output_scale, output_min, output_max, node->flags, - &opdata->operator_objects[0]); - break; - } - case xnn_datatype_quint8: - { - const float output_scale = values[output_id].quantization.scale; - const int32_t output_zero_point = values[output_id].quantization.zero_point; - const uint8_t output_min = xnn_qu8_quantize(node->activation.output_min, output_scale, output_zero_point); - const uint8_t output_max = xnn_qu8_quantize(node->activation.output_max, output_scale, output_zero_point); - status = xnn_create_subtract_nd_qu8( - (uint8_t) values[input1_id].quantization.zero_point, - values[input1_id].quantization.scale, - (uint8_t) values[input2_id].quantization.zero_point, - values[input2_id].quantization.scale, - (uint8_t) output_zero_point, - output_scale, output_min, output_max, node->flags, - &opdata->operator_objects[0]); - break; - } - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_subtract_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input1_id = opdata->inputs[0]; - assert(input1_id < num_values); - const uint32_t input2_id = opdata->inputs[1]; - assert(input2_id < num_values); - const uint32_t 
output_id = opdata->outputs[0]; - assert(output_id < num_values); - - opdata->shape1.num_dims = values[input1_id].shape.num_dims; - opdata->shape2.num_dims = values[input2_id].shape.num_dims; - if (values[output_id].layout == xnn_layout_type_nchw) { - assert(values[input1_id].layout == xnn_layout_type_nchw); - assert(values[input2_id].layout == xnn_layout_type_nchw); - opdata->shape1.dim[0] = values[input1_id].shape.dim[0]; - opdata->shape1.dim[1] = values[input1_id].shape.dim[values[input1_id].shape.num_dims - 1]; - if (values[input1_id].shape.num_dims > 2) { - memcpy(&opdata->shape1.dim[2], &values[input1_id].shape.dim[1], (values[input1_id].shape.num_dims - 2) * sizeof(size_t)); - } - opdata->shape2.dim[0] = values[input2_id].shape.dim[0]; - opdata->shape2.dim[1] = values[input2_id].shape.dim[values[input2_id].shape.num_dims - 1]; - if (values[input1_id].shape.num_dims > 2) { - memcpy(&opdata->shape2.dim[2], &values[input2_id].shape.dim[1], (values[input2_id].shape.num_dims - 2) * sizeof(size_t)); - } - } else { - assert(values[output_id].layout == xnn_layout_type_nhwc); - assert(values[input1_id].layout == xnn_layout_type_nhwc); - assert(values[input2_id].layout == xnn_layout_type_nhwc); - memcpy(opdata->shape1.dim, values[input1_id].shape.dim, values[input1_id].shape.num_dims * sizeof(size_t)); - memcpy(opdata->shape2.dim, values[input2_id].shape.dim, values[input2_id].shape.num_dims * sizeof(size_t)); - } - opdata->outputs[0] = output_id; - - // Handle scalars. Although the output shape is dimensionless, the reshape - // function must be passed a valid shape to prevent skipping the op. 
- if (opdata->shape1.num_dims == 0) { - opdata->shape1.num_dims = 1; - opdata->shape1.dim[0] = 1; - } - if (opdata->shape2.num_dims == 0) { - opdata->shape2.num_dims = 1; - opdata->shape2.dim[0] = 1; - } - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_subtract_nd_f16: - status = xnn_reshape_subtract_nd_f16( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - case xnn_operator_type_subtract_nd_f32: - status = xnn_reshape_subtract_nd_f32( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - case xnn_operator_type_subtract_nd_qs8: - status = xnn_reshape_subtract_nd_qs8( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - case xnn_operator_type_subtract_nd_qu8: - status = xnn_reshape_subtract_nd_qu8( - opdata->operator_objects[0], - opdata->shape1.num_dims, - opdata->shape1.dim, - opdata->shape2.num_dims, - opdata->shape2.dim, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_binary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_subtract_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input1_id = opdata->inputs[0]; - assert(input1_id != XNN_INVALID_VALUE_ID); - assert(input1_id < num_values); - - const uint32_t input2_id = opdata->inputs[1]; - assert(input2_id != XNN_INVALID_VALUE_ID); - assert(input2_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - 
assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input1_value = values + input1_id; - const void* input1_data = input1_value->data; - assert(input1_data != NULL); - - const struct xnn_value* input2_value = values + input2_id; - const void* input2_data = input2_value->data; - assert(input2_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_subtract_nd_f16: - return xnn_setup_subtract_nd_f16( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - case xnn_operator_type_subtract_nd_f32: - return xnn_setup_subtract_nd_f32( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - case xnn_operator_type_subtract_nd_qs8: - return xnn_setup_subtract_nd_qs8( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - case xnn_operator_type_subtract_nd_qu8: - return xnn_setup_subtract_nd_qu8( - opdata->operator_objects[0], - input1_data, input2_data, output_data); - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_subtract( - xnn_subgraph_t subgraph, - float output_min, - float output_max, - uint32_t input1_id, - uint32_t input2_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_subtract)) != xnn_status_success) { - return status; - } - - status = xnn_subgraph_check_output_min_max(xnn_node_type_subtract, output_min, output_max); - if (status != xnn_status_success) { - return status; - } - - if ((status = xnn_subgraph_check_nth_input_node_id( - xnn_node_type_subtract, input1_id, subgraph->num_values, 2)) != xnn_status_success) { - return status; - } - - const struct xnn_value* input1_value = &subgraph->values[input1_id]; - status = 
xnn_subgraph_check_nth_input_type_dense(xnn_node_type_subtract, input1_id, input1_value, 1); - if (status != xnn_status_success) { - return status; - } - - switch (input1_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - case xnn_datatype_qint8: - case xnn_datatype_quint8: - break; - default: - xnn_log_error( - "failed to define %s operator with the first input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_subtract), input1_id, - xnn_datatype_to_string(input1_value->datatype), input1_value->datatype); - return xnn_status_invalid_parameter; - } - - if ((status = xnn_subgraph_check_nth_input_node_id( - xnn_node_type_subtract, input2_id, subgraph->num_values, 1)) != xnn_status_success) { - return status; - } - - const struct xnn_value* input2_value = &subgraph->values[input2_id]; - status = xnn_subgraph_check_nth_input_type_dense(xnn_node_type_subtract, input2_id, input2_value, 2); - if (status != xnn_status_success) { - return status; - } - - switch (input2_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - case xnn_datatype_qint8: - case xnn_datatype_quint8: - break; - default: - xnn_log_error( - "failed to define %s operator with the second input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_subtract), input2_id, - xnn_datatype_to_string(input2_value->datatype), input2_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_subtract, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_subtract, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case 
xnn_datatype_fp16: - compute_type = xnn_compute_type_fp16; - break; - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - case xnn_datatype_qint8: - compute_type = xnn_compute_type_qs8; - break; - case xnn_datatype_quint8: - compute_type = xnn_compute_type_qu8; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_subtract), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_datatype_matches_two_inputs( - xnn_node_type_subtract, input1_id, input1_value, input2_id, input2_value, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_subtract; - node->compute_type = compute_type; - node->activation.output_min = output_min; - node->activation.output_max = output_max; - node->num_inputs = 2; - node->inputs[0] = input1_id; - node->inputs[1] = input2_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_subtract_operator; - node->reshape = reshape_subtract_operator; - node->setup = setup_subtract_operator; - - if (output_min != -INFINITY && output_max != INFINITY) { - xnn_insert_clamp_node(subgraph, output_min, output_max, node); - } - return xnn_status_success; -} diff --git a/src/xnnpack/config-types.h b/src/xnnpack/config-types.h index 0bb8428a819..660920dfd51 100644 --- a/src/xnnpack/config-types.h +++ b/src/xnnpack/config-types.h @@ -65,16 +65,7 @@ struct xnn_binary_elementwise_subconfig { struct xnn_binary_elementwise_config { struct xnn_binary_elementwise_subconfig minmax; struct xnn_binary_elementwise_subconfig linear; - union { - xnn_init_f16_minmax_params_fn f16_minmax; - 
xnn_init_f32_default_params_fn f32_default; - xnn_init_f32_minmax_params_fn f32_minmax; - xnn_init_qs8_add_minmax_params_fn qs8_add; - xnn_init_qs8_mul_minmax_params_fn qs8_mul; - xnn_init_qu8_add_minmax_params_fn qu8_add; - xnn_init_qu8_mul_minmax_params_fn qu8_mul; - xnn_init_s32_default_params_fn s32_default; - } init; + xnn_init_binary_params_fn init; }; struct xnn_unary_elementwise_config { diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h index a6d41063a59..014dd0bf6e0 100644 --- a/src/xnnpack/microfnptr.h +++ b/src/xnnpack/microfnptr.h @@ -9,6 +9,7 @@ #include #include +#include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/microparams.h" @@ -2506,6 +2507,12 @@ typedef void (*xnn_f32_vscaleextexp_ukernel_fn)( /***************** Microkernel parameter initializer pointers ****************/ +typedef size_t (*xnn_init_binary_params_fn)( + union xnn_binary_uparams* uparams, + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization); + typedef size_t (*xnn_init_f16_qs8_cvt_params_fn)( struct xnn_f16_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], xnn_float16 scale, @@ -2638,41 +2645,27 @@ typedef void (*xnn_update_f16_gavgpool_scalar_params_fn)( typedef size_t (*xnn_init_qs8_add_minmax_params_fn)( struct xnn_qs8_add_minmax_params params[XNN_MIN_ELEMENTS(1)], - int8_t input_x_zero_point, - int8_t input_y_zero_point, - int8_t output_zero_point, - float input_x_output_scale, - float input_y_output_scale, - int8_t output_min, - int8_t output_max); + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization); typedef size_t (*xnn_init_qu8_add_minmax_params_fn)( struct xnn_qu8_add_minmax_params params[XNN_MIN_ELEMENTS(1)], - uint8_t input_x_zero_point, - uint8_t input_y_zero_point, - uint8_t output_zero_point, - float 
input_x_output_scale, - float input_y_output_scale, - uint8_t output_min, - uint8_t output_max); + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization); typedef size_t (*xnn_init_qs8_mul_minmax_params_fn)( union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)], - int8_t input_x_zero_point, - int8_t input_y_zero_point, - int8_t output_zero_point, - float product_output_scale, - int8_t output_min, - int8_t output_max); + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization); typedef size_t (*xnn_init_qu8_mul_minmax_params_fn)( union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)], - uint8_t input_x_zero_point, - uint8_t input_y_zero_point, - uint8_t output_zero_point, - float product_output_scale, - uint8_t output_min, - uint8_t output_max); + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization); typedef size_t (*xnn_init_bf16_default_params_fn)( struct xnn_bf16_default_params params[XNN_MIN_ELEMENTS(1)]); diff --git a/src/xnnpack/microparams-init.h b/src/xnnpack/microparams-init.h index a1c7bb96edf..647d627cef8 100644 --- a/src/xnnpack/microparams-init.h +++ b/src/xnnpack/microparams-init.h @@ -8,6 +8,7 @@ #include #include +#include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/microparams.h" @@ -426,54 +427,51 @@ XNN_INTERNAL size_t xnn_init_qu8_lrelu_scalar_params( uint8_t input_zero_point, uint8_t output_zero_point); -XNN_INTERNAL size_t xnn_init_qs8_add_minmax_scalar_params( - struct xnn_qs8_add_minmax_params params[XNN_MIN_ELEMENTS(1)], - int8_t x_zero_point, - int8_t y_zero_point, - int8_t output_zero_point, - float x_output_scale, - float y_output_scale, - int8_t output_min, - int8_t 
output_max); +XNN_INTERNAL size_t xnn_init_f16_minmax_binary_params( + union xnn_f16_minmax_params uparams[XNN_MIN_ELEMENTS(1)], + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization); -XNN_INTERNAL size_t xnn_init_qu8_add_minmax_scalar_params( - struct xnn_qu8_add_minmax_params params[XNN_MIN_ELEMENTS(1)], - uint8_t x_zero_point, - uint8_t y_zero_point, - uint8_t output_zero_point, - float x_output_scale, - float y_output_scale, - uint8_t output_min, - uint8_t output_max); +XNN_INTERNAL size_t xnn_init_f32_minmax_binary_params( + union xnn_f32_minmax_params uparams[XNN_MIN_ELEMENTS(1)], + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization); +XNN_INTERNAL size_t xnn_init_qs8_add_minmax_scalar_params( + struct xnn_qs8_add_minmax_params uparams[XNN_MIN_ELEMENTS(1)], + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization); -#define DECLARE_INIT_QS8_MUL_MINMAX_PARAMS_FUNCTION(fn_name) \ - XNN_INTERNAL size_t fn_name( \ - union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)], \ - int8_t a_zero_point, \ - int8_t b_zero_point, \ - int8_t output_zero_point, \ - float product_output_scale, \ - int8_t output_min, \ - int8_t output_max); +XNN_INTERNAL size_t xnn_init_qu8_add_minmax_scalar_params( + struct xnn_qu8_add_minmax_params uparams[XNN_MIN_ELEMENTS(1)], + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization); + +#define DECLARE_INIT_QS8_MUL_MINMAX_PARAMS_FUNCTION(fn_name) \ + XNN_INTERNAL size_t fn_name( \ + union xnn_qs8_mul_minmax_params uparams[XNN_MIN_ELEMENTS(1)], \ + const struct 
xnn_quantization_params* a_quantization, \ + const struct xnn_quantization_params* b_quantization, \ + const struct xnn_quantization_params* output_quantization); DECLARE_INIT_QS8_MUL_MINMAX_PARAMS_FUNCTION(xnn_init_qs8_mul_minmax_scalar_params) #if XNN_ARCH_ARM || XNN_ARCH_ARM64 DECLARE_INIT_QS8_MUL_MINMAX_PARAMS_FUNCTION(xnn_init_qs8_mul_minmax_rndnu_neon_params) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +#define DECLARE_INIT_QU8_MUL_MINMAX_PARAMS_FUNCTION(fn_name) \ + XNN_INTERNAL size_t fn_name( \ + union xnn_qu8_mul_minmax_params uparams[XNN_MIN_ELEMENTS(1)], \ + const struct xnn_quantization_params* a_quantization, \ + const struct xnn_quantization_params* b_quantization, \ + const struct xnn_quantization_params* output_quantization); -#define DECLARE_INIT_QU8_MUL_MINMAX_PARAMS_FUNCTION(fn_name) \ - XNN_INTERNAL size_t fn_name( \ - union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)], \ - uint8_t a_zero_point, \ - uint8_t b_zero_point, \ - uint8_t output_zero_point, \ - float product_output_scale, \ - uint8_t output_min, \ - uint8_t output_max); - -DECLARE_INIT_QU8_MUL_MINMAX_PARAMS_FUNCTION(xnn_init_qu8_mul_minmax_scalar_params) + DECLARE_INIT_QU8_MUL_MINMAX_PARAMS_FUNCTION( + xnn_init_qu8_mul_minmax_scalar_params) #if XNN_ARCH_ARM || XNN_ARCH_ARM64 DECLARE_INIT_QU8_MUL_MINMAX_PARAMS_FUNCTION(xnn_init_qu8_mul_minmax_rndnu_neon_params) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 diff --git a/src/xnnpack/microparams.h b/src/xnnpack/microparams.h index 0819502a99c..0c5e4f47392 100644 --- a/src/xnnpack/microparams.h +++ b/src/xnnpack/microparams.h @@ -365,6 +365,14 @@ union xnn_qu8_mul_minmax_params { #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 }; +union xnn_binary_uparams { + struct xnn_qs8_add_minmax_params qs8_add; + struct xnn_qu8_add_minmax_params qu8_add; + union xnn_qs8_mul_minmax_params qs8_mul; + union xnn_qu8_mul_minmax_params qu8_mul; + union xnn_f16_minmax_params f16_minmax; + union xnn_f32_minmax_params f32_minmax; +}; // RSum params used by RSUM & 
RDSUM microkernels. struct xnn_qs8_rsum_params { diff --git a/src/xnnpack/operator-type.h b/src/xnnpack/operator-type.h index 0a6749d1667..7b431bd0f28 100644 --- a/src/xnnpack/operator-type.h +++ b/src/xnnpack/operator-type.h @@ -20,10 +20,7 @@ enum xnn_operator_type { xnn_operator_type_invalid = 0, xnn_operator_type_abs_nc_f16, xnn_operator_type_abs_nc_f32, - xnn_operator_type_add_nd_f16, - xnn_operator_type_add_nd_f32, - xnn_operator_type_add_nd_qs8, - xnn_operator_type_add_nd_qu8, + xnn_operator_type_add, xnn_operator_type_argmax_pooling_nhwc_f32, xnn_operator_type_average_pooling_nhwc_f16, xnn_operator_type_average_pooling_nhwc_f32, @@ -69,7 +66,7 @@ enum xnn_operator_type { xnn_operator_type_copy_nc_x8, xnn_operator_type_copy_nc_x16, xnn_operator_type_copy_nc_x32, - xnn_operator_type_copysign_nd_f32, + xnn_operator_type_copysign, xnn_operator_type_deconvolution_nhwc_f16, xnn_operator_type_deconvolution_nhwc_f32, xnn_operator_type_deconvolution_nhwc_qd8_f32_qc8w, @@ -81,8 +78,7 @@ enum xnn_operator_type { xnn_operator_type_depth_to_space_nhwc_x8, xnn_operator_type_depth_to_space_nhwc_x16, xnn_operator_type_depth_to_space_nhwc_x32, - xnn_operator_type_divide_nd_f16, - xnn_operator_type_divide_nd_f32, + xnn_operator_type_divide, xnn_operator_type_dynamic_fully_connected_nc_f16, xnn_operator_type_dynamic_fully_connected_nc_f32, xnn_operator_type_elu_nc_f16, @@ -125,19 +121,13 @@ enum xnn_operator_type { xnn_operator_type_max_pooling_nhwc_f32, xnn_operator_type_max_pooling_nhwc_s8, xnn_operator_type_max_pooling_nhwc_u8, - xnn_operator_type_maximum_nd_f16, - xnn_operator_type_maximum_nd_f32, + xnn_operator_type_maximum, xnn_operator_type_mean_nd_f16, xnn_operator_type_mean_nd_f32, xnn_operator_type_mean_nd_qs8, xnn_operator_type_mean_nd_qu8, - xnn_operator_type_minimum_nd_f16, - xnn_operator_type_minimum_nd_f32, - xnn_operator_type_multiply_nd_f16, - xnn_operator_type_multiply_nd_f32, - xnn_operator_type_multiply_nd_qs8, - xnn_operator_type_multiply_nd_qu8, - 
xnn_operator_type_multiply_nd_s32, + xnn_operator_type_minimum, + xnn_operator_type_multiply, xnn_operator_type_negate_nc_f16, xnn_operator_type_negate_nc_f32, xnn_operator_type_prelu_nc_f16, @@ -171,12 +161,8 @@ enum xnn_operator_type { xnn_operator_type_square_nc_f32, xnn_operator_type_square_root_nc_f16, xnn_operator_type_square_root_nc_f32, - xnn_operator_type_squared_difference_nd_f16, - xnn_operator_type_squared_difference_nd_f32, - xnn_operator_type_subtract_nd_f16, - xnn_operator_type_subtract_nd_f32, - xnn_operator_type_subtract_nd_qs8, - xnn_operator_type_subtract_nd_qu8, + xnn_operator_type_squared_difference, + xnn_operator_type_subtract, xnn_operator_type_tanh_nc_f16, xnn_operator_type_tanh_nc_f32, xnn_operator_type_tanh_nc_qs8, diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h index 4e7c00acfe3..2cbbec079f1 100644 --- a/src/xnnpack/operator.h +++ b/src/xnnpack/operator.h @@ -209,7 +209,10 @@ struct xnn_operator { struct subconvolution_params* subconvolution_buffer; uint32_t flags; + uint32_t log2_elementwise_element_size; + union { + union xnn_binary_uparams binary; struct xnn_f16_default_params f16_default; struct xnn_f16_hswish_params f16_hswish; struct xnn_f16_elu_params f16_elu; @@ -261,10 +264,6 @@ struct xnn_operator { }; struct xnn_qs8_mean_minmax_params qs8_mean; struct xnn_qu8_mean_minmax_params qu8_mean; - struct xnn_qs8_add_minmax_params qs8_add; - union xnn_qs8_mul_minmax_params qs8_mul; - struct xnn_qu8_add_minmax_params qu8_add; - union xnn_qu8_mul_minmax_params qu8_mul; union xnn_qu8_conv_minmax_params qu8_conv_minmax; // Average Pooling normally use qu8_avgpool_params, but also initialize qu8_gavgpool_params in case it needs to switch // to Global Average Pooling operation. @@ -285,14 +284,11 @@ struct xnn_operator { // We also use this to store parameters to binary operators. For most such operators, this is a copy of params, // but params need to be swapped for commutative ops with per-operand params. 
union { + union xnn_binary_uparams binary; struct xnn_f16_expminus_params f16_expminus_params; union xnn_f32_minmax_params f32_minmax; struct xnn_f32_expminus_params f32_expminus_params; struct xnn_f32_default_params f32_default; - struct xnn_qs8_add_minmax_params qs8_add; - union xnn_qs8_mul_minmax_params qs8_mul; - struct xnn_qu8_add_minmax_params qu8_add; - union xnn_qu8_mul_minmax_params qu8_mul; struct xnn_s8_minmax_params s8_minmax; struct xnn_u8_minmax_params u8_minmax; } params2; @@ -430,3 +426,6 @@ XNN_INTERNAL enum xnn_status xnn_run_operator_with_index( size_t opdata_index, size_t operator_object_index, pthreadpool_t threadpool); + +XNN_INTERNAL enum xnn_operator_type xnn_binary_operator_to_operator_type( + enum xnn_binary_operator op); \ No newline at end of file diff --git a/src/xnnpack/subgraph.h b/src/xnnpack/subgraph.h index 8b3b1205626..95977e2204d 100644 --- a/src/xnnpack/subgraph.h +++ b/src/xnnpack/subgraph.h @@ -585,6 +585,9 @@ enum xnn_status resize_fully_connected_output_tensor( size_t old_workspace_size, pthreadpool_t threadpool); +XNN_INTERNAL enum xnn_node_type xnn_binary_operator_to_node_type(enum xnn_binary_operator type); +XNN_INTERNAL enum xnn_binary_operator xnn_node_type_to_binary_operator(enum xnn_node_type type); + #ifdef __cplusplus } // extern "C" #endif diff --git a/test/BUILD.bazel b/test/BUILD.bazel index 83b716c5ca5..3c94369e7d4 100644 --- a/test/BUILD.bazel +++ b/test/BUILD.bazel @@ -100,14 +100,6 @@ xnnpack_cxx_library( deps = OPERATOR_TEST_DEPS + xnnpack_test_deps_for_library(), ) -xnnpack_cxx_library( - name = "binary_elementwise_operator_tester", - testonly = True, - srcs = ["binary-elementwise-operator-tester.cc"], - hdrs = ["binary-elementwise-operator-tester.h"], - deps = OPERATOR_TEST_DEPS + xnnpack_test_deps_for_library(), -) - xnnpack_cxx_library( name = "vunary_microkernel_tester", testonly = True, @@ -1376,11 +1368,11 @@ xnnpack_binary( ]] xnnpack_unit_test( - name = "binary_nd_test", + name = 
"binary_elementwise_nd_test", timeout = "long", - srcs = ["binary-nd.cc"], + srcs = ["binary-elementwise-nd.cc"], shard_count = 10, - deps = OPERATOR_TEST_DEPS + [":binary_elementwise_operator_tester"], + deps = OPERATOR_TEST_DEPS, ) xnnpack_unit_test( @@ -1868,40 +1860,16 @@ xnnpack_unit_test( ], ) -[xnnpack_unit_test( - name = "%s_test" % operator, - srcs = [ - "%s.cc" % operator.replace("_", "-"), - ], +xnnpack_unit_test( + name = "binary_test", + srcs = ["binary.cc"], deps = [ - ":subgraph_binary_tester", - "@FP16", + ":replicable_random_device", "//:XNNPACK", + "//:aligned_allocator", "//:math", "//:node_type", "//:operators", - "//:requantization", - "//:subgraph", - ], -) for operator in [ - "add2", - "copysign", - "divide2", - "maximum2", - "minimum2", - "multiply2", - "squared_difference", - "subtract2", -]] - -xnnpack_unit_test( - name = "add2_reshape_test", - srcs = [ - "add2-reshape.cc", - ], - deps = [ - "//:XNNPACK", - "//:node_type", "//:subgraph", ], ) diff --git a/test/add2-reshape.cc b/test/add2-reshape.cc deleted file mode 100644 index 8ea13b8d36f..00000000000 --- a/test/add2-reshape.cc +++ /dev/null @@ -1,325 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/node-type.h" -#include "xnnpack/subgraph.h" - -TEST(Add2TestF32, Reshape) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - std::vector dims{2, 3, 4}; - uint32_t input0_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input0_id)); - ASSERT_NE(input0_id, XNN_INVALID_NODE_ID); - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - const float output_min = -std::numeric_limits::infinity(); - const float output_max = std::numeric_limits::infinity(); - ASSERT_EQ(xnn_status_success, xnn_define_add2(subgraph, output_min, output_max, input0_id, input1_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_add2); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->num_inputs, 2); - ASSERT_EQ(node->inputs[0], input0_id); - ASSERT_EQ(node->inputs[1], input1_id); - ASSERT_EQ(node->num_outputs, 1); - 
ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - - ASSERT_EQ(node->reshape(&runtime->opdata[0], subgraph->values, subgraph->num_values, /*threadpool=*/nullptr), xnn_status_success); - - dims[0] = 7; - ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input0_id, dims.size(), dims.data())); - ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input1_id, dims.size(), dims.data())); - - ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr), xnn_status_reallocation_required); - const xnn_shape* output_shape = &runtime->values[node->outputs[0]].shape; - const size_t num_input_elements = std::accumulate(dims.cbegin(), dims.cend(), size_t{1}, std::multiplies()); - ASSERT_EQ(output_shape->dim[0], dims[0]); - ASSERT_EQ(runtime->values[node->outputs[0]].size, num_input_elements * sizeof(float)); -} - -TEST(Add2TestF32, ReshapeBroadcastDim0) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - std::vector dim0{1, 3, 4}; - std::vector dim1{5, 3, 4}; - uint32_t input0_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dim0.size(), dim0.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input0_id)); - ASSERT_NE(input0_id, XNN_INVALID_NODE_ID); - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dim1.size(), dim1.data(), nullptr, 
/*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - // Output dims will be correctly set by reshape. - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dim1.size(), dim1.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - const float output_min = -std::numeric_limits::infinity(); - const float output_max = std::numeric_limits::infinity(); - ASSERT_EQ(xnn_status_success, xnn_define_add2(subgraph, output_min, output_max, input0_id, input1_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_add2); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->num_inputs, 2); - ASSERT_EQ(node->inputs[0], input0_id); - ASSERT_EQ(node->inputs[1], input1_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - - ASSERT_EQ(node->reshape(&runtime->opdata[0], subgraph->values, subgraph->num_values, /*threadpool=*/nullptr), xnn_status_success); - - dim0[0] = 7; - dim1[0] = 1; - ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input0_id, dim0.size(), dim0.data())); - ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input1_id, dim1.size(), dim1.data())); - - ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr), xnn_status_reallocation_required); - const xnn_shape* output_shape = &runtime->values[node->outputs[0]].shape; - const size_t 
num_input_elements = std::accumulate(dim0.cbegin(), dim0.cend(), size_t{1}, std::multiplies()); - ASSERT_EQ(output_shape->dim[0], dim0[0]); - ASSERT_EQ(runtime->values[node->outputs[0]].size, num_input_elements * sizeof(float)); -} - -TEST(Add2TestF32, ReshapeBroadcast1D) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - std::vector dim0{1, 20, 80, 32}; - std::vector dim1{32}; - uint32_t input0_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dim0.size(), dim0.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input0_id)); - ASSERT_NE(input0_id, XNN_INVALID_NODE_ID); - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dim1.size(), dim1.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dim0.size(), dim0.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - const float output_min = -std::numeric_limits::infinity(); - const float output_max = std::numeric_limits::infinity(); - ASSERT_EQ(xnn_status_success, xnn_define_add2(subgraph, output_min, output_max, input0_id, input1_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_add2); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->num_inputs, 2); - ASSERT_EQ(node->inputs[0], 
input0_id); - ASSERT_EQ(node->inputs[1], input1_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - - ASSERT_EQ(node->reshape(&runtime->opdata[0], subgraph->values, subgraph->num_values, /*threadpool=*/nullptr), xnn_status_success); - - dim0[0] = 7; - dim1[0] = 1; - ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input0_id, dim0.size(), dim0.data())); - ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input1_id, dim1.size(), dim1.data())); - - ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr), xnn_status_reallocation_required); - const xnn_shape* output_shape = &runtime->values[node->outputs[0]].shape; - const size_t num_input_elements = std::accumulate(dim0.cbegin(), dim0.cend(), size_t{1}, std::multiplies()); - ASSERT_EQ(output_shape->dim[0], dim0[0]); - ASSERT_EQ(runtime->values[node->outputs[0]].size, num_input_elements * sizeof(float)); -} - -TEST(Add2TestF32, ReshapeBroadcast2D) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - std::vector dim0{1, 20, 80, 32}; - std::vector dim1{80, 32}; - uint32_t input0_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dim0.size(), dim0.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input0_id)); - ASSERT_NE(input0_id, XNN_INVALID_NODE_ID); - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - 
xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dim1.size(), dim1.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dim0.size(), dim0.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - const float output_min = -std::numeric_limits::infinity(); - const float output_max = std::numeric_limits::infinity(); - ASSERT_EQ(xnn_status_success, xnn_define_add2(subgraph, output_min, output_max, input0_id, input1_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_add2); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->num_inputs, 2); - ASSERT_EQ(node->inputs[0], input0_id); - ASSERT_EQ(node->inputs[1], input1_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - - ASSERT_EQ(node->reshape(&runtime->opdata[0], subgraph->values, subgraph->num_values, /*threadpool=*/nullptr), xnn_status_success); - - dim0[0] = 7; - dim1[0] = 1; - ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input0_id, dim0.size(), dim0.data())); - ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input1_id, dim1.size(), dim1.data())); - - ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr), xnn_status_reallocation_required); - const xnn_shape* output_shape = 
&runtime->values[node->outputs[0]].shape; - const size_t num_input_elements = std::accumulate(dim0.cbegin(), dim0.cend(), size_t{1}, std::multiplies()); - ASSERT_EQ(output_shape->dim[0], dim0[0]); - ASSERT_EQ(runtime->values[node->outputs[0]].size, num_input_elements * sizeof(float)); -} - -TEST(Add2TestF32, DegenerateDimension) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - std::vector dim0{0, 32}; - std::vector dim1{2, 0, 32}; - uint32_t input0_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dim0.size(), dim0.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input0_id)); - ASSERT_NE(input0_id, XNN_INVALID_NODE_ID); - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dim1.size(), dim1.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dim1.size(), dim1.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - const float output_min = -std::numeric_limits::infinity(); - const float output_max = std::numeric_limits::infinity(); - ASSERT_EQ(xnn_status_success, xnn_define_add2(subgraph, output_min, output_max, input0_id, input1_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_add2); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - 
ASSERT_EQ(node->num_inputs, 2); - ASSERT_EQ(node->inputs[0], input0_id); - ASSERT_EQ(node->inputs[1], input1_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - - ASSERT_EQ(node->reshape(&runtime->opdata[0], subgraph->values, subgraph->num_values, /*threadpool=*/nullptr), xnn_status_success); -} diff --git a/test/add2.cc b/test/add2.cc deleted file mode 100644 index b7a96a79061..00000000000 --- a/test/add2.cc +++ /dev/null @@ -1,386 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include "xnnpack.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/requantization.h" -#include "xnnpack/subgraph.h" -#include "subgraph-binary-tester.h" - -using Add2TestQS8 = BinaryTest; -using Add2TestQU8 = BinaryTest; -using Add2TestF16 = BinaryTest; -using Add2TestF32 = BinaryTest; - -TEST_F(Add2TestQS8, matches_operator_api) -{ - std::generate(input1.begin(), input1.end(), [&]() { return i8dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return i8dist(rng); }); - std::fill(operator_output.begin(), operator_output.end(), INT8_C(0xA5)); - std::fill(subgraph_output.begin(), subgraph_output.end(), INT8_C(0xA5)); - - const int32_t input1_zero_point = i8dist(rng); - const float input1_scale = scale_dist(rng); - const int32_t input2_zero_point = i8dist(rng); - const float input2_scale = scale_dist(rng); - const int32_t output_zero_point = i8dist(rng); - const float output_scale = scale_dist(rng); - const int8_t 
quantized_output_min = xnn_qs8_quantize(output_min, output_scale, output_zero_point); - const int8_t quantized_output_max = xnn_qs8_quantize(output_max, output_scale, output_zero_point); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. - ASSERT_EQ( - xnn_status_success, xnn_create_add_nd_qs8( - input1_zero_point, input1_scale, input2_zero_point, input2_scale, output_zero_point, - output_scale, quantized_output_min, quantized_output_max, /*flags=*/0, &op)); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_add_nd_qs8( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_add_nd_qs8(op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, input1_zero_point, input1_scale, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, input2_zero_point, input2_scale, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, 
xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, output_zero_point, output_scale, output_dims.size(), - output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_add2(subgraph, output_min, output_max, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - // Check output shape matches. - size_t observed_output_num_dims = 0; - std::vector observed_output_dims(XNN_MAX_TENSOR_DIMS, 0); - ASSERT_EQ( - xnn_status_success, - xnn_get_external_value_shape(runtime, output_id, &observed_output_num_dims, observed_output_dims.data())); - ASSERT_EQ(output_dims.size(), observed_output_num_dims); - for (size_t i = 0; i < observed_output_num_dims; i++) { - ASSERT_EQ(output_dims[i], observed_output_dims[i]); - } - - // Check outputs match. 
- ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(Add2TestQU8, matches_operator_api) -{ - std::generate(input1.begin(), input1.end(), [&]() { return u8dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return u8dist(rng); }); - std::fill(operator_output.begin(), operator_output.end(), UINT8_C(0xA5)); - std::fill(subgraph_output.begin(), subgraph_output.end(), UINT8_C(0xA5)); - - const int32_t input1_zero_point = u8dist(rng); - const float input1_scale = scale_dist(rng); - const int32_t input2_zero_point = u8dist(rng); - const float input2_scale = scale_dist(rng); - const int32_t output_zero_point = u8dist(rng); - const float output_scale = scale_dist(rng); - const uint8_t quantized_output_min = xnn_qu8_quantize(output_min, output_scale, output_zero_point); - const uint8_t quantized_output_max = xnn_qu8_quantize(output_max, output_scale, output_zero_point); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. - ASSERT_EQ( - xnn_status_success, xnn_create_add_nd_qu8( - input1_zero_point, input1_scale, input2_zero_point, input2_scale, output_zero_point, - output_scale, quantized_output_min, quantized_output_max, /*flags=*/0, &op)); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_add_nd_qu8( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_add_nd_qu8(op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, input1_zero_point, input1_scale, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, input2_zero_point, input2_scale, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, output_zero_point, output_scale, output_dims.size(), - output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_add2(subgraph, output_min, output_max, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - // Check output shape matches. 
- size_t observed_output_num_dims = 0; - std::vector observed_output_dims(XNN_MAX_TENSOR_DIMS, 0); - ASSERT_EQ( - xnn_status_success, - xnn_get_external_value_shape(runtime, output_id, &observed_output_num_dims, observed_output_dims.data())); - ASSERT_EQ(output_dims.size(), observed_output_num_dims); - for (size_t i = 0; i < observed_output_num_dims; i++) { - ASSERT_EQ(output_dims[i], observed_output_dims[i]); - } - - // Check outputs match. - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(Add2TestF16, matches_operator_api) -{ - std::generate(input1.begin(), input1.end(), [&]() { return f32dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return f32dist(rng); }); - std::fill(operator_output.begin(), operator_output.end(), std::nanf("")); - std::fill(subgraph_output.begin(), subgraph_output.end(), std::nanf("")); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. - const xnn_status status = xnn_create_add_nd_f16(output_min, output_max, 0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_add_nd_f16( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_add_nd_f16(op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_add2(subgraph, output_min, output_max, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - // Check outputs match. 
- ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(Add2TestF32, matches_operator_api) -{ - std::generate(input1.begin(), input1.end(), [&]() { return f32dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return f32dist(rng); }); - std::fill(operator_output.begin(), operator_output.end(), nanf("")); - std::fill(subgraph_output.begin(), subgraph_output.end(), nanf("")); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. - ASSERT_EQ(xnn_status_success, xnn_create_add_nd_f32(output_min, output_max, 0, &op)); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_add_nd_f32( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_add_nd_f32(op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_add2(subgraph, output_min, output_max, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - // Check output shape matches. 
- size_t observed_output_num_dims = 0; - std::vector observed_output_dims(XNN_MAX_TENSOR_DIMS, 0); - - // Check Invalid calls for this API - ASSERT_EQ(xnn_status_invalid_parameter, - xnn_get_external_value_shape(runtime, XNN_INVALID_VALUE_ID, &observed_output_num_dims, observed_output_dims.data())); - - ASSERT_EQ(xnn_status_invalid_parameter, - xnn_get_external_value_shape(runtime, output_id, nullptr, observed_output_dims.data())); - - ASSERT_EQ(xnn_status_invalid_parameter, - xnn_get_external_value_shape(runtime, output_id, &observed_output_num_dims, nullptr)); - - // Actually get the valid shape and check it matches - ASSERT_EQ( - xnn_status_success, - xnn_get_external_value_shape(runtime, output_id, &observed_output_num_dims, observed_output_dims.data())); - ASSERT_EQ(output_dims.size(), observed_output_num_dims); - for (size_t i = 0; i < observed_output_num_dims; i++) { - ASSERT_EQ(output_dims[i], observed_output_dims[i]); - } - - // Check outputs match. - ASSERT_EQ(subgraph_output, operator_output); -} diff --git a/test/binary-elementwise-nd.cc b/test/binary-elementwise-nd.cc new file mode 100644 index 00000000000..ecadd4f522f --- /dev/null +++ b/test/binary-elementwise-nd.cc @@ -0,0 +1,704 @@ +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "xnnpack.h" +#include "xnnpack/math.h" +#include "replicable_random_device.h" + +constexpr size_t kDim1 = 2; +constexpr size_t kDim2 = 3; +constexpr size_t kDim3 = 4; +constexpr size_t kDim4 = 5; +constexpr size_t kDim5 = 6; +constexpr size_t kDim6 = 7; +const size_t kDims[] = {kDim1, kDim2, kDim3, kDim4, kDim5, kDim6}; + +const size_t kBroadcastRanks[] = {0, 1, 2, 3, 4, 5, 6}; +const size_t kTestRank = 4; + +enum class RunMode { + kCreateReshapeRun, + kEager, +}; + +template +xnn_datatype datatype_of() { + if (std::is_same::value) { + return xnn_datatype_quint8; + } else if (std::is_same::value) { + return xnn_datatype_qint8; + } else if (std::is_same::value) { + return xnn_datatype_fp16; + } else if (std::is_same::value) { + return xnn_datatype_fp32; + } else if (std::is_same::value) { + return xnn_datatype_int32; + } else { + XNN_UNREACHABLE; + } +} + +template +double compute_tolerance(double output_ref) { + if (std::is_integral::value) { + return 0.6; + } else if (std::is_same::value) { + return 1.0e-3 * std::abs(output_ref); + } else { + return 1.0e-6 * std::abs(output_ref); + } +} + +class BinaryElementwiseOperatorTester { + public: + static std::string ToString(xnn_binary_operator operation_type) { + switch (operation_type) { + case xnn_binary_invalid: + return "Unknown"; + case xnn_binary_add: + return "Add"; + case xnn_binary_copysign: + return "CopySign"; + case xnn_binary_divide: + return "Divide"; + case xnn_binary_maximum: + return "Maximum"; + case xnn_binary_minimum: + return "Minimum"; + case xnn_binary_multiply: + return "Multiply"; + case xnn_binary_subtract: + return "Subtract"; + case xnn_binary_squared_difference: + return "SquaredDifference"; + default: + return "Unknown"; + } + } + + double Compute(double a, double b) const { + switch (operation_type()) { + 
case xnn_binary_add: + return a + b; + case xnn_binary_copysign: + return std::copysign(a, b); + case xnn_binary_divide: + return a / b; + case xnn_binary_maximum: + return std::max(a, b); + case xnn_binary_minimum: + return std::min(a, b); + case xnn_binary_multiply: + return a * b; + case xnn_binary_subtract: + return a - b; + case xnn_binary_squared_difference: + return (a - b) * (a - b); + default: + return std::nanf(""); + } + } + + BinaryElementwiseOperatorTester& input1_shape( + std::vector input1_shape) { + assert(input1_shape.size() <= XNN_MAX_TENSOR_DIMS); + this->input1_shape_ = std::move(input1_shape); + return *this; + } + + const std::vector& input1_shape() const { + return this->input1_shape_; + } + + size_t input1_dim(size_t i) const { + return i < num_input1_dims() ? this->input1_shape_[i] : 1; + } + + size_t num_input1_dims() const { return this->input1_shape_.size(); } + + size_t num_input1_elements() const { + return std::accumulate(this->input1_shape_.begin(), + this->input1_shape_.end(), size_t(1), + std::multiplies()); + } + + BinaryElementwiseOperatorTester& input1_zero_point( + int32_t input1_zero_point) { + this->input1_zero_point_ = input1_zero_point; + return *this; + } + + int32_t input1_zero_point() const { return this->input1_zero_point_; } + + BinaryElementwiseOperatorTester& input1_scale(float input1_scale) { + assert(std::isfinite(input1_scale)); + this->input1_scale_ = input1_scale; + return *this; + } + + float input1_scale() const { return this->input1_scale_; } + + BinaryElementwiseOperatorTester& input2_shape( + std::vector input2_shape) { + assert(input2_shape.size() <= XNN_MAX_TENSOR_DIMS); + this->input2_shape_ = std::move(input2_shape); + return *this; + } + + const std::vector& input2_shape() const { + return this->input2_shape_; + } + + size_t input2_dim(size_t i) const { + return i < num_input2_dims() ? 
this->input2_shape_[i] : 1; + } + + size_t num_input2_dims() const { return this->input2_shape_.size(); } + + size_t num_input2_elements() const { + return std::accumulate(this->input2_shape_.begin(), + this->input2_shape_.end(), size_t(1), + std::multiplies()); + } + + BinaryElementwiseOperatorTester& input2_zero_point( + int32_t input2_zero_point) { + this->input2_zero_point_ = input2_zero_point; + return *this; + } + + int32_t input2_zero_point() const { return this->input2_zero_point_; } + + BinaryElementwiseOperatorTester& input2_scale(float input2_scale) { + assert(std::isfinite(input2_scale)); + this->input2_scale_ = input2_scale; + return *this; + } + + float input2_scale() const { return this->input2_scale_; } + + BinaryElementwiseOperatorTester& output_zero_point( + int32_t output_zero_point) { + this->output_zero_point_ = output_zero_point; + return *this; + } + + int32_t output_zero_point() const { return this->output_zero_point_; } + + BinaryElementwiseOperatorTester& output_scale(float output_scale) { + assert(std::isfinite(output_scale)); + this->output_scale_ = output_scale; + return *this; + } + + float output_scale() const { return this->output_scale_; } + + BinaryElementwiseOperatorTester& operation_type( + xnn_binary_operator operation_type) { + this->operation_type_ = operation_type; + return *this; + } + + xnn_binary_operator operation_type() const { return this->operation_type_; } + + BinaryElementwiseOperatorTester& iterations(size_t iterations) { + this->iterations_ = iterations; + return *this; + } + + size_t iterations() const { return this->iterations_; } + + template + void Test(RunMode mode) { + ASSERT_NE(operation_type(), xnn_binary_invalid); + + xnnpack::ReplicableRandomDevice rng; + double input_min = std::is_integral::value + ? static_cast(std::numeric_limits::min()) + : 0.01; + double input_max = std::is_integral::value + ? 
static_cast(std::numeric_limits::max()) + : 1.0; + std::uniform_real_distribution dist(input_min, input_max); + + // Compute generalized shapes. + std::array input1_dims; + std::array input2_dims; + std::array output_dims; + std::fill(input1_dims.begin(), input1_dims.end(), 1); + std::fill(input2_dims.begin(), input2_dims.end(), 1); + std::fill(output_dims.begin(), output_dims.end(), 1); + std::copy(input1_shape().cbegin(), input1_shape().cend(), + input1_dims.end() - num_input1_dims()); + std::copy(input2_shape().cbegin(), input2_shape().cend(), + input2_dims.end() - num_input2_dims()); + for (size_t i = 0; i < XNN_MAX_TENSOR_DIMS; i++) { + if (input1_dims[i] != 1 && input2_dims[i] != 1) { + ASSERT_EQ(input1_dims[i], input2_dims[i]); + } + output_dims[i] = std::max(input1_dims[i], input2_dims[i]); + } + const size_t num_output_elements = + std::accumulate(output_dims.begin(), output_dims.end(), size_t(1), + std::multiplies()); + + // Compute generalized strides. + std::array input1_strides; + std::array input2_strides; + std::array output_strides; + size_t input1_stride = 1, input2_stride = 1, output_stride = 1; + for (size_t i = XNN_MAX_TENSOR_DIMS; i != 0; i--) { + input1_strides[i - 1] = input1_dims[i - 1] == 1 ? 0 : input1_stride; + input2_strides[i - 1] = input2_dims[i - 1] == 1 ? 
0 : input2_stride; + output_strides[i - 1] = output_stride; + input1_stride *= input1_dims[i - 1]; + input2_stride *= input2_dims[i - 1]; + output_stride *= output_dims[i - 1]; + } + + xnn_datatype datatype = datatype_of(); + xnn_quantization_params input1_quantization = {input1_zero_point(), + input1_scale()}; + xnn_quantization_params input2_quantization = {input2_zero_point(), + input2_scale()}; + xnn_quantization_params output_quantization = {output_zero_point(), + output_scale()}; + std::vector input1(XNN_EXTRA_BYTES / sizeof(T) + num_input1_elements()); + std::vector input2(XNN_EXTRA_BYTES / sizeof(T) + num_input2_elements()); + std::vector output(num_output_elements); + for (size_t iteration = 0; iteration < iterations(); iteration++) { + std::generate(input1.begin(), input1.end(), [&]() { return dist(rng); }); + std::generate(input2.begin(), input2.end(), [&]() { return dist(rng); }); + std::fill(output.begin(), output.end(), 0xAA); + + if (mode == RunMode::kCreateReshapeRun) { + // Create, setup, run, and destroy a binary elementwise operator. + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + xnn_operator_t binary_elementwise_op = nullptr; + xnn_status status = xnn_create_binary_elementwise_nd( + operation_type(), datatype, &input1_quantization, + &input2_quantization, &output_quantization, 0, + &binary_elementwise_op); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, binary_elementwise_op); + + // Smart pointer to automatically delete binary_elementwise_op. 
+ std::unique_ptr + auto_binary_elementwise_op(binary_elementwise_op, + xnn_delete_operator); + + ASSERT_EQ( + xnn_status_success, + xnn_reshape_binary_elementwise_nd( + binary_elementwise_op, num_input1_dims(), input1_shape().data(), + num_input2_dims(), input2_shape().data(), + /*threadpool=*/nullptr)); + ASSERT_EQ(xnn_status_success, xnn_setup_binary_elementwise_nd( + binary_elementwise_op, input1.data(), + input2.data(), output.data())); + + ASSERT_EQ(xnn_status_success, xnn_run_operator(binary_elementwise_op, + /*threadpool=*/nullptr)); + } else if (mode == RunMode::kEager) { + // Run a binary elementwise operator without creating it. + xnn_status status = xnn_run_binary_elementwise_nd( + operation_type(), datatype, &input1_quantization, + &input2_quantization, &output_quantization, 0, input1_dims.size(), + input1_dims.data(), input2_dims.size(), input2_dims.data(), + input1.data(), input2.data(), output.data(), + /*threadpool=*/nullptr); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + } else { + XNN_UNREACHABLE; + } + + // Verify results. 
+ for (size_t i = 0; i < output_dims[0]; i++) { + for (size_t j = 0; j < output_dims[1]; j++) { + for (size_t k = 0; k < output_dims[2]; k++) { + for (size_t l = 0; l < output_dims[3]; l++) { + for (size_t m = 0; m < output_dims[4]; m++) { + for (size_t n = 0; n < output_dims[5]; n++) { + const double input1_value = + input1_scale() * + (input1[i * input1_strides[0] + j * input1_strides[1] + + k * input1_strides[2] + l * input1_strides[3] + + m * input1_strides[4] + n * input1_strides[5]] - + input1_zero_point()); + const double input2_value = + input2_scale() * + (input2[i * input2_strides[0] + j * input2_strides[1] + + k * input2_strides[2] + l * input2_strides[3] + + m * input2_strides[4] + n * input2_strides[5]] - + input2_zero_point()); + double output_ref = + Compute(input1_value, input2_value) / output_scale() + + output_zero_point(); + const size_t index = + i * output_strides[0] + j * output_strides[1] + + k * output_strides[2] + l * output_strides[3] + + m * output_strides[4] + n * output_strides[5]; + if (output_ref < std::numeric_limits::lowest() || + output_ref > std::numeric_limits::max()) { + // This is expected to overflow. 
+ } else { + const double tolerance = compute_tolerance(output_ref); + ASSERT_NEAR(output[index], output_ref, tolerance) + << "input1_value = " << input1_value << ", " + << "input2_value = " << input2_value << ", " + << "(i, j, k, l, m, n) = (" << i << ", " << j << ", " + << k << ", " << l << ", " << m << ", " << n << ")" + << ", input1 zero point = " << input1_zero_point() + << ", input1 scale = " << input1_scale() + << ", input2 zero point = " << input2_zero_point() + << ", input2 scale = " << input2_scale() + << ", output zero point = " << output_zero_point() + << ", output scale = " << output_scale(); + } + } + } + } + } + } + } + } + } + + private: + std::vector input1_shape_; + std::vector input2_shape_; + int32_t input1_zero_point_{0}; + float input1_scale_{1.0f}; + int32_t input2_zero_point_{0}; + float input2_scale_{1.0f}; + int32_t output_zero_point_{0}; + float output_scale_{1.0f}; + xnn_binary_operator operation_type_{xnn_binary_invalid}; + size_t iterations_{3}; +}; + +// Make a shape of `rank` dimensions, broadcasting in each dimension according +// `broadcast_mask`. +inline std::vector MakeShapeOfRank(size_t rank, uint32_t broadcast_mask, + const size_t* dims) { + std::vector shape; + for (size_t i = 0; i < rank; i++) { + const bool broadcast = (broadcast_mask & (uint32_t(1) << i)) != 0; + shape.push_back(broadcast ? 
1 : dims[i]); + } + std::reverse(shape.begin(), shape.end()); + return shape; +} + +template +void RunBinaryOpTester(size_t rank_a, size_t rank_b, const size_t* dims, + RunMode run_mode, + BinaryElementwiseOperatorTester& tester) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << rank_a); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << rank_b); bm2++) { + tester.input1_shape(MakeShapeOfRank(rank_a, bm1, dims)) + .input2_shape(MakeShapeOfRank(rank_b, bm2, dims)); + tester.Test(run_mode); + } + } +} + +template +void BroadcastNDTestImpl(const Params& params) { + RunMode mode = std::get<0>(params); + xnn_binary_operator op = std::get<1>(params); + const size_t rank_a = std::get<2>(params); + const size_t rank_b = std::get<3>(params); + BinaryElementwiseOperatorTester tester; + tester.operation_type(op); + RunBinaryOpTester(rank_a, rank_b, kDims, mode, tester); +} + +template +class BroadcastNDTest + : public testing::TestWithParam< + std::tuple> {}; + +using BroadcastNDTestQS8 = BroadcastNDTest; +using BroadcastNDTestQU8 = BroadcastNDTest; +#ifndef XNN_EXCLUDE_F16_TESTS +using BroadcastNDTestF16 = BroadcastNDTest; +#endif // XNN_EXCLUDE_F16_TESTS +using BroadcastNDTestF32 = BroadcastNDTest; +using BroadcastNDTestS32 = BroadcastNDTest; + +TEST_P(BroadcastNDTestQS8, op) { BroadcastNDTestImpl(GetParam()); } +TEST_P(BroadcastNDTestQU8, op) { BroadcastNDTestImpl(GetParam()); } +#ifndef XNN_EXCLUDE_F16_TESTS +TEST_P(BroadcastNDTestF16, op) { BroadcastNDTestImpl(GetParam()); } +#endif // XNN_EXCLUDE_F16_TESTS +TEST_P(BroadcastNDTestF32, op) { BroadcastNDTestImpl(GetParam()); } +TEST_P(BroadcastNDTestS32, op) { BroadcastNDTestImpl(GetParam()); } + +std::string ToString( + const std::tuple& param) { + return BinaryElementwiseOperatorTester::ToString(std::get<1>(param)) + "_" + + std::to_string(std::get<2>(param)) + "d_x_" + + std::to_string(std::get<3>(param)) + "d"; +} + +std::string ToString(const std::tuple& param) { + return 
BinaryElementwiseOperatorTester::ToString(std::get<1>(param)); +} + +INSTANTIATE_TEST_SUITE_P( + CreateReshapeRun, BroadcastNDTestQS8, + testing::Combine(testing::Values(RunMode::kCreateReshapeRun), + testing::Values(xnn_binary_add, xnn_binary_subtract, + xnn_binary_multiply), + testing::ValuesIn(kBroadcastRanks), + testing::ValuesIn(kBroadcastRanks)), + [](const auto& info) { return ToString(info.param); }); +INSTANTIATE_TEST_SUITE_P(Eager, BroadcastNDTestQS8, + testing::Combine(testing::Values(RunMode::kEager), + testing::Values(xnn_binary_add, + xnn_binary_subtract, + xnn_binary_multiply), + testing::ValuesIn(kBroadcastRanks), + testing::ValuesIn(kBroadcastRanks)), + [](const auto& info) { return ToString(info.param); }); +INSTANTIATE_TEST_SUITE_P( + CreateReshapeRun, BroadcastNDTestQU8, + testing::Combine(testing::Values(RunMode::kCreateReshapeRun), + testing::Values(xnn_binary_add, xnn_binary_subtract, + xnn_binary_multiply), + testing::ValuesIn(kBroadcastRanks), + testing::ValuesIn(kBroadcastRanks)), + [](const auto& info) { return ToString(info.param); }); +INSTANTIATE_TEST_SUITE_P(Eager, BroadcastNDTestQU8, + testing::Combine(testing::Values(RunMode::kEager), + testing::Values(xnn_binary_add, + xnn_binary_subtract, + xnn_binary_multiply), + testing::ValuesIn(kBroadcastRanks), + testing::ValuesIn(kBroadcastRanks)), + [](const auto& info) { return ToString(info.param); }); +#ifndef XNN_EXCLUDE_F16_TESTS +INSTANTIATE_TEST_SUITE_P( + CreateReshapeRun, BroadcastNDTestF16, + testing::Combine( + testing::Values(RunMode::kCreateReshapeRun), + testing::Values(xnn_binary_add, xnn_binary_divide, xnn_binary_maximum, + xnn_binary_minimum, xnn_binary_multiply, + xnn_binary_squared_difference, xnn_binary_subtract), + testing::ValuesIn(kBroadcastRanks), testing::ValuesIn(kBroadcastRanks)), + [](const auto& info) { return ToString(info.param); }); +INSTANTIATE_TEST_SUITE_P( + Eager, BroadcastNDTestF16, + testing::Combine( + testing::Values(RunMode::kEager), + 
testing::Values(xnn_binary_add, xnn_binary_divide, xnn_binary_maximum, + xnn_binary_minimum, xnn_binary_multiply, + xnn_binary_squared_difference, xnn_binary_subtract), + testing::ValuesIn(kBroadcastRanks), testing::ValuesIn(kBroadcastRanks)), + [](const auto& info) { return ToString(info.param); }); +#endif +INSTANTIATE_TEST_SUITE_P( + CreateReshapeRun, BroadcastNDTestF32, + testing::Combine(testing::Values(RunMode::kCreateReshapeRun), + testing::Values(xnn_binary_add, xnn_binary_copysign, + xnn_binary_divide, xnn_binary_maximum, + xnn_binary_minimum, xnn_binary_multiply, + xnn_binary_subtract, + xnn_binary_squared_difference), + testing::ValuesIn(kBroadcastRanks), + testing::ValuesIn(kBroadcastRanks)), + [](const auto& info) { return ToString(info.param); }); +INSTANTIATE_TEST_SUITE_P( + Eager, BroadcastNDTestF32, + testing::Combine(testing::Values(RunMode::kEager), + testing::Values(xnn_binary_add, xnn_binary_divide, + xnn_binary_maximum, xnn_binary_minimum, + xnn_binary_multiply, xnn_binary_subtract, + xnn_binary_squared_difference), + testing::ValuesIn(kBroadcastRanks), + testing::ValuesIn(kBroadcastRanks)), + [](const auto& info) { return ToString(info.param); }); +INSTANTIATE_TEST_SUITE_P( + CreateReshapeRun, BroadcastNDTestS32, + testing::Combine(testing::Values(RunMode::kCreateReshapeRun), + testing::Values(xnn_binary_multiply), + testing::ValuesIn(kBroadcastRanks), + testing::ValuesIn(kBroadcastRanks)), + [](const auto& info) { return ToString(info.param); }); +INSTANTIATE_TEST_SUITE_P(Eager, BroadcastNDTestS32, + testing::Combine(testing::Values(RunMode::kEager), + testing::Values(xnn_binary_multiply), + testing::ValuesIn(kBroadcastRanks), + testing::ValuesIn(kBroadcastRanks)), + [](const auto& info) { return ToString(info.param); }); + +template +void QuantizedTest_Input1Scale(Params params) { + for (float input1_scale = 0.1f; input1_scale <= 10.0f; + input1_scale *= 3.14f) { + RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), + 
BinaryElementwiseOperatorTester() + .operation_type(std::get<1>(params)) + .input1_scale(input1_scale)); + } +} + +template +void QuantizedTest_Input1ZeroPoint(Params params) { + for (int32_t input1_zero_point = std::numeric_limits::min(); + input1_zero_point <= std::numeric_limits::max(); + input1_zero_point += 51) { + RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), + BinaryElementwiseOperatorTester() + .operation_type(std::get<1>(params)) + .input1_zero_point(input1_zero_point)); + } +} + +template +void QuantizedTest_Input2Scale(Params params) { + for (float input2_scale = 0.1f; input2_scale <= 10.0f; + input2_scale *= 3.14f) { + RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), + BinaryElementwiseOperatorTester() + .operation_type(std::get<1>(params)) + .input2_scale(input2_scale)); + } +} + +template +void QuantizedTest_Input2ZeroPoint(Params params) { + for (int32_t input2_zero_point = std::numeric_limits::min(); + input2_zero_point <= std::numeric_limits::max(); + input2_zero_point += 51) { + RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), + BinaryElementwiseOperatorTester() + .operation_type(std::get<1>(params)) + .input2_zero_point(input2_zero_point)); + } +} + +template +void QuantizedTest_OutputScale(Params params) { + for (float output_scale = 0.1f; output_scale <= 10.0f; + output_scale *= 3.14f) { + RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), + BinaryElementwiseOperatorTester() + .operation_type(std::get<1>(params)) + .output_scale(output_scale)); + } +} + +template +void QuantizedTest_OutputZeroPoint(Params params) { + for (int32_t output_zero_point = std::numeric_limits::min(); + output_zero_point <= std::numeric_limits::max(); + output_zero_point += 51) { + RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), + BinaryElementwiseOperatorTester() + .operation_type(std::get<1>(params)) + .output_zero_point(output_zero_point)); + } +} + +template +class 
QuantizedTest + : public testing::TestWithParam> { +}; + +using QuantizedTestQS8 = QuantizedTest; + +TEST_P(QuantizedTestQS8, input1_scale) { + QuantizedTest_Input1Scale(GetParam()); +} +TEST_P(QuantizedTestQS8, input1_zero_point) { + QuantizedTest_Input1ZeroPoint(GetParam()); +} +TEST_P(QuantizedTestQS8, input2_scale) { + QuantizedTest_Input2Scale(GetParam()); +} +TEST_P(QuantizedTestQS8, input2_zero_point) { + QuantizedTest_Input2ZeroPoint(GetParam()); +} + +TEST_P(QuantizedTestQS8, output_scale) { + QuantizedTest_OutputScale(GetParam()); +} +TEST_P(QuantizedTestQS8, output_zero_point) { + QuantizedTest_OutputZeroPoint(GetParam()); +} + +INSTANTIATE_TEST_SUITE_P( + CreateReshapeRun, QuantizedTestQS8, + testing::Combine(testing::Values(RunMode::kCreateReshapeRun), + testing::Values(xnn_binary_add, xnn_binary_subtract, + xnn_binary_multiply)), + [](const auto& info) { return ToString(info.param); }); +INSTANTIATE_TEST_SUITE_P(Eager, QuantizedTestQS8, + testing::Combine(testing::Values(RunMode::kEager), + testing::Values(xnn_binary_add, + xnn_binary_subtract, + xnn_binary_multiply)), + [](const auto& info) { return ToString(info.param); }); + +using QuantizedTestQU8 = QuantizedTest; + +TEST_P(QuantizedTestQU8, input1_scale) { + QuantizedTest_Input1Scale(GetParam()); +} +TEST_P(QuantizedTestQU8, input1_zero_point) { + QuantizedTest_Input1ZeroPoint(GetParam()); +} +TEST_P(QuantizedTestQU8, input2_scale) { + QuantizedTest_Input2Scale(GetParam()); +} +TEST_P(QuantizedTestQU8, input2_zero_point) { + QuantizedTest_Input2ZeroPoint(GetParam()); +} + +TEST_P(QuantizedTestQU8, output_scale) { + QuantizedTest_OutputScale(GetParam()); +} +TEST_P(QuantizedTestQU8, output_zero_point) { + QuantizedTest_OutputZeroPoint(GetParam()); +} + +INSTANTIATE_TEST_SUITE_P( + CreateReshapeRun, QuantizedTestQU8, + testing::Combine(testing::Values(RunMode::kCreateReshapeRun), + testing::Values(xnn_binary_add, xnn_binary_subtract, + xnn_binary_multiply)), + [](const auto& info) { return 
ToString(info.param); }); +INSTANTIATE_TEST_SUITE_P(Eager, QuantizedTestQU8, + testing::Combine(testing::Values(RunMode::kEager), + testing::Values(xnn_binary_add, + xnn_binary_subtract, + xnn_binary_multiply)), + [](const auto& info) { return ToString(info.param); }); diff --git a/test/binary-elementwise-operator-tester.cc b/test/binary-elementwise-operator-tester.cc deleted file mode 100644 index ca99fb2a145..00000000000 --- a/test/binary-elementwise-operator-tester.cc +++ /dev/null @@ -1,1672 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "binary-elementwise-operator-tester.h" - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/math.h" -#include "replicable_random_device.h" -#include - -void BinaryElementwiseOperatorTester::TestQS8() const { - ASSERT_NE(operation_type(), OperationType::Unknown); - ASSERT_GE(input1_zero_point(), std::numeric_limits::min()); - ASSERT_LE(input1_zero_point(), std::numeric_limits::max()); - ASSERT_GE(input2_zero_point(), std::numeric_limits::min()); - ASSERT_LE(input2_zero_point(), std::numeric_limits::max()); - ASSERT_GE(output_zero_point(), std::numeric_limits::min()); - ASSERT_LE(output_zero_point(), std::numeric_limits::max()); - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution i8dist( - std::numeric_limits::min(), std::numeric_limits::max()); - - // Compute generalized shapes. 
- std::array input1_dims; - std::array input2_dims; - std::array output_dims; - std::fill(input1_dims.begin(), input1_dims.end(), 1); - std::fill(input2_dims.begin(), input2_dims.end(), 1); - std::fill(output_dims.begin(), output_dims.end(), 1); - std::copy(input1_shape().cbegin(), input1_shape().cend(), - input1_dims.end() - num_input1_dims()); - std::copy(input2_shape().cbegin(), input2_shape().cend(), - input2_dims.end() - num_input2_dims()); - for (size_t i = 0; i < XNN_MAX_TENSOR_DIMS; i++) { - if (input1_dims[i] != 1 && input2_dims[i] != 1) { - ASSERT_EQ(input1_dims[i], input2_dims[i]); - } - output_dims[i] = std::max(input1_dims[i], input2_dims[i]); - } - const size_t num_output_elements = - std::accumulate(output_dims.begin(), output_dims.end(), size_t(1), - std::multiplies()); - - // Compute generalized strides. - std::array input1_strides; - std::array input2_strides; - std::array output_strides; - size_t input1_stride = 1, input2_stride = 1, output_stride = 1; - for (size_t i = XNN_MAX_TENSOR_DIMS; i != 0; i--) { - input1_strides[i - 1] = input1_dims[i - 1] == 1 ? 0 : input1_stride; - input2_strides[i - 1] = input2_dims[i - 1] == 1 ? 0 : input2_stride; - output_strides[i - 1] = output_stride; - input1_stride *= input1_dims[i - 1]; - input2_stride *= input2_dims[i - 1]; - output_stride *= output_dims[i - 1]; - } - - std::vector input1(XNN_EXTRA_BYTES / sizeof(xnn_float16) + - num_input1_elements()); - std::vector input2(XNN_EXTRA_BYTES / sizeof(xnn_float16) + - num_input2_elements()); - std::vector output(num_output_elements); - std::vector output_ref(num_output_elements); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input1.begin(), input1.end(), [&]() { return i8dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return i8dist(rng); }); - std::fill(output.begin(), output.end(), 0xAA); - - // Compute reference results. 
- for (size_t i = 0; i < output_dims[0]; i++) { - for (size_t j = 0; j < output_dims[1]; j++) { - for (size_t k = 0; k < output_dims[2]; k++) { - for (size_t l = 0; l < output_dims[3]; l++) { - for (size_t m = 0; m < output_dims[4]; m++) { - for (size_t n = 0; n < output_dims[5]; n++) { - output_ref[i * output_strides[0] + j * output_strides[1] + - k * output_strides[2] + l * output_strides[3] + - m * output_strides[4] + n * output_strides[5]] = - Compute( - input1_scale() * (static_cast( - input1[i * input1_strides[0] + - j * input1_strides[1] + - k * input1_strides[2] + - l * input1_strides[3] + - m * input1_strides[4] + - n * input1_strides[5]]) - - input1_zero_point()), - input2_scale() * (static_cast( - input2[i * input2_strides[0] + - j * input2_strides[1] + - k * input2_strides[2] + - l * input2_strides[3] + - m * input2_strides[4] + - n * input2_strides[5]]) - - input2_zero_point())) / - output_scale() + - static_cast(output_zero_point()); - } - } - } - } - } - } - - for (float& output_value : output_ref) { - output_value = std::max(output_value, static_cast(qmin())); - output_value = std::min(output_value, static_cast(qmax())); - } - - // Create, setup, run, and destroy a binary elementwise operator. 
- ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - xnn_operator_t binary_elementwise_op = nullptr; - xnn_status status = xnn_status_unsupported_parameter; - switch (operation_type()) { - case OperationType::Add: - status = xnn_create_add_nd_qs8( - input1_zero_point(), input1_scale(), input2_zero_point(), - input2_scale(), output_zero_point(), output_scale(), - static_cast(qmin()), static_cast(qmax()), 0, - &binary_elementwise_op); - break; - case OperationType::Multiply: - status = xnn_create_multiply_nd_qs8( - input1_zero_point(), input1_scale(), input2_zero_point(), - input2_scale(), output_zero_point(), output_scale(), - static_cast(qmin()), static_cast(qmax()), 0, - &binary_elementwise_op); - break; - case OperationType::Subtract: - status = xnn_create_subtract_nd_qs8( - input1_zero_point(), input1_scale(), input2_zero_point(), - input2_scale(), output_zero_point(), output_scale(), - static_cast(qmin()), static_cast(qmax()), 0, - &binary_elementwise_op); - break; - default: - FAIL() << "Unsupported operation type"; - } - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, binary_elementwise_op); - - // Smart pointer to automatically delete binary_elementwise_op. 
- std::unique_ptr - auto_binary_elementwise_op(binary_elementwise_op, xnn_delete_operator); - - switch (operation_type()) { - case OperationType::Add: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_add_nd_qs8(binary_elementwise_op, num_input1_dims(), - input1_shape().data(), num_input2_dims(), - input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, - xnn_setup_add_nd_qs8(binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - case OperationType::Multiply: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_multiply_nd_qs8( - binary_elementwise_op, num_input1_dims(), input1_shape().data(), - num_input2_dims(), input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_multiply_nd_qs8( - binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - case OperationType::Subtract: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_subtract_nd_qs8( - binary_elementwise_op, num_input1_dims(), input1_shape().data(), - num_input2_dims(), input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_subtract_nd_qs8( - binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - default: - FAIL() << "Unsupported operation type"; - } - - ASSERT_EQ(xnn_status_success, - xnn_run_operator(binary_elementwise_op, /*threadpool=*/nullptr)); - - // Verify results. 
- for (size_t i = 0; i < output_dims[0]; i++) { - for (size_t j = 0; j < output_dims[1]; j++) { - for (size_t k = 0; k < output_dims[2]; k++) { - for (size_t l = 0; l < output_dims[3]; l++) { - for (size_t m = 0; m < output_dims[4]; m++) { - for (size_t n = 0; n < output_dims[5]; n++) { - const size_t index = - i * output_strides[0] + j * output_strides[1] + - k * output_strides[2] + l * output_strides[3] + - m * output_strides[4] + n * output_strides[5]; - ASSERT_NEAR(static_cast(output[index]), - output_ref[index], 0.6f) - << "(i, j, k, l, m, n) = (" << i << ", " << j << ", " << k - << ", " << l << ", " << m << ", " << n << ")" - << ", input1 zero point = " << input1_zero_point() - << ", input1 scale = " << input1_scale() - << ", input2 zero point = " << input2_zero_point() - << ", input2 scale = " << input2_scale() - << ", output zero point = " << output_zero_point() - << ", output scale = " << output_scale(); - } - } - } - } - } - } - } -} - -void BinaryElementwiseOperatorTester::TestQU8() const { - ASSERT_NE(operation_type(), OperationType::Unknown); - ASSERT_GE(input1_zero_point(), std::numeric_limits::min()); - ASSERT_LE(input1_zero_point(), std::numeric_limits::max()); - ASSERT_GE(input2_zero_point(), std::numeric_limits::min()); - ASSERT_LE(input2_zero_point(), std::numeric_limits::max()); - ASSERT_GE(output_zero_point(), std::numeric_limits::min()); - ASSERT_LE(output_zero_point(), std::numeric_limits::max()); - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution u8dist( - std::numeric_limits::min(), std::numeric_limits::max()); - - // Compute generalized shapes. 
- std::array input1_dims; - std::array input2_dims; - std::array output_dims; - std::fill(input1_dims.begin(), input1_dims.end(), 1); - std::fill(input2_dims.begin(), input2_dims.end(), 1); - std::fill(output_dims.begin(), output_dims.end(), 1); - std::copy(input1_shape().cbegin(), input1_shape().cend(), - input1_dims.end() - num_input1_dims()); - std::copy(input2_shape().cbegin(), input2_shape().cend(), - input2_dims.end() - num_input2_dims()); - for (size_t i = 0; i < XNN_MAX_TENSOR_DIMS; i++) { - if (input1_dims[i] != 1 && input2_dims[i] != 1) { - ASSERT_EQ(input1_dims[i], input2_dims[i]); - } - output_dims[i] = std::max(input1_dims[i], input2_dims[i]); - } - const size_t num_output_elements = - std::accumulate(output_dims.begin(), output_dims.end(), size_t(1), - std::multiplies()); - - // Compute generalized strides. - std::array input1_strides; - std::array input2_strides; - std::array output_strides; - size_t input1_stride = 1, input2_stride = 1, output_stride = 1; - for (size_t i = XNN_MAX_TENSOR_DIMS; i != 0; i--) { - input1_strides[i - 1] = input1_dims[i - 1] == 1 ? 0 : input1_stride; - input2_strides[i - 1] = input2_dims[i - 1] == 1 ? 0 : input2_stride; - output_strides[i - 1] = output_stride; - input1_stride *= input1_dims[i - 1]; - input2_stride *= input2_dims[i - 1]; - output_stride *= output_dims[i - 1]; - } - - std::vector input1(XNN_EXTRA_BYTES / sizeof(xnn_float16) + - num_input1_elements()); - std::vector input2(XNN_EXTRA_BYTES / sizeof(xnn_float16) + - num_input2_elements()); - std::vector output(num_output_elements); - std::vector output_ref(num_output_elements); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input1.begin(), input1.end(), [&]() { return u8dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return u8dist(rng); }); - std::fill(output.begin(), output.end(), 0xAA); - - // Compute reference results. 
- for (size_t i = 0; i < output_dims[0]; i++) { - for (size_t j = 0; j < output_dims[1]; j++) { - for (size_t k = 0; k < output_dims[2]; k++) { - for (size_t l = 0; l < output_dims[3]; l++) { - for (size_t m = 0; m < output_dims[4]; m++) { - for (size_t n = 0; n < output_dims[5]; n++) { - output_ref[i * output_strides[0] + j * output_strides[1] + - k * output_strides[2] + l * output_strides[3] + - m * output_strides[4] + n * output_strides[5]] = - Compute( - input1_scale() * (static_cast( - input1[i * input1_strides[0] + - j * input1_strides[1] + - k * input1_strides[2] + - l * input1_strides[3] + - m * input1_strides[4] + - n * input1_strides[5]]) - - input1_zero_point()), - input2_scale() * (static_cast( - input2[i * input2_strides[0] + - j * input2_strides[1] + - k * input2_strides[2] + - l * input2_strides[3] + - m * input2_strides[4] + - n * input2_strides[5]]) - - input2_zero_point())) / - output_scale() + - static_cast(output_zero_point()); - } - } - } - } - } - } - - for (float& output_value : output_ref) { - output_value = std::max(output_value, static_cast(qmin())); - output_value = std::min(output_value, static_cast(qmax())); - } - - // Create, setup, run, and destroy a binary elementwise operator. 
- ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - xnn_operator_t binary_elementwise_op = nullptr; - xnn_status status = xnn_status_unsupported_parameter; - switch (operation_type()) { - case OperationType::Add: - status = xnn_create_add_nd_qu8( - input1_zero_point(), input1_scale(), input2_zero_point(), - input2_scale(), output_zero_point(), output_scale(), - static_cast(qmin()), static_cast(qmax()), 0, - &binary_elementwise_op); - break; - case OperationType::Multiply: - status = xnn_create_multiply_nd_qu8( - input1_zero_point(), input1_scale(), input2_zero_point(), - input2_scale(), output_zero_point(), output_scale(), - static_cast(qmin()), static_cast(qmax()), 0, - &binary_elementwise_op); - break; - case OperationType::Subtract: - status = xnn_create_subtract_nd_qu8( - input1_zero_point(), input1_scale(), input2_zero_point(), - input2_scale(), output_zero_point(), output_scale(), - static_cast(qmin()), static_cast(qmax()), 0, - &binary_elementwise_op); - break; - default: - FAIL() << "Unsupported operation type"; - } - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, binary_elementwise_op); - - // Smart pointer to automatically delete binary_elementwise_op. 
- std::unique_ptr - auto_binary_elementwise_op(binary_elementwise_op, xnn_delete_operator); - - switch (operation_type()) { - case OperationType::Add: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_add_nd_qu8(binary_elementwise_op, num_input1_dims(), - input1_shape().data(), num_input2_dims(), - input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, - xnn_setup_add_nd_qu8(binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - case OperationType::Multiply: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_multiply_nd_qu8( - binary_elementwise_op, num_input1_dims(), input1_shape().data(), - num_input2_dims(), input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_multiply_nd_qu8( - binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - case OperationType::Subtract: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_subtract_nd_qu8( - binary_elementwise_op, num_input1_dims(), input1_shape().data(), - num_input2_dims(), input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_subtract_nd_qu8( - binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - default: - FAIL() << "Unsupported operation type"; - } - - ASSERT_EQ(xnn_status_success, - xnn_run_operator(binary_elementwise_op, /*threadpool=*/nullptr)); - - // Verify results. 
- for (size_t i = 0; i < output_dims[0]; i++) { - for (size_t j = 0; j < output_dims[1]; j++) { - for (size_t k = 0; k < output_dims[2]; k++) { - for (size_t l = 0; l < output_dims[3]; l++) { - for (size_t m = 0; m < output_dims[4]; m++) { - for (size_t n = 0; n < output_dims[5]; n++) { - const size_t index = - i * output_strides[0] + j * output_strides[1] + - k * output_strides[2] + l * output_strides[3] + - m * output_strides[4] + n * output_strides[5]; - ASSERT_NEAR( - static_cast(static_cast(output[index])), - output_ref[index], 0.6f) - << "(i, j, k, l, m, n) = (" << i << ", " << j << ", " << k - << ", " << l << ", " << m << ", " << n << ")" - << ", input1 zero point = " << input1_zero_point() - << ", input1 scale = " << input1_scale() - << ", input2 zero point = " << input2_zero_point() - << ", input2 scale = " << input2_scale() - << ", output zero point = " << output_zero_point() - << ", output scale = " << output_scale(); - } - } - } - } - } - } - } -} - -void BinaryElementwiseOperatorTester::TestF16() const { - ASSERT_NE(operation_type(), OperationType::Unknown); - ASSERT_LT(qmin(), qmax()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(0.01f, 1.0f); - - // Compute generalized shapes. 
- std::array input1_dims; - std::array input2_dims; - std::array output_dims; - std::fill(input1_dims.begin(), input1_dims.end(), 1); - std::fill(input2_dims.begin(), input2_dims.end(), 1); - std::fill(output_dims.begin(), output_dims.end(), 1); - std::copy(input1_shape().cbegin(), input1_shape().cend(), - input1_dims.end() - num_input1_dims()); - std::copy(input2_shape().cbegin(), input2_shape().cend(), - input2_dims.end() - num_input2_dims()); - for (size_t i = 0; i < XNN_MAX_TENSOR_DIMS; i++) { - if (input1_dims[i] != 1 && input2_dims[i] != 1) { - ASSERT_EQ(input1_dims[i], input2_dims[i]); - } - output_dims[i] = std::max(input1_dims[i], input2_dims[i]); - } - const size_t num_output_elements = - std::accumulate(output_dims.begin(), output_dims.end(), size_t(1), - std::multiplies()); - - // Compute generalized strides. - std::array input1_strides; - std::array input2_strides; - std::array output_strides; - size_t input1_stride = 1, input2_stride = 1, output_stride = 1; - for (size_t i = XNN_MAX_TENSOR_DIMS; i != 0; i--) { - input1_strides[i - 1] = input1_dims[i - 1] == 1 ? 0 : input1_stride; - input2_strides[i - 1] = input2_dims[i - 1] == 1 ? 0 : input2_stride; - output_strides[i - 1] = output_stride; - input1_stride *= input1_dims[i - 1]; - input2_stride *= input2_dims[i - 1]; - output_stride *= output_dims[i - 1]; - } - - std::vector input1(XNN_EXTRA_BYTES / sizeof(xnn_float16) + - num_input1_elements()); - std::vector input2(XNN_EXTRA_BYTES / sizeof(xnn_float16) + - num_input2_elements()); - std::vector output(num_output_elements); - std::vector output_ref(num_output_elements); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input1.begin(), input1.end(), - [&]() { return f32dist(rng); }); - std::generate(input2.begin(), input2.end(), - [&]() { return f32dist(rng); }); - std::fill(output.begin(), output.end(), std::nanf("")); - - // Compute reference results. 
- for (size_t i = 0; i < output_dims[0]; i++) { - for (size_t j = 0; j < output_dims[1]; j++) { - for (size_t k = 0; k < output_dims[2]; k++) { - for (size_t l = 0; l < output_dims[3]; l++) { - for (size_t m = 0; m < output_dims[4]; m++) { - for (size_t n = 0; n < output_dims[5]; n++) { - output_ref[i * output_strides[0] + j * output_strides[1] + - k * output_strides[2] + l * output_strides[3] + - m * output_strides[4] + n * output_strides[5]] = - Compute( - input1[i * input1_strides[0] + - j * input1_strides[1] + - k * input1_strides[2] + - l * input1_strides[3] + - m * input1_strides[4] + - n * input1_strides[5]], - input2[i * input2_strides[0] + - j * input2_strides[1] + - k * input2_strides[2] + - l * input2_strides[3] + - m * input2_strides[4] + - n * input2_strides[5]]); - } - } - } - } - } - } - - // Compute clamping parameters. - const float accumulated_min = - *std::min_element(output_ref.cbegin(), output_ref.cend()); - const float accumulated_max = - *std::max_element(output_ref.cbegin(), output_ref.cend()); - const float accumulated_range = accumulated_max - accumulated_min; - float output_min = - accumulated_min + - accumulated_range * - (static_cast(qmin() - std::numeric_limits::min()) / - static_cast(std::numeric_limits::max() - - std::numeric_limits::min())); - if (qmin() == std::numeric_limits::min()) { - output_min = -std::numeric_limits::infinity(); - } - float output_max = - accumulated_max - - accumulated_range * - (static_cast(std::numeric_limits::max() - qmax()) / - static_cast(std::numeric_limits::max() - - std::numeric_limits::min())); - if (qmax() == std::numeric_limits::max()) { - output_max = +std::numeric_limits::infinity(); - } - output_min = xnn_float16(output_min); - output_max = xnn_float16(output_max); - - for (float& output_value : output_ref) { - output_value = std::max(output_value, output_min); - output_value = std::min(output_value, output_max); - } - - // Create, setup, run, and destroy a binary elementwise operator. 
- ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - xnn_operator_t binary_elementwise_op = nullptr; - xnn_status status = xnn_status_unsupported_parameter; - switch (operation_type()) { - case OperationType::Add: - status = xnn_create_add_nd_f16(output_min, output_max, 0, - &binary_elementwise_op); - break; - case OperationType::Divide: - status = xnn_create_divide_nd_f16(output_min, output_max, 0, - &binary_elementwise_op); - break; - case OperationType::Maximum: - status = xnn_create_maximum_nd_f16(0, &binary_elementwise_op); - break; - case OperationType::Minimum: - status = xnn_create_minimum_nd_f16(0, &binary_elementwise_op); - break; - case OperationType::Multiply: - status = xnn_create_multiply_nd_f16(output_min, output_max, 0, - &binary_elementwise_op); - break; - case OperationType::SquaredDifference: - status = - xnn_create_squared_difference_nd_f16(0, &binary_elementwise_op); - break; - case OperationType::Subtract: - status = xnn_create_subtract_nd_f16(output_min, output_max, 0, - &binary_elementwise_op); - break; - default: - FAIL() << "Unsupported operation type"; - } - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, binary_elementwise_op); - - // Smart pointer to automatically delete binary_elementwise_op. 
- std::unique_ptr - auto_binary_elementwise_op(binary_elementwise_op, xnn_delete_operator); - - switch (operation_type()) { - case OperationType::Add: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_add_nd_f16(binary_elementwise_op, num_input1_dims(), - input1_shape().data(), num_input2_dims(), - input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, - xnn_setup_add_nd_f16(binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - case OperationType::Divide: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_divide_nd_f16(binary_elementwise_op, num_input1_dims(), - input1_shape().data(), num_input2_dims(), - input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, - xnn_setup_divide_nd_f16(binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - case OperationType::Maximum: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_maximum_nd_f16(binary_elementwise_op, num_input1_dims(), - input1_shape().data(), num_input2_dims(), - input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, - xnn_setup_maximum_nd_f16(binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - case OperationType::Minimum: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_minimum_nd_f16(binary_elementwise_op, num_input1_dims(), - input1_shape().data(), num_input2_dims(), - input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, - xnn_setup_minimum_nd_f16(binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - case OperationType::Multiply: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_multiply_nd_f16( - binary_elementwise_op, num_input1_dims(), input1_shape().data(), - num_input2_dims(), input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_multiply_nd_f16( - binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - case 
OperationType::SquaredDifference: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_squared_difference_nd_f16( - binary_elementwise_op, num_input1_dims(), input1_shape().data(), - num_input2_dims(), input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_squared_difference_nd_f16( - binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - case OperationType::Subtract: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_subtract_nd_f16( - binary_elementwise_op, num_input1_dims(), input1_shape().data(), - num_input2_dims(), input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_subtract_nd_f16( - binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - default: - FAIL() << "Unsupported operation type"; - } - - ASSERT_EQ(xnn_status_success, - xnn_run_operator(binary_elementwise_op, /*threadpool=*/nullptr)); - - // Verify results. - for (size_t i = 0; i < output_dims[0]; i++) { - for (size_t j = 0; j < output_dims[1]; j++) { - for (size_t k = 0; k < output_dims[2]; k++) { - for (size_t l = 0; l < output_dims[3]; l++) { - for (size_t m = 0; m < output_dims[4]; m++) { - for (size_t n = 0; n < output_dims[5]; n++) { - const size_t index = - i * output_strides[0] + j * output_strides[1] + - k * output_strides[2] + l * output_strides[3] + - m * output_strides[4] + n * output_strides[5]; - ASSERT_NEAR( - output[index], output_ref[index], - std::max(1.0e-4f, std::abs(output_ref[index]) * 1.0e-2f)) - << "(i, j, k, l, m, n) = (" << i << ", " << j << ", " << k - << ", " << l << ", " << m << ", " << n << ")"; - } - } - } - } - } - } - } -} - -void BinaryElementwiseOperatorTester::TestS32() const { - ASSERT_NE(operation_type(), OperationType::Unknown); - ASSERT_LT(qmin(), qmax()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution s32dist(-10000, 10000); - // Compute generalized shapes. 
- std::array input1_dims; - std::array input2_dims; - std::array output_dims; - std::fill(input1_dims.begin(), input1_dims.end(), 1); - std::fill(input2_dims.begin(), input2_dims.end(), 1); - std::fill(output_dims.begin(), output_dims.end(), 1); - std::copy(input1_shape().cbegin(), input1_shape().cend(), - input1_dims.end() - num_input1_dims()); - std::copy(input2_shape().cbegin(), input2_shape().cend(), - input2_dims.end() - num_input2_dims()); - for (size_t i = 0; i < XNN_MAX_TENSOR_DIMS; i++) { - if (input1_dims[i] != 1 && input2_dims[i] != 1) { - ASSERT_EQ(input1_dims[i], input2_dims[i]); - } - output_dims[i] = std::max(input1_dims[i], input2_dims[i]); - } - const size_t num_output_elements = - std::accumulate(output_dims.begin(), output_dims.end(), size_t(1), - std::multiplies()); - - // Compute generalized strides. - std::array input1_strides; - std::array input2_strides; - std::array output_strides; - size_t input1_stride = 1, input2_stride = 1, output_stride = 1; - for (size_t i = XNN_MAX_TENSOR_DIMS; i != 0; i--) { - input1_strides[i - 1] = input1_dims[i - 1] == 1 ? 0 : input1_stride; - input2_strides[i - 1] = input2_dims[i - 1] == 1 ? 0 : input2_stride; - output_strides[i - 1] = output_stride; - input1_stride *= input1_dims[i - 1]; - input2_stride *= input2_dims[i - 1]; - output_stride *= output_dims[i - 1]; - } - - std::vector input1(XNN_EXTRA_BYTES / sizeof(int32_t) + - num_input1_elements()); - std::vector input2(XNN_EXTRA_BYTES / sizeof(int32_t) + - num_input2_elements()); - std::vector output(num_output_elements); - std::vector output_ref(num_output_elements); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input1.begin(), input1.end(), [&]() { return s32dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return s32dist(rng); }); - std::fill(output.begin(), output.end(), INT_MAX); - - // Compute reference results. 
- for (size_t i = 0; i < output_dims[0]; i++) { - for (size_t j = 0; j < output_dims[1]; j++) { - for (size_t k = 0; k < output_dims[2]; k++) { - for (size_t l = 0; l < output_dims[3]; l++) { - for (size_t m = 0; m < output_dims[4]; m++) { - for (size_t n = 0; n < output_dims[5]; n++) { - output_ref[i * output_strides[0] + j * output_strides[1] + - k * output_strides[2] + l * output_strides[3] + - m * output_strides[4] + n * output_strides[5]] = - Compute( - input1[i * input1_strides[0] + j * input1_strides[1] + - k * input1_strides[2] + l * input1_strides[3] + - m * input1_strides[4] + n * input1_strides[5]], - input2[i * input2_strides[0] + j * input2_strides[1] + - k * input2_strides[2] + l * input2_strides[3] + - m * input2_strides[4] + n * input2_strides[5]]); - } - } - } - } - } - } - - // Create, setup, run, and destroy a binary elementwise operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - xnn_operator_t binary_elementwise_op = nullptr; - - switch (operation_type()) { - case OperationType::Multiply: - ASSERT_EQ(xnn_status_success, - xnn_create_multiply_nd_s32(0, &binary_elementwise_op)); - break; - default: - FAIL() << "Unsupported operation type"; - } - ASSERT_NE(nullptr, binary_elementwise_op); - - // Smart pointer to automatically delete binary_elementwise_op. 
- std::unique_ptr - auto_binary_elementwise_op(binary_elementwise_op, xnn_delete_operator); - - switch (operation_type()) { - case OperationType::Multiply: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_multiply_nd_s32( - binary_elementwise_op, num_input1_dims(), input1_shape().data(), - num_input2_dims(), input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_multiply_nd_s32( - binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - default: - FAIL() << "Unsupported operation type"; - } - - ASSERT_EQ(xnn_status_success, - xnn_run_operator(binary_elementwise_op, /*threadpool=*/nullptr)); - // Verify results. - - for (size_t i = 0; i < output_dims[0]; i++) { - for (size_t j = 0; j < output_dims[1]; j++) { - for (size_t k = 0; k < output_dims[2]; k++) { - for (size_t l = 0; l < output_dims[3]; l++) { - for (size_t m = 0; m < output_dims[4]; m++) { - for (size_t n = 0; n < output_dims[5]; n++) { - const size_t index = - i * output_strides[0] + j * output_strides[1] + - k * output_strides[2] + l * output_strides[3] + - m * output_strides[4] + n * output_strides[5]; - ASSERT_EQ(output[index], output_ref[index]); - } - } - } - } - } - } - } -} - - -void BinaryElementwiseOperatorTester::TestF32() const { - ASSERT_NE(operation_type(), OperationType::Unknown); - ASSERT_LT(qmin(), qmax()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(0.01f, 1.0f); - - // Compute generalized shapes. 
- std::array input1_dims; - std::array input2_dims; - std::array output_dims; - std::fill(input1_dims.begin(), input1_dims.end(), 1); - std::fill(input2_dims.begin(), input2_dims.end(), 1); - std::fill(output_dims.begin(), output_dims.end(), 1); - std::copy(input1_shape().cbegin(), input1_shape().cend(), - input1_dims.end() - num_input1_dims()); - std::copy(input2_shape().cbegin(), input2_shape().cend(), - input2_dims.end() - num_input2_dims()); - for (size_t i = 0; i < XNN_MAX_TENSOR_DIMS; i++) { - if (input1_dims[i] != 1 && input2_dims[i] != 1) { - ASSERT_EQ(input1_dims[i], input2_dims[i]); - } - output_dims[i] = std::max(input1_dims[i], input2_dims[i]); - } - const size_t num_output_elements = - std::accumulate(output_dims.begin(), output_dims.end(), size_t(1), - std::multiplies()); - - // Compute generalized strides. - std::array input1_strides; - std::array input2_strides; - std::array output_strides; - size_t input1_stride = 1, input2_stride = 1, output_stride = 1; - for (size_t i = XNN_MAX_TENSOR_DIMS; i != 0; i--) { - input1_strides[i - 1] = input1_dims[i - 1] == 1 ? 0 : input1_stride; - input2_strides[i - 1] = input2_dims[i - 1] == 1 ? 0 : input2_stride; - output_strides[i - 1] = output_stride; - input1_stride *= input1_dims[i - 1]; - input2_stride *= input2_dims[i - 1]; - output_stride *= output_dims[i - 1]; - } - - std::vector input1(XNN_EXTRA_BYTES / sizeof(float) + - num_input1_elements()); - std::vector input2(XNN_EXTRA_BYTES / sizeof(float) + - num_input2_elements()); - std::vector output(num_output_elements); - std::vector output_ref(num_output_elements); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input1.begin(), input1.end(), [&]() { return f32dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return f32dist(rng); }); - std::fill(output.begin(), output.end(), nanf("")); - - // Compute reference results. 
- for (size_t i = 0; i < output_dims[0]; i++) { - for (size_t j = 0; j < output_dims[1]; j++) { - for (size_t k = 0; k < output_dims[2]; k++) { - for (size_t l = 0; l < output_dims[3]; l++) { - for (size_t m = 0; m < output_dims[4]; m++) { - for (size_t n = 0; n < output_dims[5]; n++) { - output_ref[i * output_strides[0] + j * output_strides[1] + - k * output_strides[2] + l * output_strides[3] + - m * output_strides[4] + n * output_strides[5]] = - Compute( - input1[i * input1_strides[0] + j * input1_strides[1] + - k * input1_strides[2] + l * input1_strides[3] + - m * input1_strides[4] + n * input1_strides[5]], - input2[i * input2_strides[0] + j * input2_strides[1] + - k * input2_strides[2] + l * input2_strides[3] + - m * input2_strides[4] + n * input2_strides[5]]); - } - } - } - } - } - } - const float accumulated_min = - *std::min_element(output_ref.cbegin(), output_ref.cend()); - const float accumulated_max = - *std::max_element(output_ref.cbegin(), output_ref.cend()); - const float accumulated_range = accumulated_max - accumulated_min; - float output_min = - accumulated_min + - accumulated_range * - (static_cast(qmin() - std::numeric_limits::min()) / - static_cast(std::numeric_limits::max() - - std::numeric_limits::min())); - if (qmin() == std::numeric_limits::min()) { - output_min = -std::numeric_limits::infinity(); - } - float output_max = - accumulated_max - - accumulated_range * - (static_cast(std::numeric_limits::max() - qmax()) / - static_cast(std::numeric_limits::max() - - std::numeric_limits::min())); - if (qmax() == std::numeric_limits::max()) { - output_max = +std::numeric_limits::infinity(); - } - for (float& output_value : output_ref) { - output_value = std::max(output_value, output_min); - output_value = std::min(output_value, output_max); - } - - // Create, setup, run, and destroy a binary elementwise operator. 
- ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - xnn_operator_t binary_elementwise_op = nullptr; - - switch (operation_type()) { - case OperationType::Add: - ASSERT_EQ(xnn_status_success, - xnn_create_add_nd_f32(output_min, output_max, 0, - &binary_elementwise_op)); - break; - case OperationType::CopySign: - ASSERT_EQ(xnn_status_success, - xnn_create_copysign_nd_f32(/*flags=*/0, &binary_elementwise_op)); - break; - case OperationType::Divide: - ASSERT_EQ(xnn_status_success, - xnn_create_divide_nd_f32(output_min, output_max, 0, - &binary_elementwise_op)); - break; - case OperationType::Maximum: - ASSERT_EQ(xnn_status_success, - xnn_create_maximum_nd_f32(0, &binary_elementwise_op)); - break; - case OperationType::Minimum: - ASSERT_EQ(xnn_status_success, - xnn_create_minimum_nd_f32(0, &binary_elementwise_op)); - break; - case OperationType::Multiply: - ASSERT_EQ(xnn_status_success, - xnn_create_multiply_nd_f32(output_min, output_max, 0, - &binary_elementwise_op)); - break; - case OperationType::Subtract: - ASSERT_EQ(xnn_status_success, - xnn_create_subtract_nd_f32(output_min, output_max, 0, - &binary_elementwise_op)); - break; - case OperationType::SquaredDifference: - ASSERT_EQ(xnn_status_success, xnn_create_squared_difference_nd_f32( - 0, &binary_elementwise_op)); - break; - default: - FAIL() << "Unsupported operation type"; - } - ASSERT_NE(nullptr, binary_elementwise_op); - - // Smart pointer to automatically delete binary_elementwise_op. 
- std::unique_ptr - auto_binary_elementwise_op(binary_elementwise_op, xnn_delete_operator); - - switch (operation_type()) { - case OperationType::Add: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_add_nd_f32(binary_elementwise_op, num_input1_dims(), - input1_shape().data(), num_input2_dims(), - input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, - xnn_setup_add_nd_f32(binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - case OperationType::CopySign: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_copysign_nd_f32(binary_elementwise_op, num_input1_dims(), - input1_shape().data(), num_input2_dims(), - input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, - xnn_setup_copysign_nd_f32(binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - case OperationType::Divide: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_divide_nd_f32(binary_elementwise_op, num_input1_dims(), - input1_shape().data(), num_input2_dims(), - input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, - xnn_setup_divide_nd_f32(binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - case OperationType::Maximum: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_maximum_nd_f32(binary_elementwise_op, num_input1_dims(), - input1_shape().data(), num_input2_dims(), - input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, - xnn_setup_maximum_nd_f32(binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - case OperationType::Minimum: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_minimum_nd_f32(binary_elementwise_op, num_input1_dims(), - input1_shape().data(), num_input2_dims(), - input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, - xnn_setup_minimum_nd_f32(binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - case 
OperationType::Multiply: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_multiply_nd_f32( - binary_elementwise_op, num_input1_dims(), input1_shape().data(), - num_input2_dims(), input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_multiply_nd_f32( - binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - case OperationType::Subtract: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_subtract_nd_f32( - binary_elementwise_op, num_input1_dims(), input1_shape().data(), - num_input2_dims(), input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_subtract_nd_f32( - binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - case OperationType::SquaredDifference: - ASSERT_EQ( - xnn_status_success, - xnn_reshape_squared_difference_nd_f32( - binary_elementwise_op, num_input1_dims(), input1_shape().data(), - num_input2_dims(), input2_shape().data(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_squared_difference_nd_f32( - binary_elementwise_op, input1.data(), - input2.data(), output.data())); - break; - default: - FAIL() << "Unsupported operation type"; - } - - ASSERT_EQ(xnn_status_success, - xnn_run_operator(binary_elementwise_op, /*threadpool=*/nullptr)); - - // Verify results. 
- for (size_t i = 0; i < output_dims[0]; i++) { - for (size_t j = 0; j < output_dims[1]; j++) { - for (size_t k = 0; k < output_dims[2]; k++) { - for (size_t l = 0; l < output_dims[3]; l++) { - for (size_t m = 0; m < output_dims[4]; m++) { - for (size_t n = 0; n < output_dims[5]; n++) { - const size_t index = - i * output_strides[0] + j * output_strides[1] + - k * output_strides[2] + l * output_strides[3] + - m * output_strides[4] + n * output_strides[5]; - ASSERT_NEAR(output[index], output_ref[index], - 1.0e-6f * std::abs(output_ref[index])) - << "(i, j, k, l, m, n) = (" << i << ", " << j << ", " << k - << ", " << l << ", " << m << ", " << n << ")"; - } - } - } - } - } - } - } -} - -void BinaryElementwiseOperatorTester::TestRunF32() const { - ASSERT_NE(operation_type(), OperationType::Unknown); - ASSERT_LT(qmin(), qmax()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(0.01f, 1.0f); - - // Compute generalized shapes. - std::array input1_dims; - std::array input2_dims; - std::array output_dims; - std::fill(input1_dims.begin(), input1_dims.end(), 1); - std::fill(input2_dims.begin(), input2_dims.end(), 1); - std::fill(output_dims.begin(), output_dims.end(), 1); - std::copy(input1_shape().cbegin(), input1_shape().cend(), - input1_dims.end() - num_input1_dims()); - std::copy(input2_shape().cbegin(), input2_shape().cend(), - input2_dims.end() - num_input2_dims()); - for (size_t i = 0; i < XNN_MAX_TENSOR_DIMS; i++) { - if (input1_dims[i] != 1 && input2_dims[i] != 1) { - ASSERT_EQ(input1_dims[i], input2_dims[i]); - } - output_dims[i] = std::max(input1_dims[i], input2_dims[i]); - } - const size_t num_output_elements = - std::accumulate(output_dims.begin(), output_dims.end(), size_t(1), - std::multiplies()); - - // Compute generalized strides. 
- std::array input1_strides; - std::array input2_strides; - std::array output_strides; - size_t input1_stride = 1, input2_stride = 1, output_stride = 1; - for (size_t i = XNN_MAX_TENSOR_DIMS; i != 0; i--) { - input1_strides[i - 1] = input1_dims[i - 1] == 1 ? 0 : input1_stride; - input2_strides[i - 1] = input2_dims[i - 1] == 1 ? 0 : input2_stride; - output_strides[i - 1] = output_stride; - input1_stride *= input1_dims[i - 1]; - input2_stride *= input2_dims[i - 1]; - output_stride *= output_dims[i - 1]; - } - - std::vector input1(XNN_EXTRA_BYTES / sizeof(float) + - num_input1_elements()); - std::vector input2(XNN_EXTRA_BYTES / sizeof(float) + - num_input2_elements()); - std::vector output(num_output_elements); - std::vector output_ref(num_output_elements); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input1.begin(), input1.end(), [&]() { return f32dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return f32dist(rng); }); - std::fill(output.begin(), output.end(), nanf("")); - - // Compute reference results. 
- for (size_t i = 0; i < output_dims[0]; i++) { - for (size_t j = 0; j < output_dims[1]; j++) { - for (size_t k = 0; k < output_dims[2]; k++) { - for (size_t l = 0; l < output_dims[3]; l++) { - for (size_t m = 0; m < output_dims[4]; m++) { - for (size_t n = 0; n < output_dims[5]; n++) { - output_ref[i * output_strides[0] + j * output_strides[1] + - k * output_strides[2] + l * output_strides[3] + - m * output_strides[4] + n * output_strides[5]] = - Compute( - input1[i * input1_strides[0] + j * input1_strides[1] + - k * input1_strides[2] + l * input1_strides[3] + - m * input1_strides[4] + n * input1_strides[5]], - input2[i * input2_strides[0] + j * input2_strides[1] + - k * input2_strides[2] + l * input2_strides[3] + - m * input2_strides[4] + n * input2_strides[5]]); - } - } - } - } - } - } - const float accumulated_min = - *std::min_element(output_ref.cbegin(), output_ref.cend()); - const float accumulated_max = - *std::max_element(output_ref.cbegin(), output_ref.cend()); - const float accumulated_range = accumulated_max - accumulated_min; - float output_min = - accumulated_min + - accumulated_range * - (static_cast(qmin() - std::numeric_limits::min()) / - static_cast(std::numeric_limits::max() - - std::numeric_limits::min())); - if (qmin() == std::numeric_limits::min()) { - output_min = -std::numeric_limits::infinity(); - } - float output_max = - accumulated_max - - accumulated_range * - (static_cast(std::numeric_limits::max() - qmax()) / - static_cast(std::numeric_limits::max() - - std::numeric_limits::min())); - if (qmax() == std::numeric_limits::max()) { - output_max = +std::numeric_limits::infinity(); - } - for (float& output_value : output_ref) { - output_value = std::max(output_value, output_min); - output_value = std::min(output_value, output_max); - } - - switch (operation_type()) { - case OperationType::Add: - ASSERT_EQ(xnn_status_success, - xnn_run_add_nd_f32(num_input1_dims(), input1_shape().data(), - num_input2_dims(), input2_shape().data(), - 
input1.data(), input2.data(), - output.data(), output_min, output_max, 0, - /*threadpool=*/nullptr)); - break; - case OperationType::Divide: - ASSERT_EQ(xnn_status_success, - xnn_run_divide_nd_f32( - num_input1_dims(), input1_shape().data(), - num_input2_dims(), input2_shape().data(), input1.data(), - input2.data(), output.data(), output_min, output_max, 0, - /*threadpool=*/nullptr)); - break; - case OperationType::Maximum: - ASSERT_EQ(xnn_status_success, - xnn_run_maximum_nd_f32( - num_input1_dims(), input1_shape().data(), - num_input2_dims(), input2_shape().data(), input1.data(), - input2.data(), output.data(), 0, - /*threadpool=*/nullptr)); - break; - case OperationType::Minimum: - ASSERT_EQ(xnn_status_success, - xnn_run_minimum_nd_f32( - num_input1_dims(), input1_shape().data(), - num_input2_dims(), input2_shape().data(), input1.data(), - input2.data(), output.data(), 0, - /*threadpool=*/nullptr)); - break; - case OperationType::Multiply: - ASSERT_EQ(xnn_status_success, - xnn_run_multiply_nd_f32( - num_input1_dims(), input1_shape().data(), - num_input2_dims(), input2_shape().data(), input1.data(), - input2.data(), output.data(), output_min, output_max, 0, - /*threadpool=*/nullptr)); - break; - case OperationType::Subtract: - ASSERT_EQ(xnn_status_success, - xnn_run_subtract_nd_f32( - num_input1_dims(), input1_shape().data(), - num_input2_dims(), input2_shape().data(), input1.data(), - input2.data(), output.data(), output_min, output_max, 0, - /*threadpool=*/nullptr)); - break; - case OperationType::SquaredDifference: - ASSERT_EQ(xnn_status_success, - xnn_run_squared_difference_nd_f32( - num_input1_dims(), input1_shape().data(), - num_input2_dims(), input2_shape().data(), input1.data(), - input2.data(), output.data(), 0, - /*threadpool=*/nullptr)); - break; - default: - FAIL() << "Unsupported operation type"; - } - - // Verify results. 
- for (size_t i = 0; i < output_dims[0]; i++) { - for (size_t j = 0; j < output_dims[1]; j++) { - for (size_t k = 0; k < output_dims[2]; k++) { - for (size_t l = 0; l < output_dims[3]; l++) { - for (size_t m = 0; m < output_dims[4]; m++) { - for (size_t n = 0; n < output_dims[5]; n++) { - const size_t index = - i * output_strides[0] + j * output_strides[1] + - k * output_strides[2] + l * output_strides[3] + - m * output_strides[4] + n * output_strides[5]; - ASSERT_NEAR(output[index], output_ref[index], - 1.0e-6f * std::abs(output_ref[index])) - << "(i, j, k, l, m, n) = (" << i << ", " << j << ", " << k - << ", " << l << ", " << m << ", " << n << ")"; - } - } - } - } - } - } - } -} - -void BinaryElementwiseOperatorTester::TestRunQS8() const { - ASSERT_NE(operation_type(), OperationType::Unknown); - ASSERT_GE(input1_zero_point(), std::numeric_limits::min()); - ASSERT_LE(input1_zero_point(), std::numeric_limits::max()); - ASSERT_GE(input2_zero_point(), std::numeric_limits::min()); - ASSERT_LE(input2_zero_point(), std::numeric_limits::max()); - ASSERT_GE(output_zero_point(), std::numeric_limits::min()); - ASSERT_LE(output_zero_point(), std::numeric_limits::max()); - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution i8dist( - std::numeric_limits::min(), std::numeric_limits::max()); - - // Compute generalized shapes. 
- std::array input1_dims; - std::array input2_dims; - std::array output_dims; - std::fill(input1_dims.begin(), input1_dims.end(), 1); - std::fill(input2_dims.begin(), input2_dims.end(), 1); - std::fill(output_dims.begin(), output_dims.end(), 1); - std::copy(input1_shape().cbegin(), input1_shape().cend(), - input1_dims.end() - num_input1_dims()); - std::copy(input2_shape().cbegin(), input2_shape().cend(), - input2_dims.end() - num_input2_dims()); - for (size_t i = 0; i < XNN_MAX_TENSOR_DIMS; i++) { - if (input1_dims[i] != 1 && input2_dims[i] != 1) { - ASSERT_EQ(input1_dims[i], input2_dims[i]); - } - output_dims[i] = std::max(input1_dims[i], input2_dims[i]); - } - const size_t num_output_elements = - std::accumulate(output_dims.begin(), output_dims.end(), size_t(1), - std::multiplies()); - - // Compute generalized strides. - std::array input1_strides; - std::array input2_strides; - std::array output_strides; - size_t input1_stride = 1, input2_stride = 1, output_stride = 1; - for (size_t i = XNN_MAX_TENSOR_DIMS; i != 0; i--) { - input1_strides[i - 1] = input1_dims[i - 1] == 1 ? 0 : input1_stride; - input2_strides[i - 1] = input2_dims[i - 1] == 1 ? 0 : input2_stride; - output_strides[i - 1] = output_stride; - input1_stride *= input1_dims[i - 1]; - input2_stride *= input2_dims[i - 1]; - output_stride *= output_dims[i - 1]; - } - - std::vector input1(XNN_EXTRA_BYTES / sizeof(xnn_float16) + - num_input1_elements()); - std::vector input2(XNN_EXTRA_BYTES / sizeof(xnn_float16) + - num_input2_elements()); - std::vector output(num_output_elements); - std::vector output_ref(num_output_elements); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input1.begin(), input1.end(), [&]() { return i8dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return i8dist(rng); }); - std::fill(output.begin(), output.end(), 0xAA); - - // Compute reference results. 
- for (size_t i = 0; i < output_dims[0]; i++) { - for (size_t j = 0; j < output_dims[1]; j++) { - for (size_t k = 0; k < output_dims[2]; k++) { - for (size_t l = 0; l < output_dims[3]; l++) { - for (size_t m = 0; m < output_dims[4]; m++) { - for (size_t n = 0; n < output_dims[5]; n++) { - output_ref[i * output_strides[0] + j * output_strides[1] + - k * output_strides[2] + l * output_strides[3] + - m * output_strides[4] + n * output_strides[5]] = - Compute( - input1_scale() * (static_cast( - input1[i * input1_strides[0] + - j * input1_strides[1] + - k * input1_strides[2] + - l * input1_strides[3] + - m * input1_strides[4] + - n * input1_strides[5]]) - - input1_zero_point()), - input2_scale() * (static_cast( - input2[i * input2_strides[0] + - j * input2_strides[1] + - k * input2_strides[2] + - l * input2_strides[3] + - m * input2_strides[4] + - n * input2_strides[5]]) - - input2_zero_point())) / - output_scale() + - static_cast(output_zero_point()); - } - } - } - } - } - } - - for (float& output_value : output_ref) { - output_value = std::max(output_value, static_cast(qmin())); - output_value = std::min(output_value, static_cast(qmax())); - } - - switch (operation_type()) { - case OperationType::Add: - ASSERT_EQ( - xnn_status_success, - xnn_run_add_nd_qs8( - num_input1_dims(), input1_shape().data(), input1_zero_point(), - input1_scale(), num_input2_dims(), input2_shape().data(), - input2_zero_point(), input2_scale(), input1.data(), - input2.data(), output.data(), output_zero_point(), - output_scale(), static_cast(qmin()), - static_cast(qmax()), 0, - /*threadpool=*/nullptr)); - break; - case OperationType::Multiply: - ASSERT_EQ( - xnn_status_success, - xnn_run_multiply_nd_qs8( - num_input1_dims(), input1_shape().data(), input1_zero_point(), - input1_scale(), num_input2_dims(), input2_shape().data(), - input2_zero_point(), input2_scale(), input1.data(), - input2.data(), output.data(), output_zero_point(), - output_scale(), static_cast(qmin()), - static_cast(qmax()), 0, 
- /*threadpool=*/nullptr)); - break; - case OperationType::Subtract: - ASSERT_EQ( - xnn_status_success, - xnn_run_subtract_nd_qs8( - num_input1_dims(), input1_shape().data(), input1_zero_point(), - input1_scale(), num_input2_dims(), input2_shape().data(), - input2_zero_point(), input2_scale(), input1.data(), - input2.data(), output.data(), output_zero_point(), - output_scale(), static_cast(qmin()), - static_cast(qmax()), 0, - /*threadpool=*/nullptr)); - break; - default: - FAIL() << "Unsupported operation type"; - } - - // Verify results. - for (size_t i = 0; i < output_dims[0]; i++) { - for (size_t j = 0; j < output_dims[1]; j++) { - for (size_t k = 0; k < output_dims[2]; k++) { - for (size_t l = 0; l < output_dims[3]; l++) { - for (size_t m = 0; m < output_dims[4]; m++) { - for (size_t n = 0; n < output_dims[5]; n++) { - const size_t index = - i * output_strides[0] + j * output_strides[1] + - k * output_strides[2] + l * output_strides[3] + - m * output_strides[4] + n * output_strides[5]; - ASSERT_NEAR(static_cast(output[index]), - output_ref[index], 0.6f) - << "(i, j, k, l, m, n) = (" << i << ", " << j << ", " << k - << ", " << l << ", " << m << ", " << n << ")" - << ", input1 zero point = " << input1_zero_point() - << ", input1 scale = " << input1_scale() - << ", input2 zero point = " << input2_zero_point() - << ", input2 scale = " << input2_scale() - << ", output zero point = " << output_zero_point() - << ", output scale = " << output_scale(); - } - } - } - } - } - } - } -} - -void BinaryElementwiseOperatorTester::TestRunQU8() const { - ASSERT_NE(operation_type(), OperationType::Unknown); - ASSERT_GE(input1_zero_point(), std::numeric_limits::min()); - ASSERT_LE(input1_zero_point(), std::numeric_limits::max()); - ASSERT_GE(input2_zero_point(), std::numeric_limits::min()); - ASSERT_LE(input2_zero_point(), std::numeric_limits::max()); - ASSERT_GE(output_zero_point(), std::numeric_limits::min()); - ASSERT_LE(output_zero_point(), std::numeric_limits::max()); - 
ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution u8dist( - std::numeric_limits::min(), std::numeric_limits::max()); - - // Compute generalized shapes. - std::array input1_dims; - std::array input2_dims; - std::array output_dims; - std::fill(input1_dims.begin(), input1_dims.end(), 1); - std::fill(input2_dims.begin(), input2_dims.end(), 1); - std::fill(output_dims.begin(), output_dims.end(), 1); - std::copy(input1_shape().cbegin(), input1_shape().cend(), - input1_dims.end() - num_input1_dims()); - std::copy(input2_shape().cbegin(), input2_shape().cend(), - input2_dims.end() - num_input2_dims()); - for (size_t i = 0; i < XNN_MAX_TENSOR_DIMS; i++) { - if (input1_dims[i] != 1 && input2_dims[i] != 1) { - ASSERT_EQ(input1_dims[i], input2_dims[i]); - } - output_dims[i] = std::max(input1_dims[i], input2_dims[i]); - } - const size_t num_output_elements = - std::accumulate(output_dims.begin(), output_dims.end(), size_t(1), - std::multiplies()); - - // Compute generalized strides. - std::array input1_strides; - std::array input2_strides; - std::array output_strides; - size_t input1_stride = 1, input2_stride = 1, output_stride = 1; - for (size_t i = XNN_MAX_TENSOR_DIMS; i != 0; i--) { - input1_strides[i - 1] = input1_dims[i - 1] == 1 ? 0 : input1_stride; - input2_strides[i - 1] = input2_dims[i - 1] == 1 ? 
0 : input2_stride; - output_strides[i - 1] = output_stride; - input1_stride *= input1_dims[i - 1]; - input2_stride *= input2_dims[i - 1]; - output_stride *= output_dims[i - 1]; - } - - std::vector input1(XNN_EXTRA_BYTES / sizeof(xnn_float16) + - num_input1_elements()); - std::vector input2(XNN_EXTRA_BYTES / sizeof(xnn_float16) + - num_input2_elements()); - std::vector output(num_output_elements); - std::vector output_ref(num_output_elements); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input1.begin(), input1.end(), [&]() { return u8dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return u8dist(rng); }); - std::fill(output.begin(), output.end(), 0xAA); - - // Compute reference results. - for (size_t i = 0; i < output_dims[0]; i++) { - for (size_t j = 0; j < output_dims[1]; j++) { - for (size_t k = 0; k < output_dims[2]; k++) { - for (size_t l = 0; l < output_dims[3]; l++) { - for (size_t m = 0; m < output_dims[4]; m++) { - for (size_t n = 0; n < output_dims[5]; n++) { - output_ref[i * output_strides[0] + j * output_strides[1] + - k * output_strides[2] + l * output_strides[3] + - m * output_strides[4] + n * output_strides[5]] = - Compute( - input1_scale() * (static_cast( - input1[i * input1_strides[0] + - j * input1_strides[1] + - k * input1_strides[2] + - l * input1_strides[3] + - m * input1_strides[4] + - n * input1_strides[5]]) - - input1_zero_point()), - input2_scale() * (static_cast( - input2[i * input2_strides[0] + - j * input2_strides[1] + - k * input2_strides[2] + - l * input2_strides[3] + - m * input2_strides[4] + - n * input2_strides[5]]) - - input2_zero_point())) / - output_scale() + - static_cast(output_zero_point()); - } - } - } - } - } - } - - for (float& output_value : output_ref) { - output_value = std::max(output_value, static_cast(qmin())); - output_value = std::min(output_value, static_cast(qmax())); - } - - switch (operation_type()) { - case OperationType::Add: - ASSERT_EQ( - 
xnn_status_success, - xnn_run_add_nd_qu8( - num_input1_dims(), input1_shape().data(), input1_zero_point(), - input1_scale(), num_input2_dims(), input2_shape().data(), - input2_zero_point(), input2_scale(), input1.data(), - input2.data(), output.data(), output_zero_point(), - output_scale(), static_cast(qmin()), - static_cast(qmax()), 0, - /*threadpool=*/nullptr)); - break; - case OperationType::Multiply: - ASSERT_EQ( - xnn_status_success, - xnn_run_multiply_nd_qu8( - num_input1_dims(), input1_shape().data(), input1_zero_point(), - input1_scale(), num_input2_dims(), input2_shape().data(), - input2_zero_point(), input2_scale(), input1.data(), - input2.data(), output.data(), output_zero_point(), - output_scale(), static_cast(qmin()), - static_cast(qmax()), 0, - /*threadpool=*/nullptr)); - break; - case OperationType::Subtract: - ASSERT_EQ( - xnn_status_success, - xnn_run_subtract_nd_qu8( - num_input1_dims(), input1_shape().data(), input1_zero_point(), - input1_scale(), num_input2_dims(), input2_shape().data(), - input2_zero_point(), input2_scale(), input1.data(), - input2.data(), output.data(), output_zero_point(), - output_scale(), static_cast(qmin()), - static_cast(qmax()), 0, - /*threadpool=*/nullptr)); - break; - default: - FAIL() << "Unsupported operation type"; - } - - // Verify results. 
- for (size_t i = 0; i < output_dims[0]; i++) { - for (size_t j = 0; j < output_dims[1]; j++) { - for (size_t k = 0; k < output_dims[2]; k++) { - for (size_t l = 0; l < output_dims[3]; l++) { - for (size_t m = 0; m < output_dims[4]; m++) { - for (size_t n = 0; n < output_dims[5]; n++) { - const size_t index = - i * output_strides[0] + j * output_strides[1] + - k * output_strides[2] + l * output_strides[3] + - m * output_strides[4] + n * output_strides[5]; - ASSERT_NEAR( - static_cast(static_cast(output[index])), - output_ref[index], 0.6f) - << "(i, j, k, l, m, n) = (" << i << ", " << j << ", " << k - << ", " << l << ", " << m << ", " << n << ")" - << ", input1 zero point = " << input1_zero_point() - << ", input1 scale = " << input1_scale() - << ", input2 zero point = " << input2_zero_point() - << ", input2 scale = " << input2_scale() - << ", output zero point = " << output_zero_point() - << ", output scale = " << output_scale(); - } - } - } - } - } - } - } -} diff --git a/test/binary-elementwise-operator-tester.h b/test/binary-elementwise-operator-tester.h deleted file mode 100644 index afc27cd1425..00000000000 --- a/test/binary-elementwise-operator-tester.h +++ /dev/null @@ -1,366 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/math.h" - -class BinaryElementwiseOperatorTester { - public: - enum class OperationType { - Unknown, - Add, - CopySign, - Divide, - Maximum, - Minimum, - Multiply, - Subtract, - SquaredDifference, - }; - - static std::string ToString(OperationType operation_type) { - switch (operation_type) { - case OperationType::Unknown: - return "Unknown"; - case OperationType::Add: - return "Add"; - case OperationType::CopySign: - return "CopySign"; - case OperationType::Divide: - return "Divide"; - case OperationType::Maximum: - return "Maximum"; - case OperationType::Minimum: - return "Minimum"; - case OperationType::Multiply: - return "Multiply"; - case OperationType::Subtract: - return "Subtract"; - case OperationType::SquaredDifference: - return "SquaredDifference"; - default: - return "Unknown"; - } - } - - template - void CheckResults(const size_t* output_dims, const T* input1, const T* input2, - const T* output, const size_t* input1_strides, - const size_t* input2_strides, - const size_t* output_strides) const { - // Verify results. 
- for (size_t i = 0; i < output_dims[0]; i++) { - for (size_t j = 0; j < output_dims[1]; j++) { - for (size_t k = 0; k < output_dims[2]; k++) { - for (size_t l = 0; l < output_dims[3]; l++) { - for (size_t m = 0; m < output_dims[4]; m++) { - for (size_t n = 0; n < output_dims[5]; n++) { - float output_ref = - Compute( - input1_scale() * (static_cast( - input1[i * input1_strides[0] + - j * input1_strides[1] + - k * input1_strides[2] + - l * input1_strides[3] + - m * input1_strides[4] + - n * input1_strides[5]]) - - input1_zero_point()), - input2_scale() * (static_cast( - input2[i * input2_strides[0] + - j * input2_strides[1] + - k * input2_strides[2] + - l * input2_strides[3] + - m * input2_strides[4] + - n * input2_strides[5]]) - - input2_zero_point())) / - output_scale() + - static_cast(output_zero_point()); - output_ref = - std::max(output_ref, static_cast(qmin())); - output_ref = - std::min(output_ref, static_cast(qmax())); - const size_t index = - i * output_strides[0] + j * output_strides[1] + - k * output_strides[2] + l * output_strides[3] + - m * output_strides[4] + n * output_strides[5]; - ASSERT_NEAR(static_cast(output[index]), output_ref, 0.6f) - << "(i, j, k, l, m, n) = (" << i << ", " << j << ", " << k - << ", " << l << ", " << m << ", " << n << ")" - << ", input1 zero point = " << input1_zero_point() - << ", input1 scale = " << input1_scale() - << ", input2 zero point = " << input2_zero_point() - << ", input2 scale = " << input2_scale() - << ", output zero point = " << output_zero_point() - << ", output scale = " << output_scale(); - } - } - } - } - } - } - } - BinaryElementwiseOperatorTester& input1_shape( - std::vector input1_shape) { - assert(input1_shape.size() <= XNN_MAX_TENSOR_DIMS); - this->input1_shape_ = std::move(input1_shape); - return *this; - } - - const std::vector& input1_shape() const { - return this->input1_shape_; - } - - size_t input1_dim(size_t i) const { - return i < num_input1_dims() ? 
this->input1_shape_[i] : 1; - } - - size_t num_input1_dims() const { return this->input1_shape_.size(); } - - size_t num_input1_elements() const { - return std::accumulate(this->input1_shape_.begin(), - this->input1_shape_.end(), size_t(1), - std::multiplies()); - } - - BinaryElementwiseOperatorTester& input1_zero_point( - int16_t input1_zero_point) { - this->input1_zero_point_ = input1_zero_point; - return *this; - } - - int16_t input1_zero_point() const { return this->input1_zero_point_; } - - BinaryElementwiseOperatorTester& input1_scale(float input1_scale) { - assert(std::isfinite(input1_scale)); - this->input1_scale_ = input1_scale; - return *this; - } - - float input1_scale() const { return this->input1_scale_; } - - BinaryElementwiseOperatorTester& input2_shape( - std::vector input2_shape) { - assert(input2_shape.size() <= XNN_MAX_TENSOR_DIMS); - this->input2_shape_ = std::move(input2_shape); - return *this; - } - - const std::vector& input2_shape() const { - return this->input2_shape_; - } - - size_t input2_dim(size_t i) const { - return i < num_input2_dims() ? 
this->input2_shape_[i] : 1; - } - - size_t num_input2_dims() const { return this->input2_shape_.size(); } - - size_t num_input2_elements() const { - return std::accumulate(this->input2_shape_.begin(), - this->input2_shape_.end(), size_t(1), - std::multiplies()); - } - - BinaryElementwiseOperatorTester& input2_zero_point( - int16_t input2_zero_point) { - this->input2_zero_point_ = input2_zero_point; - return *this; - } - - int16_t input2_zero_point() const { return this->input2_zero_point_; } - - BinaryElementwiseOperatorTester& input2_scale(float input2_scale) { - assert(std::isfinite(input2_scale)); - this->input2_scale_ = input2_scale; - return *this; - } - - float input2_scale() const { return this->input2_scale_; } - - BinaryElementwiseOperatorTester& output_zero_point( - int16_t output_zero_point) { - this->output_zero_point_ = output_zero_point; - return *this; - } - - int16_t output_zero_point() const { return this->output_zero_point_; } - - BinaryElementwiseOperatorTester& output_scale(float output_scale) { - assert(std::isfinite(output_scale)); - this->output_scale_ = output_scale; - return *this; - } - - float output_scale() const { return this->output_scale_; } - - BinaryElementwiseOperatorTester& qmin(int16_t qmin) { - this->qmin_ = qmin; - return *this; - } - - int16_t qmin() const { return this->qmin_; } - - BinaryElementwiseOperatorTester& qmax(int16_t qmax) { - this->qmax_ = qmax; - return *this; - } - - int16_t qmax() const { return this->qmax_; } - - BinaryElementwiseOperatorTester& operation_type( - OperationType operation_type) { - this->operation_type_ = operation_type; - return *this; - } - - OperationType operation_type() const { return this->operation_type_; } - - BinaryElementwiseOperatorTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { return this->iterations_; } - - float Compute(float a, float b) const { - switch (operation_type()) { - case OperationType::Add: - 
return a + b; - case OperationType::CopySign: - return std::copysign(a, b); - case OperationType::Divide: - return a / b; - case OperationType::Maximum: - return std::max(a, b); - case OperationType::Minimum: - return std::min(a, b); - case OperationType::Multiply: - return a * b; - case OperationType::Subtract: - return a - b; - case OperationType::SquaredDifference: - return (a - b) * (a - b); - default: - return std::nanf(""); - } - } - - int32_t Compute(int32_t a, int32_t b) const{ - switch (operation_type()) { - case OperationType::Add: - return a + b; - case OperationType::CopySign: - return std::copysign(a, b); - case OperationType::Divide: - return a / b; - case OperationType::Maximum: - return std::max(a, b); - case OperationType::Minimum: - return std::min(a, b); - case OperationType::Multiply: - return a * b; - case OperationType::Subtract: - return a - b; - case OperationType::SquaredDifference: - return (a - b) * (a - b); - default: - return INT_MAX; - - } - } - void TestQS8() const; - - void TestQU8() const; - - void TestF16() const; - - void TestF32() const; - - void TestS32() const; - - void TestRunF32() const; - - void TestRunQS8() const; - - void TestRunQU8() const; - - void Test(int8_t) { TestQS8(); } - void Test(uint8_t) { TestQU8(); } - void Test(xnn_float16) { TestF16(); } - void Test(float) { TestF32(); } - void Test(int32_t) { TestS32(); } - - void TestRun(int8_t) { TestRunQS8(); } - void TestRun(uint8_t) { TestRunQU8(); } - void TestRun(xnn_float16) {} - void TestRun(float) { TestRunF32(); } - void TestRun(int32_t) {} - - private: - std::vector input1_shape_; - std::vector input2_shape_; - int16_t input1_zero_point_{0}; - float input1_scale_{1.0f}; - int16_t input2_zero_point_{0}; - float input2_scale_{1.0f}; - int16_t output_zero_point_{0}; - float output_scale_{1.0f}; - int16_t qmin_{std::numeric_limits::min()}; - int16_t qmax_{std::numeric_limits::max()}; - OperationType operation_type_{OperationType::Unknown}; - size_t iterations_{3}; 
-}; - -// Make a shape of `rank` dimensions, broadcasting in each dimension according -// `broadcast_mask`. -inline std::vector MakeShapeOfRank(size_t rank, uint32_t broadcast_mask, - const size_t* dims) { - std::vector shape; - for (size_t i = 0; i < rank; i++) { - const bool broadcast = (broadcast_mask & (uint32_t(1) << i)) != 0; - shape.push_back(broadcast ? 1 : dims[i]); - } - std::reverse(shape.begin(), shape.end()); - return shape; -} - -enum class RunMode { - kCreateReshapeRun, - kEager, -}; - -template -void RunBinaryOpTester(size_t rank_a, size_t rank_b, const size_t* dims, - RunMode run_mode, - BinaryElementwiseOperatorTester& tester) { - for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << rank_a); bm1++) { - for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << rank_b); bm2++) { - tester.input1_shape(MakeShapeOfRank(rank_a, bm1, dims)) - .input2_shape(MakeShapeOfRank(rank_b, bm2, dims)); - if (run_mode == RunMode::kCreateReshapeRun) { - tester.Test(T()); - } else if (run_mode == RunMode::kEager) { - tester.TestRun(T()); - } else { - FAIL() << "Unknown run_mode"; - } - } - } -} \ No newline at end of file diff --git a/test/binary-nd.cc b/test/binary-nd.cc deleted file mode 100644 index 4fa34a427f7..00000000000 --- a/test/binary-nd.cc +++ /dev/null @@ -1,472 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include - -#include -#include "binary-elementwise-operator-tester.h" - -constexpr size_t kDim1 = 2; -constexpr size_t kDim2 = 3; -constexpr size_t kDim3 = 4; -constexpr size_t kDim4 = 5; -constexpr size_t kDim5 = 6; -constexpr size_t kDim6 = 7; -const size_t kDims[] = {kDim1, kDim2, kDim3, kDim4, kDim5, kDim6}; - -const size_t kBroadcastRanks[] = {0, 1, 2, 3, 4, 5, 6}; -const size_t kTestRank = 4; - -template -void BroadcastNDTestImpl(const Params& params) { - RunMode mode = std::get<0>(params); - BinaryElementwiseOperatorTester::OperationType op = std::get<1>(params); - const size_t rank_a = std::get<2>(params); - const size_t rank_b = std::get<3>(params); - BinaryElementwiseOperatorTester tester; - tester.operation_type(op); - if (std::is_same::value || std::is_same::value) { - // TODO(dsharlet): This is a lame way to do this. The tester needs to be - // refactored to not require this. - tester.qmin(std::numeric_limits::min()); - tester.qmax(std::numeric_limits::max()); - } - RunBinaryOpTester(rank_a, rank_b, kDims, mode, tester); -} - -template -class BroadcastNDTest - : public testing::TestWithParam< - std::tuple> {}; - -using BroadcastNDTestQS8 = BroadcastNDTest; -using BroadcastNDTestQU8 = BroadcastNDTest; -#ifndef XNN_EXCLUDE_F16_TESTS -using BroadcastNDTestF16 = BroadcastNDTest; -#endif // XNN_EXCLUDE_F16_TESTS -using BroadcastNDTestF32 = BroadcastNDTest; -using BroadcastNDTestS32 = BroadcastNDTest; - -TEST_P(BroadcastNDTestQS8, op) { BroadcastNDTestImpl(GetParam()); } -TEST_P(BroadcastNDTestQU8, op) { BroadcastNDTestImpl(GetParam()); } -#ifndef XNN_EXCLUDE_F16_TESTS -TEST_P(BroadcastNDTestF16, op) { BroadcastNDTestImpl(GetParam()); } -#endif // XNN_EXCLUDE_F16_TESTS -TEST_P(BroadcastNDTestF32, op) { BroadcastNDTestImpl(GetParam()); } -TEST_P(BroadcastNDTestS32, op) { BroadcastNDTestImpl(GetParam()); } - -std::string ToString( - const std::tuple& param) { - return 
BinaryElementwiseOperatorTester::ToString(std::get<1>(param)) + "_" + - std::to_string(std::get<2>(param)) + "d_x_" + - std::to_string(std::get<3>(param)) + "d"; -} - -std::string ToString( - const std::tuple& - param) { - return BinaryElementwiseOperatorTester::ToString(std::get<1>(param)); -} - -INSTANTIATE_TEST_SUITE_P( - CreateReshapeRun, BroadcastNDTestQS8, - testing::Combine( - testing::Values(RunMode::kCreateReshapeRun), - testing::Values( - BinaryElementwiseOperatorTester::OperationType::Add, - BinaryElementwiseOperatorTester::OperationType::Subtract, - BinaryElementwiseOperatorTester::OperationType::Multiply), - testing::ValuesIn(kBroadcastRanks), testing::ValuesIn(kBroadcastRanks)), - [](const auto& info) { return ToString(info.param); }); -INSTANTIATE_TEST_SUITE_P( - Eager, BroadcastNDTestQS8, - testing::Combine( - testing::Values(RunMode::kEager), - testing::Values( - BinaryElementwiseOperatorTester::OperationType::Add, - BinaryElementwiseOperatorTester::OperationType::Subtract, - BinaryElementwiseOperatorTester::OperationType::Multiply), - testing::ValuesIn(kBroadcastRanks), testing::ValuesIn(kBroadcastRanks)), - [](const auto& info) { return ToString(info.param); }); -INSTANTIATE_TEST_SUITE_P( - CreateReshapeRun, BroadcastNDTestQU8, - testing::Combine( - testing::Values(RunMode::kCreateReshapeRun), - testing::Values( - BinaryElementwiseOperatorTester::OperationType::Add, - BinaryElementwiseOperatorTester::OperationType::Subtract, - BinaryElementwiseOperatorTester::OperationType::Multiply), - testing::ValuesIn(kBroadcastRanks), testing::ValuesIn(kBroadcastRanks)), - [](const auto& info) { return ToString(info.param); }); -INSTANTIATE_TEST_SUITE_P( - Eager, BroadcastNDTestQU8, - testing::Combine( - testing::Values(RunMode::kEager), - testing::Values( - BinaryElementwiseOperatorTester::OperationType::Add, - BinaryElementwiseOperatorTester::OperationType::Subtract, - BinaryElementwiseOperatorTester::OperationType::Multiply), - 
testing::ValuesIn(kBroadcastRanks), testing::ValuesIn(kBroadcastRanks)), - [](const auto& info) { return ToString(info.param); }); -#ifndef XNN_EXCLUDE_F16_TESTS -INSTANTIATE_TEST_SUITE_P( - CreateReshapeRun, BroadcastNDTestF16, - testing::Combine( - testing::Values(RunMode::kCreateReshapeRun), - testing::Values( - BinaryElementwiseOperatorTester::OperationType::Add, - BinaryElementwiseOperatorTester::OperationType::Divide, - BinaryElementwiseOperatorTester::OperationType::Maximum, - BinaryElementwiseOperatorTester::OperationType::Minimum, - BinaryElementwiseOperatorTester::OperationType::Multiply, - BinaryElementwiseOperatorTester::OperationType::SquaredDifference, - BinaryElementwiseOperatorTester::OperationType::Subtract), - testing::ValuesIn(kBroadcastRanks), testing::ValuesIn(kBroadcastRanks)), - [](const auto& info) { return ToString(info.param); }); -INSTANTIATE_TEST_SUITE_P( - Eager, BroadcastNDTestF16, - testing::Combine( - testing::Values(RunMode::kEager), - testing::Values( - BinaryElementwiseOperatorTester::OperationType::Add, - BinaryElementwiseOperatorTester::OperationType::Divide, - BinaryElementwiseOperatorTester::OperationType::Maximum, - BinaryElementwiseOperatorTester::OperationType::Minimum, - BinaryElementwiseOperatorTester::OperationType::Multiply, - BinaryElementwiseOperatorTester::OperationType::SquaredDifference, - BinaryElementwiseOperatorTester::OperationType::Subtract), - testing::ValuesIn(kBroadcastRanks), testing::ValuesIn(kBroadcastRanks)), - [](const auto& info) { return ToString(info.param); }); -#endif -INSTANTIATE_TEST_SUITE_P( - CreateReshapeRun, BroadcastNDTestF32, - testing::Combine( - testing::Values(RunMode::kCreateReshapeRun), - testing::Values( - BinaryElementwiseOperatorTester::OperationType::Add, - BinaryElementwiseOperatorTester::OperationType::CopySign, - BinaryElementwiseOperatorTester::OperationType::Divide, - BinaryElementwiseOperatorTester::OperationType::Maximum, - 
BinaryElementwiseOperatorTester::OperationType::Minimum, - BinaryElementwiseOperatorTester::OperationType::Multiply, - BinaryElementwiseOperatorTester::OperationType::Subtract, - BinaryElementwiseOperatorTester::OperationType::SquaredDifference), - testing::ValuesIn(kBroadcastRanks), testing::ValuesIn(kBroadcastRanks)), - [](const auto& info) { return ToString(info.param); }); -INSTANTIATE_TEST_SUITE_P( - Eager, BroadcastNDTestF32, - testing::Combine( - testing::Values(RunMode::kEager), - testing::Values( - BinaryElementwiseOperatorTester::OperationType::Add, - BinaryElementwiseOperatorTester::OperationType::Divide, - BinaryElementwiseOperatorTester::OperationType::Maximum, - BinaryElementwiseOperatorTester::OperationType::Minimum, - BinaryElementwiseOperatorTester::OperationType::Multiply, - BinaryElementwiseOperatorTester::OperationType::Subtract, - BinaryElementwiseOperatorTester::OperationType::SquaredDifference), - testing::ValuesIn(kBroadcastRanks), testing::ValuesIn(kBroadcastRanks)), - [](const auto& info) { return ToString(info.param); }); -INSTANTIATE_TEST_SUITE_P( - CreateReshapeRun, BroadcastNDTestS32, - testing::Combine( - testing::Values(RunMode::kCreateReshapeRun), - testing::Values( - BinaryElementwiseOperatorTester::OperationType::Multiply), - testing::ValuesIn(kBroadcastRanks), testing::ValuesIn(kBroadcastRanks)), - [](const auto& info) { return ToString(info.param); }); -INSTANTIATE_TEST_SUITE_P( - Eager, BroadcastNDTestS32, - testing::Combine( - testing::Values(RunMode::kEager), - testing::Values( - BinaryElementwiseOperatorTester::OperationType::Multiply), - testing::ValuesIn(kBroadcastRanks), testing::ValuesIn(kBroadcastRanks)), - [](const auto& info) { return ToString(info.param); }); - -template -void FloatMinTestImpl(Params params) { - for (int32_t qmin = std::numeric_limits::max() - 1000; - qmin > std::numeric_limits::min(); qmin -= 5000) { - RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), - 
BinaryElementwiseOperatorTester() - .operation_type(std::get<1>(params)) - .qmin(qmin)); - } -} - -template -void FloatMaxTestImpl(Params params) { - for (int32_t qmax = std::numeric_limits::min() + 1000; - qmax < std::numeric_limits::max(); qmax += 5000) { - RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), - BinaryElementwiseOperatorTester() - .operation_type(std::get<1>(params)) - .qmax(qmax)); - } -} - -template -class FloatMinMaxTest - : public testing::TestWithParam< - std::tuple> { -}; - -#ifndef XNN_EXCLUDE_F16_TESTS -using FloatMinMaxTestNDF16 = FloatMinMaxTest; -TEST_P(FloatMinMaxTestNDF16, qmin) { FloatMinTestImpl(GetParam()); } -TEST_P(FloatMinMaxTestNDF16, qmax) { FloatMaxTestImpl(GetParam()); } -#endif // XNN_EXCLUDE_F16_TESTS - -using FloatMinMaxTestNDF32 = FloatMinMaxTest; -TEST_P(FloatMinMaxTestNDF32, qmin) { FloatMinTestImpl(GetParam()); } -TEST_P(FloatMinMaxTestNDF32, qmax) { FloatMaxTestImpl(GetParam()); } - -#ifndef XNN_EXCLUDE_F16_TESTS -INSTANTIATE_TEST_SUITE_P( - CreateReshapeRun, FloatMinMaxTestNDF16, - testing::Combine( - testing::Values(RunMode::kCreateReshapeRun), - testing::Values( - BinaryElementwiseOperatorTester::OperationType::Add, - BinaryElementwiseOperatorTester::OperationType::Divide, - BinaryElementwiseOperatorTester::OperationType::Multiply, - BinaryElementwiseOperatorTester::OperationType::Subtract)), - [](const auto& info) { return ToString(info.param); }); -INSTANTIATE_TEST_SUITE_P( - Eager, FloatMinMaxTestNDF16, - testing::Combine( - testing::Values(RunMode::kEager), - testing::Values( - BinaryElementwiseOperatorTester::OperationType::Add, - BinaryElementwiseOperatorTester::OperationType::Divide, - BinaryElementwiseOperatorTester::OperationType::Multiply, - BinaryElementwiseOperatorTester::OperationType::Subtract)), - [](const auto& info) { return ToString(info.param); }); -#endif -INSTANTIATE_TEST_SUITE_P( - CreateReshapeRun, FloatMinMaxTestNDF32, - testing::Combine( - 
testing::Values(RunMode::kCreateReshapeRun), - testing::Values( - BinaryElementwiseOperatorTester::OperationType::Add, - BinaryElementwiseOperatorTester::OperationType::Divide, - BinaryElementwiseOperatorTester::OperationType::Multiply, - BinaryElementwiseOperatorTester::OperationType::Subtract)), - [](const auto& info) { return ToString(info.param); }); -INSTANTIATE_TEST_SUITE_P( - Eager, FloatMinMaxTestNDF32, - testing::Combine( - testing::Values(RunMode::kEager), - testing::Values( - BinaryElementwiseOperatorTester::OperationType::Add, - BinaryElementwiseOperatorTester::OperationType::Divide, - BinaryElementwiseOperatorTester::OperationType::Multiply, - BinaryElementwiseOperatorTester::OperationType::Subtract)), - [](const auto& info) { return ToString(info.param); }); - -template -void QuantizedTest_Input1Scale(Params params) { - for (float input1_scale = 0.1f; input1_scale <= 10.0f; - input1_scale *= 3.14f) { - RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), - BinaryElementwiseOperatorTester() - .operation_type(std::get<1>(params)) - .input1_scale(input1_scale) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max())); - } -} - -template -void QuantizedTest_Input1ZeroPoint(Params params) { - for (int16_t input1_zero_point = std::numeric_limits::min(); - input1_zero_point <= std::numeric_limits::max(); - input1_zero_point += 51) { - RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), - BinaryElementwiseOperatorTester() - .operation_type(std::get<1>(params)) - .input1_zero_point(input1_zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max())); - } -} - -template -void QuantizedTest_Input2Scale(Params params) { - for (float input2_scale = 0.1f; input2_scale <= 10.0f; - input2_scale *= 3.14f) { - RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), - BinaryElementwiseOperatorTester() - .operation_type(std::get<1>(params)) - .input2_scale(input2_scale) - 
.qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max())); - } -} - -template -void QuantizedTest_Input2ZeroPoint(Params params) { - for (int16_t input2_zero_point = std::numeric_limits::min(); - input2_zero_point <= std::numeric_limits::max(); - input2_zero_point += 51) { - RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), - BinaryElementwiseOperatorTester() - .operation_type(std::get<1>(params)) - .input2_zero_point(input2_zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max())); - } -} - -template -void QuantizedTest_OutputScale(Params params) { - for (float output_scale = 0.1f; output_scale <= 10.0f; - output_scale *= 3.14f) { - RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), - BinaryElementwiseOperatorTester() - .operation_type(std::get<1>(params)) - .output_scale(output_scale) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max())); - } -} - -template -void QuantizedTest_OutputZeroPoint(Params params) { - for (int16_t output_zero_point = std::numeric_limits::min(); - output_zero_point <= std::numeric_limits::max(); - output_zero_point += 51) { - RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), - BinaryElementwiseOperatorTester() - .operation_type(std::get<1>(params)) - .output_zero_point(output_zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max())); - } -} - -template -void QuantizedTest_Qmin(Params params) { - for (int16_t qmin = std::numeric_limits::max() - 1; - qmin > std::numeric_limits::min(); qmin -= 50) { - RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), - BinaryElementwiseOperatorTester() - .operation_type(std::get<1>(params)) - .qmin(qmin) - .qmax(std::numeric_limits::max())); - } -} -template -void QuantizedTest_Qmax(Params params) { - for (int16_t qmax = std::numeric_limits::min() + 1; - qmax < std::numeric_limits::max(); qmax += 50) { - RunBinaryOpTester(kTestRank, kTestRank, kDims, 
std::get<0>(params), - BinaryElementwiseOperatorTester() - .operation_type(std::get<1>(params)) - .qmin(std::numeric_limits::min()) - .qmax(qmax)); - } -} - -template -class QuantizedTest - : public testing::TestWithParam< - std::tuple> { -}; - -using QuantizedTestQS8 = QuantizedTest; - -TEST_P(QuantizedTestQS8, input1_scale) { - QuantizedTest_Input1Scale(GetParam()); -} -TEST_P(QuantizedTestQS8, input1_zero_point) { - QuantizedTest_Input1ZeroPoint(GetParam()); -} -TEST_P(QuantizedTestQS8, input2_scale) { - QuantizedTest_Input2Scale(GetParam()); -} -TEST_P(QuantizedTestQS8, input2_zero_point) { - QuantizedTest_Input2ZeroPoint(GetParam()); -} - -TEST_P(QuantizedTestQS8, output_scale) { - QuantizedTest_OutputScale(GetParam()); -} -TEST_P(QuantizedTestQS8, output_zero_point) { - QuantizedTest_OutputZeroPoint(GetParam()); -} - -TEST_P(QuantizedTestQS8, qmin) { QuantizedTest_Qmin(GetParam()); } -TEST_P(QuantizedTestQS8, qmax) { QuantizedTest_Qmax(GetParam()); } - -INSTANTIATE_TEST_SUITE_P( - CreateReshapeRun, QuantizedTestQS8, - testing::Combine( - testing::Values(RunMode::kCreateReshapeRun), - testing::Values( - BinaryElementwiseOperatorTester::OperationType::Add, - BinaryElementwiseOperatorTester::OperationType::Subtract, - BinaryElementwiseOperatorTester::OperationType::Multiply)), - [](const auto& info) { return ToString(info.param); }); -INSTANTIATE_TEST_SUITE_P( - Eager, QuantizedTestQS8, - testing::Combine( - testing::Values(RunMode::kEager), - testing::Values( - BinaryElementwiseOperatorTester::OperationType::Add, - BinaryElementwiseOperatorTester::OperationType::Subtract, - BinaryElementwiseOperatorTester::OperationType::Multiply)), - [](const auto& info) { return ToString(info.param); }); - -using QuantizedTestQU8 = QuantizedTest; - -TEST_P(QuantizedTestQU8, input1_scale) { - QuantizedTest_Input1Scale(GetParam()); -} -TEST_P(QuantizedTestQU8, input1_zero_point) { - QuantizedTest_Input1ZeroPoint(GetParam()); -} -TEST_P(QuantizedTestQU8, input2_scale) { - 
QuantizedTest_Input2Scale(GetParam()); -} -TEST_P(QuantizedTestQU8, input2_zero_point) { - QuantizedTest_Input2ZeroPoint(GetParam()); -} - -TEST_P(QuantizedTestQU8, output_scale) { - QuantizedTest_OutputScale(GetParam()); -} -TEST_P(QuantizedTestQU8, output_zero_point) { - QuantizedTest_OutputZeroPoint(GetParam()); -} - -TEST_P(QuantizedTestQU8, qmin) { QuantizedTest_Qmin(GetParam()); } -TEST_P(QuantizedTestQU8, qmax) { QuantizedTest_Qmax(GetParam()); } - -INSTANTIATE_TEST_SUITE_P( - CreateReshapeRun, QuantizedTestQU8, - testing::Combine( - testing::Values(RunMode::kCreateReshapeRun), - testing::Values( - BinaryElementwiseOperatorTester::OperationType::Add, - BinaryElementwiseOperatorTester::OperationType::Subtract, - BinaryElementwiseOperatorTester::OperationType::Multiply)), - [](const auto& info) { return ToString(info.param); }); -INSTANTIATE_TEST_SUITE_P( - Eager, QuantizedTestQU8, - testing::Combine( - testing::Values(RunMode::kEager), - testing::Values( - BinaryElementwiseOperatorTester::OperationType::Add, - BinaryElementwiseOperatorTester::OperationType::Subtract, - BinaryElementwiseOperatorTester::OperationType::Multiply)), - [](const auto& info) { return ToString(info.param); }); diff --git a/test/binary.cc b/test/binary.cc new file mode 100644 index 00000000000..50f218b2b34 --- /dev/null +++ b/test/binary.cc @@ -0,0 +1,928 @@ +// Copyright 2022 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "xnnpack.h" +#include "xnnpack/aligned-allocator.h" +#include "xnnpack/math.h" +#include "xnnpack/operator.h" +#include "xnnpack/subgraph.h" +#include "replicable_random_device.h" + +template +class NumericLimits { + public: + static constexpr T min() { return std::numeric_limits::min(); } + static constexpr T max() { return std::numeric_limits::max(); } +}; + +template <> +class NumericLimits { + public: + static xnn_float16 min() { return -std::numeric_limits::infinity(); } + static xnn_float16 max() { return +std::numeric_limits::infinity(); } +}; + +template +struct UniformDistribution { + std::uniform_real_distribution dist{-10.0f, 10.0f}; + + template + T operator()(Generator& g) { + return dist(g); + } +}; + +template <> +struct UniformDistribution { + std::uniform_real_distribution dist{-10.0f, 10.0f}; + + template + xnn_float16 operator()(Generator& g) { + return dist(g); + } +}; + +template <> +struct UniformDistribution { + std::uniform_int_distribution dist{std::numeric_limits::min(), + std::numeric_limits::max()}; + + template + int8_t operator()(Generator& g) { + return dist(g); + } +}; + +template <> +struct UniformDistribution { + std::uniform_int_distribution dist{ + std::numeric_limits::min(), + std::numeric_limits::max()}; + + template + uint8_t operator()(Generator& g) { + return dist(g); + } +}; + +template <> +struct UniformDistribution { + std::uniform_int_distribution dist{ + std::numeric_limits::min(), + std::numeric_limits::max()}; + + template + int32_t operator()(Generator& g) { + return dist(g); + } +}; + +template +size_t RandomRank(Rng& rng) { + return std::uniform_int_distribution(0, XNN_MAX_TENSOR_DIMS)(rng); +} + +template +std::vector RandomShape(Rng& rng, size_t rank) { + std::uniform_int_distribution dims_dist(1, 9); + std::vector dims(rank); + std::generate(dims.begin(), 
dims.end(), [&]() { return dims_dist(rng); }); + return dims; +} + +template +std::vector RandomShape(Rng& rng) { + return RandomShape(rng, RandomRank(rng)); +} + +template +xnn_quantization_params RandomQuantization(Rng& rng) { + if (std::is_same::value || std::is_same::value) { + return { + static_cast(UniformDistribution()(rng)), + std::uniform_real_distribution(0.1f, 5.0f)(rng), + }; + } else { + return {0, 1.0f}; + } +} + +void RemoveLeadingOnes(std::vector& dims) { + while (!dims.empty()) { + if (dims.front() == 1) { + dims.erase(dims.begin()); + } else { + break; + } + } +} + +size_t NumElements(const std::vector& dims) { + return std::accumulate(dims.begin(), dims.end(), size_t(1), + std::multiplies()); +} + +bool is_quantized(xnn_datatype t) { + switch (t) { + case xnn_datatype_qint8: + case xnn_datatype_quint8: + case xnn_datatype_qint32: + return true; + default: + return false; + } +} + +static const char* binary_operator_to_string( + xnn_binary_operator operation_type) { + switch (operation_type) { + case xnn_binary_add: + return "Add"; + case xnn_binary_copysign: + return "CopySign"; + case xnn_binary_divide: + return "Divide"; + case xnn_binary_maximum: + return "Maximum"; + case xnn_binary_minimum: + return "Minimum"; + case xnn_binary_multiply: + return "Multiply"; + case xnn_binary_subtract: + return "Subtract"; + case xnn_binary_squared_difference: + return "SquaredDifference"; + default: + return "Unknown"; + } +} + +template +xnn_datatype datatype_of() { + if (std::is_same::value) { + return xnn_datatype_quint8; + } else if (std::is_same::value) { + return xnn_datatype_qint8; + } else if (std::is_same::value) { + return xnn_datatype_fp16; + } else if (std::is_same::value) { + return xnn_datatype_fp32; + } else if (std::is_same::value) { + return xnn_datatype_int32; + } else { + XNN_UNREACHABLE; + } +} + +size_t xnn_datatype_size(xnn_datatype datatype) { + switch (datatype) { + case xnn_datatype_qint8: + case xnn_datatype_quint8: + return 
sizeof(int8_t); + case xnn_datatype_fp16: + return sizeof(xnn_float16); + case xnn_datatype_fp32: + return sizeof(float); + case xnn_datatype_int32: + return sizeof(int32_t); + default: + XNN_UNREACHABLE; + } +} + +// TODO(dsharlet): We need a place to put helper functions like this. +// XNNPACK's built-in equivalent helpers are not implemented in release +// builds... +const char* datatype_to_string(xnn_datatype datatype) { + switch (datatype) { + case xnn_datatype_qint8: + return "qint8"; + case xnn_datatype_quint8: + return "quint8"; + case xnn_datatype_fp16: + return "fp16"; + case xnn_datatype_fp32: + return "fp32"; + case xnn_datatype_int32: + return "int32"; + default: + XNN_UNREACHABLE; + } +} + +template +void MatchesOperatorApi(xnn_binary_operator binary_op) { + xnn_datatype datatype = datatype_of(); + xnnpack::ReplicableRandomDevice rng; + + std::vector input0_dims = RandomShape(rng); + std::vector input1_dims; + std::vector output_dims; + // Create input dimensions. + // Create input 2 with an equal or larger number of dimensions. + const size_t input1_num_dims = std::uniform_int_distribution( + input0_dims.size(), XNN_MAX_TENSOR_DIMS)(rng); + input1_dims = RandomShape(rng, input1_num_dims); + // Ensure that the inputs dimensions match. + std::copy_backward(input0_dims.begin(), input0_dims.end(), input1_dims.end()); + + // Choose a random dimension to broadcast for each input. + const size_t input0_broadcast_dim = + std::uniform_int_distribution(0, input0_dims.size())(rng); + if (input0_broadcast_dim < input0_dims.size()) { + input0_dims[input0_broadcast_dim] = 1; + } + const size_t input1_broadcast_dim = + std::uniform_int_distribution(0, input1_dims.size())(rng); + if (input1_broadcast_dim < input1_dims.size()) { + input1_dims[input1_broadcast_dim] = 1; + } + input0_dims.resize(XNN_MAX_TENSOR_DIMS); + input1_dims.resize(XNN_MAX_TENSOR_DIMS); + output_dims.resize(XNN_MAX_TENSOR_DIMS); + + // Calculate generalized shapes. 
+ std::fill(input0_dims.begin(), input0_dims.end(), 1); + std::fill(input1_dims.begin(), input1_dims.end(), 1); + std::copy_backward(input0_dims.cbegin(), input0_dims.cend(), + input0_dims.end()); + std::copy_backward(input1_dims.cbegin(), input1_dims.cend(), + input1_dims.end()); + for (size_t i = 0; i < XNN_MAX_TENSOR_DIMS; i++) { + if (input0_dims[i] != 1 && input1_dims[i] != 1) { + ASSERT_EQ(input0_dims[i], input1_dims[i]) << "i: " << i; + } + output_dims[i] = std::max(input0_dims[i], input1_dims[i]); + } + + if (rng() % 2 == 0) { + RemoveLeadingOnes(input0_dims); + } + if (rng() % 2 == 0) { + RemoveLeadingOnes(input1_dims); + } + while (output_dims.size() > + std::max(input0_dims.size(), input1_dims.size())) { + output_dims.erase(output_dims.begin()); + } + + std::vector> input0(NumElements(input0_dims) + + XNN_EXTRA_BYTES / sizeof(T)); + std::vector> input1(NumElements(input1_dims) + + XNN_EXTRA_BYTES / sizeof(T)); + std::vector> operator_output( + NumElements(output_dims)); + std::vector> subgraph_output( + NumElements(output_dims)); + UniformDistribution dist; + std::generate(input0.begin(), input0.end(), [&]() { return dist(rng); }); + std::generate(input1.begin(), input1.end(), [&]() { return dist(rng); }); + + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + bool quantized = is_quantized(datatype); + xnn_quantization_params input0_quantization = RandomQuantization(rng); + xnn_quantization_params input1_quantization = RandomQuantization(rng); + xnn_quantization_params output_quantization = RandomQuantization(rng); + + // Call subgraph API. 
+ xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); + std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); + + uint32_t input0_id = XNN_INVALID_NODE_ID; + uint32_t input1_id = XNN_INVALID_NODE_ID; + uint32_t output_id = XNN_INVALID_NODE_ID; + if (quantized) { + ASSERT_EQ(xnn_status_success, + xnn_define_quantized_tensor_value( + subgraph, datatype, input0_quantization.zero_point, + input0_quantization.scale, input0_dims.size(), + input0_dims.data(), nullptr, + /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, + &input0_id)); + + ASSERT_EQ(xnn_status_success, + xnn_define_quantized_tensor_value( + subgraph, datatype, input1_quantization.zero_point, + input1_quantization.scale, input1_dims.size(), + input1_dims.data(), nullptr, + /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, + &input1_id)); + + ASSERT_EQ(xnn_status_success, + xnn_define_quantized_tensor_value( + subgraph, datatype, output_quantization.zero_point, + output_quantization.scale, output_dims.size(), + output_dims.data(), nullptr, /*external_id=*/2, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); + } else { + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, datatype, input0_dims.size(), + input0_dims.data(), nullptr, + /*external_id=*/0, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, + &input0_id)); + + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, datatype, input1_dims.size(), + input1_dims.data(), nullptr, + /*external_id=*/1, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, + &input1_id)); + + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value( + subgraph, datatype, output_dims.size(), output_dims.data(), + nullptr, /*external_id=*/2, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); + } + ASSERT_NE(input0_id, XNN_INVALID_NODE_ID); + ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ(xnn_status_success, + 
xnn_define_binary(subgraph, binary_op, nullptr, input0_id, + input1_id, output_id, /*flags=*/0)); + + xnn_runtime_t runtime = nullptr; + xnn_status status = + xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, runtime); + std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); + std::array external = { + xnn_external_value{input0_id, input0.data()}, + xnn_external_value{input1_id, input1.data()}, + xnn_external_value{output_id, subgraph_output.data()}}; + ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); + ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); + + // Call operator API. + xnn_operator_t op = nullptr; + + if (quantized) { + ASSERT_EQ(xnn_status_success, xnn_create_binary_elementwise_nd( + binary_op, datatype, &input0_quantization, + &input1_quantization, + &output_quantization, /*flags=*/0, &op)); + } else { + ASSERT_EQ(xnn_status_success, xnn_create_binary_elementwise_nd( + binary_op, datatype, &input0_quantization, + &input1_quantization, + &output_quantization, /*flags=*/0, &op)); + } + std::unique_ptr auto_op( + op, xnn_delete_operator); + + ASSERT_EQ(xnn_status_success, xnn_reshape_binary_elementwise_nd( + op, input0_dims.size(), input0_dims.data(), + input1_dims.size(), input1_dims.data(), + /*threadpool=*/nullptr)); + + ASSERT_EQ(xnn_status_success, + xnn_setup_binary_elementwise_nd(op, input0.data(), input1.data(), + operator_output.data())); + + ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); + + // Check output shape matches. 
+ size_t observed_output_num_dims = 0; + std::vector observed_output_dims(XNN_MAX_TENSOR_DIMS, 0); + ASSERT_EQ( + xnn_status_success, + xnn_get_external_value_shape(runtime, output_id, &observed_output_num_dims, observed_output_dims.data())); + ASSERT_EQ(output_dims.size(), observed_output_num_dims); + for (size_t i = 0; i < observed_output_num_dims; i++) { + ASSERT_EQ(output_dims[i], observed_output_dims[i]); + } + + // Check outputs match. + ASSERT_EQ(subgraph_output, operator_output); +} + +void Reshape(xnn_datatype datatype, xnn_binary_operator binary_op) { + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/3, + /*flags=*/0, &subgraph)); + std::unique_ptr auto_subgraph( + subgraph, xnn_delete_subgraph); + + std::vector dims{2, 3, 4}; + uint32_t input0_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, datatype, dims.size(), + dims.data(), nullptr, /*external_id=*/0, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, + &input0_id)); + ASSERT_NE(input0_id, XNN_INVALID_NODE_ID); + uint32_t input1_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, datatype, dims.size(), + dims.data(), nullptr, /*external_id=*/1, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, + &input1_id)); + ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, datatype, dims.size(), + dims.data(), nullptr, /*external_id=*/2, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, + &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ(xnn_status_success, + xnn_define_binary(subgraph, binary_op, nullptr, input0_id, + input1_id, output_id, /*flags=*/0)); + + ASSERT_EQ(subgraph->num_nodes, 1); + struct xnn_node* node = &subgraph->nodes[0]; + ASSERT_EQ(node->type, 
xnn_binary_operator_to_node_type(binary_op)); + ASSERT_EQ(node->num_inputs, 2); + ASSERT_EQ(node->inputs[0], input0_id); + ASSERT_EQ(node->inputs[1], input1_id); + ASSERT_EQ(node->num_outputs, 1); + ASSERT_EQ(node->outputs[0], output_id); + ASSERT_EQ(node->flags, 0); + + xnn_runtime_t runtime = nullptr; + xnn_status status = + xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, runtime); + std::unique_ptr auto_runtime( + runtime, xnn_delete_runtime); + + ASSERT_EQ(node->reshape(&runtime->opdata[0], subgraph->values, + subgraph->num_values, /*threadpool=*/nullptr), + xnn_status_success); + + dims[0] = 7; + ASSERT_EQ( + xnn_status_success, + xnn_reshape_external_value(runtime, input0_id, dims.size(), dims.data())); + ASSERT_EQ( + xnn_status_success, + xnn_reshape_external_value(runtime, input1_id, dims.size(), dims.data())); + + ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values, + runtime->num_values, /*threadpool=*/nullptr), + xnn_status_reallocation_required); + const xnn_shape* output_shape = &runtime->values[node->outputs[0]].shape; + const size_t num_input_elements = std::accumulate( + dims.cbegin(), dims.cend(), size_t{1}, std::multiplies()); + ASSERT_EQ(output_shape->dim[0], dims[0]); + ASSERT_EQ(runtime->values[node->outputs[0]].size, + num_input_elements * xnn_datatype_size(datatype)); +} + +void ReshapeBroadcastDim0(xnn_datatype datatype, + xnn_binary_operator binary_op) { + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/3, + /*flags=*/0, &subgraph)); + std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); + + std::vector dim0{1, 3, 4}; + std::vector dim1{5, 3, 4}; + uint32_t input0_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + 
xnn_define_tensor_value(subgraph, datatype, dim0.size(), + dim0.data(), nullptr, /*external_id=*/0, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, + &input0_id)); + ASSERT_NE(input0_id, XNN_INVALID_NODE_ID); + uint32_t input1_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, datatype, dim1.size(), + dim1.data(), nullptr, /*external_id=*/1, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, + &input1_id)); + ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + // Output dims will be correctly set by reshape. + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, datatype, dim1.size(), + dim1.data(), nullptr, /*external_id=*/2, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, + &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ(xnn_status_success, + xnn_define_binary(subgraph, binary_op, nullptr, input0_id, + input1_id, output_id, /*flags=*/0)); + + ASSERT_EQ(subgraph->num_nodes, 1); + struct xnn_node* node = &subgraph->nodes[0]; + ASSERT_EQ(node->type, xnn_binary_operator_to_node_type(binary_op)); + ASSERT_EQ(node->num_inputs, 2); + ASSERT_EQ(node->inputs[0], input0_id); + ASSERT_EQ(node->inputs[1], input1_id); + ASSERT_EQ(node->num_outputs, 1); + ASSERT_EQ(node->outputs[0], output_id); + ASSERT_EQ(node->flags, 0); + + xnn_runtime_t runtime = nullptr; + xnn_status status = + xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, runtime); + std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); + + ASSERT_EQ(node->reshape(&runtime->opdata[0], subgraph->values, + subgraph->num_values, /*threadpool=*/nullptr), + xnn_status_success); + + dim0[0] = 7; + dim1[0] = 1; + ASSERT_EQ( + xnn_status_success, + xnn_reshape_external_value(runtime, input0_id, dim0.size(), dim0.data())); + ASSERT_EQ( + xnn_status_success, + 
xnn_reshape_external_value(runtime, input1_id, dim1.size(), dim1.data())); + + ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values, + runtime->num_values, /*threadpool=*/nullptr), + xnn_status_reallocation_required); + const xnn_shape* output_shape = &runtime->values[node->outputs[0]].shape; + const size_t num_input_elements = std::accumulate( + dim0.cbegin(), dim0.cend(), size_t{1}, std::multiplies()); + ASSERT_EQ(output_shape->dim[0], dim0[0]); + ASSERT_EQ(runtime->values[node->outputs[0]].size, + num_input_elements * xnn_datatype_size(datatype)); +} + +void ReshapeBroadcast1D(xnn_datatype datatype, xnn_binary_operator binary_op) { + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/3, + /*flags=*/0, &subgraph)); + std::unique_ptr auto_subgraph( + subgraph, xnn_delete_subgraph); + + std::vector dim0{1, 20, 80, 32}; + std::vector dim1{32}; + uint32_t input0_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, datatype, dim0.size(), + dim0.data(), nullptr, /*external_id=*/0, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, + &input0_id)); + ASSERT_NE(input0_id, XNN_INVALID_NODE_ID); + uint32_t input1_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, datatype, dim1.size(), + dim1.data(), nullptr, /*external_id=*/1, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, + &input1_id)); + ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, datatype, dim0.size(), + dim0.data(), nullptr, /*external_id=*/2, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, + &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ(xnn_status_success, + xnn_define_binary(subgraph, binary_op, nullptr, input0_id, + input1_id, output_id, /*flags=*/0)); + + 
ASSERT_EQ(subgraph->num_nodes, 1); + struct xnn_node* node = &subgraph->nodes[0]; + ASSERT_EQ(node->type, xnn_binary_operator_to_node_type(binary_op)); + ASSERT_EQ(node->num_inputs, 2); + ASSERT_EQ(node->inputs[0], input0_id); + ASSERT_EQ(node->inputs[1], input1_id); + ASSERT_EQ(node->num_outputs, 1); + ASSERT_EQ(node->outputs[0], output_id); + ASSERT_EQ(node->flags, 0); + + xnn_runtime_t runtime = nullptr; + xnn_status status = + xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, runtime); + std::unique_ptr auto_runtime( + runtime, xnn_delete_runtime); + + ASSERT_EQ(node->reshape(&runtime->opdata[0], subgraph->values, + subgraph->num_values, /*threadpool=*/nullptr), + xnn_status_success); + + dim0[0] = 7; + dim1[0] = 1; + ASSERT_EQ( + xnn_status_success, + xnn_reshape_external_value(runtime, input0_id, dim0.size(), dim0.data())); + ASSERT_EQ( + xnn_status_success, + xnn_reshape_external_value(runtime, input1_id, dim1.size(), dim1.data())); + + ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values, + runtime->num_values, /*threadpool=*/nullptr), + xnn_status_reallocation_required); + const xnn_shape* output_shape = &runtime->values[node->outputs[0]].shape; + const size_t num_input_elements = std::accumulate( + dim0.cbegin(), dim0.cend(), size_t{1}, std::multiplies()); + ASSERT_EQ(output_shape->dim[0], dim0[0]); + ASSERT_EQ(runtime->values[node->outputs[0]].size, + num_input_elements * xnn_datatype_size(datatype)); +} + +void ReshapeBroadcast2D(xnn_datatype datatype, xnn_binary_operator binary_op) { + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/3, + /*flags=*/0, &subgraph)); + std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); + + std::vector dim0{1, 20, 80, 
32}; + std::vector dim1{80, 32}; + uint32_t input0_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, datatype, dim0.size(), + dim0.data(), nullptr, /*external_id=*/0, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, + &input0_id)); + ASSERT_NE(input0_id, XNN_INVALID_NODE_ID); + uint32_t input1_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, datatype, dim1.size(), + dim1.data(), nullptr, /*external_id=*/1, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, + &input1_id)); + ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, datatype, dim0.size(), + dim0.data(), nullptr, /*external_id=*/2, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, + &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ(xnn_status_success, + xnn_define_binary(subgraph, binary_op, nullptr, input0_id, + input1_id, output_id, /*flags=*/0)); + + ASSERT_EQ(subgraph->num_nodes, 1); + struct xnn_node* node = &subgraph->nodes[0]; + ASSERT_EQ(node->type, xnn_binary_operator_to_node_type(binary_op)); + ASSERT_EQ(node->num_inputs, 2); + ASSERT_EQ(node->inputs[0], input0_id); + ASSERT_EQ(node->inputs[1], input1_id); + ASSERT_EQ(node->num_outputs, 1); + ASSERT_EQ(node->outputs[0], output_id); + ASSERT_EQ(node->flags, 0); + + xnn_runtime_t runtime = nullptr; + xnn_status status = + xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, runtime); + std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); + + ASSERT_EQ(node->reshape(&runtime->opdata[0], subgraph->values, + subgraph->num_values, /*threadpool=*/nullptr), + xnn_status_success); + + dim0[0] = 7; + dim1[0] = 1; + ASSERT_EQ( + xnn_status_success, + xnn_reshape_external_value(runtime, input0_id, 
dim0.size(), dim0.data())); + ASSERT_EQ( + xnn_status_success, + xnn_reshape_external_value(runtime, input1_id, dim1.size(), dim1.data())); + + ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values, + runtime->num_values, /*threadpool=*/nullptr), + xnn_status_reallocation_required); + const xnn_shape* output_shape = &runtime->values[node->outputs[0]].shape; + const size_t num_input_elements = std::accumulate( + dim0.cbegin(), dim0.cend(), size_t{1}, std::multiplies()); + ASSERT_EQ(output_shape->dim[0], dim0[0]); + ASSERT_EQ(runtime->values[node->outputs[0]].size, + num_input_elements * xnn_datatype_size(datatype)); +} + +void DegenerateDimension(xnn_datatype datatype, xnn_binary_operator binary_op) { + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/3, + /*flags=*/0, &subgraph)); + std::unique_ptr auto_subgraph( + subgraph, xnn_delete_subgraph); + + std::vector dim0{0, 32}; + std::vector dim1{2, 0, 32}; + uint32_t input0_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, datatype, dim0.size(), + dim0.data(), nullptr, /*external_id=*/0, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, + &input0_id)); + ASSERT_NE(input0_id, XNN_INVALID_NODE_ID); + uint32_t input1_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, datatype, dim1.size(), + dim1.data(), nullptr, /*external_id=*/1, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, + &input1_id)); + ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, datatype, dim1.size(), + dim1.data(), nullptr, /*external_id=*/2, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, + &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ(xnn_status_success, + xnn_define_binary(subgraph, binary_op, nullptr, 
input0_id, + input1_id, output_id, /*flags=*/0)); + + ASSERT_EQ(subgraph->num_nodes, 1); + struct xnn_node* node = &subgraph->nodes[0]; + ASSERT_EQ(node->type, xnn_binary_operator_to_node_type(binary_op)); + ASSERT_EQ(node->num_inputs, 2); + ASSERT_EQ(node->inputs[0], input0_id); + ASSERT_EQ(node->inputs[1], input1_id); + ASSERT_EQ(node->num_outputs, 1); + ASSERT_EQ(node->outputs[0], output_id); + ASSERT_EQ(node->flags, 0); + + xnn_runtime_t runtime = nullptr; + xnn_status status = + xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, runtime); + std::unique_ptr auto_runtime( + runtime, xnn_delete_runtime); + + ASSERT_EQ(node->reshape(&runtime->opdata[0], subgraph->values, + subgraph->num_values, /*threadpool=*/nullptr), + xnn_status_success); +} + +template +class BinaryTest : public testing::TestWithParam {}; + +using BinaryTestQS8 = BinaryTest; +using BinaryTestQU8 = BinaryTest; +#ifndef XNN_EXCLUDE_F16_TESTS +using BinaryTestF16 = BinaryTest; +#endif // XNN_EXCLUDE_F16_TESTS +using BinaryTestF32 = BinaryTest; +using BinaryTestS32 = BinaryTest; + +TEST_P(BinaryTestQS8, matches_operator_api) { + MatchesOperatorApi(GetParam()); +} +TEST_P(BinaryTestQU8, matches_operator_api) { + MatchesOperatorApi(GetParam()); +} +#ifndef XNN_EXCLUDE_F16_TESTS +TEST_P(BinaryTestF16, matches_operator_api) { + MatchesOperatorApi(GetParam()); +} +#endif // XNN_EXCLUDE_F16_TESTS +TEST_P(BinaryTestF32, matches_operator_api) { + MatchesOperatorApi(GetParam()); +} +TEST_P(BinaryTestS32, matches_operator_api) { + MatchesOperatorApi(GetParam()); +} + +#ifndef XNN_EXCLUDE_F16_TESTS +TEST_P(BinaryTestF16, reshape) { Reshape(xnn_datatype_fp16, GetParam()); } +#endif // XNN_EXCLUDE_F16_TESTS +TEST_P(BinaryTestF32, reshape) { Reshape(xnn_datatype_fp32, GetParam()); } +TEST_P(BinaryTestS32, reshape) { Reshape(xnn_datatype_int32, GetParam()); } 
+ +#ifndef XNN_EXCLUDE_F16_TESTS +TEST_P(BinaryTestF16, reshape_broadcast_dim0) { + ReshapeBroadcastDim0(xnn_datatype_fp16, GetParam()); +} +#endif // XNN_EXCLUDE_F16_TESTS +TEST_P(BinaryTestF32, reshape_broadcast_dim0) { + ReshapeBroadcastDim0(xnn_datatype_fp32, GetParam()); +} +TEST_P(BinaryTestS32, reshape_broadcast_dim0) { + ReshapeBroadcastDim0(xnn_datatype_int32, GetParam()); +} + +#ifndef XNN_EXCLUDE_F16_TESTS +TEST_P(BinaryTestF16, reshape_broadcast_1d) { + ReshapeBroadcast1D(xnn_datatype_fp16, GetParam()); +} +#endif // XNN_EXCLUDE_F16_TESTS +TEST_P(BinaryTestF32, reshape_broadcast_1d) { + ReshapeBroadcast1D(xnn_datatype_fp32, GetParam()); +} +TEST_P(BinaryTestS32, reshape_broadcast_1d) { + ReshapeBroadcast1D(xnn_datatype_int32, GetParam()); +} + +#ifndef XNN_EXCLUDE_F16_TESTS +TEST_P(BinaryTestF16, reshape_broadcast_2d) { + ReshapeBroadcast2D(xnn_datatype_fp16, GetParam()); +} +#endif // XNN_EXCLUDE_F16_TESTS +TEST_P(BinaryTestF32, reshape_broadcast_2d) { + ReshapeBroadcast2D(xnn_datatype_fp32, GetParam()); +} +TEST_P(BinaryTestS32, reshape_broadcast_2d) { + ReshapeBroadcast2D(xnn_datatype_int32, GetParam()); +} + +#ifndef XNN_EXCLUDE_F16_TESTS +TEST_P(BinaryTestF16, degenerate_dimension) { + DegenerateDimension(xnn_datatype_fp16, GetParam()); +} +#endif // XNN_EXCLUDE_F16_TESTS +TEST_P(BinaryTestF32, degenerate_dimension) { + DegenerateDimension(xnn_datatype_fp32, GetParam()); +} +TEST_P(BinaryTestS32, degenerate_dimension) { + DegenerateDimension(xnn_datatype_int32, GetParam()); +} + +std::string ToString(xnn_binary_operator op) { + return binary_operator_to_string(op); +} + +INSTANTIATE_TEST_SUITE_P(test, BinaryTestQS8, + testing::Values(xnn_binary_add, xnn_binary_subtract, + xnn_binary_multiply), + [](const auto& info) { return ToString(info.param); }); +INSTANTIATE_TEST_SUITE_P(test, BinaryTestQU8, + testing::Values(xnn_binary_add, xnn_binary_subtract, + xnn_binary_multiply), + [](const auto& info) { return ToString(info.param); }); +#ifndef 
XNN_EXCLUDE_F16_TESTS +INSTANTIATE_TEST_SUITE_P(test, BinaryTestF16, + testing::Values(xnn_binary_add, xnn_binary_subtract, + xnn_binary_multiply, xnn_binary_divide, + xnn_binary_maximum, xnn_binary_minimum, + xnn_binary_squared_difference), + [](const auto& info) { return ToString(info.param); }); +#endif +INSTANTIATE_TEST_SUITE_P(test, BinaryTestF32, + testing::Values(xnn_binary_add, xnn_binary_subtract, + xnn_binary_multiply, xnn_binary_divide, + xnn_binary_maximum, xnn_binary_minimum, + xnn_binary_copysign, + xnn_binary_squared_difference), + [](const auto& info) { return ToString(info.param); }); +INSTANTIATE_TEST_SUITE_P(test, BinaryTestS32, + testing::Values(xnn_binary_multiply), + [](const auto& info) { return ToString(info.param); }); diff --git a/test/copysign.cc b/test/copysign.cc deleted file mode 100644 index 538e6c74ba6..00000000000 --- a/test/copysign.cc +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include "xnnpack.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/subgraph.h" -#include "subgraph-binary-tester.h" - -using CopySignTestF32 = BinaryTest; - -TEST_F(CopySignTestF32, define) { - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - std::vector dims = RandomShape(); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, - /*external_id=*/0, /*flags=*/0, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, - /*external_id=*/0, /*flags=*/0, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_copysign(subgraph, input1_id, input2_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_copysign); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->num_inputs, 2); - ASSERT_EQ(node->inputs[0], input1_id); - ASSERT_EQ(node->inputs[1], input2_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(CopySignTestF32, 
matches_operator_api) -{ - std::generate(input1.begin(), input1.end(), [&]() { return f32dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return f32dist(rng); }); - std::fill(operator_output.begin(), operator_output.end(), nanf("")); - std::fill(subgraph_output.begin(), subgraph_output.end(), nanf("")); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. - ASSERT_EQ(xnn_status_success, xnn_create_copysign_nd_f32(/*flags=*/0, &op)); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_copysign_nd_f32( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_copysign_nd_f32(op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, output_dims.size(), output_dims.data(), nullptr, 
/*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_copysign(subgraph, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} diff --git a/test/divide2.cc b/test/divide2.cc deleted file mode 100644 index 8e5c252de68..00000000000 --- a/test/divide2.cc +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include "xnnpack.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/subgraph.h" -#include "subgraph-binary-tester.h" - -using Divide2TestF16 = BinaryTest; -using Divide2TestF32 = BinaryTest; - -TEST_F(Divide2TestF16, matches_operator_api) -{ - std::generate(input1.begin(), input1.end(), [&]() { return f32dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return f32dist(rng); }); - std::fill(operator_output.begin(), operator_output.end(), std::nanf("")); - std::fill(subgraph_output.begin(), subgraph_output.end(), std::nanf("")); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. - const xnn_status status = xnn_create_divide_nd_f16(output_min, output_max, 0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_divide_nd_f16( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_divide_nd_f16(op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_divide(subgraph, output_min, output_max, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(Divide2TestF32, matches_operator_api) -{ - std::generate(input1.begin(), input1.end(), [&]() { return f32dist(rng); }); - 
std::generate(input2.begin(), input2.end(), [&]() { return f32dist(rng); }); - std::fill(operator_output.begin(), operator_output.end(), nanf("")); - std::fill(subgraph_output.begin(), subgraph_output.end(), nanf("")); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. - ASSERT_EQ(xnn_status_success, xnn_create_divide_nd_f32(output_min, output_max, 0, &op)); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_divide_nd_f32( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_divide_nd_f32(op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, 
XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_divide(subgraph, output_min, output_max, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} diff --git a/test/maximum2.cc b/test/maximum2.cc deleted file mode 100644 index 66ada449f1a..00000000000 --- a/test/maximum2.cc +++ /dev/null @@ -1,244 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include - -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/subgraph.h" -#include "subgraph-binary-tester.h" - -using Maximum2TestF16 = BinaryTest; -using Maximum2TestF32 = BinaryTest; - -TEST_F(Maximum2TestF16, define) { - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/0, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/0, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, output_dims.size(), output_dims.data(), nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_maximum2(subgraph, input1_id, input2_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_maximum2); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp16); - ASSERT_EQ(node->num_inputs, 2); - ASSERT_EQ(node->inputs[0], input1_id); - ASSERT_EQ(node->inputs[1], input2_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(Maximum2TestF32, 
define) { - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/0, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/0, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, output_dims.size(), output_dims.data(), nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_maximum2(subgraph, input1_id, input2_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_maximum2); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->num_inputs, 2); - ASSERT_EQ(node->inputs[0], input1_id); - ASSERT_EQ(node->inputs[1], input2_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(Maximum2TestF16, matches_operator_api) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. 
- const xnn_status status = xnn_create_maximum_nd_f16(0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_maximum_nd_f16( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_maximum_nd_f16( - op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_maximum2(subgraph, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, 
nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(Maximum2TestF32, matches_operator_api) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. - ASSERT_EQ(xnn_status_success, xnn_create_maximum_nd_f32(0, &op)); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_maximum_nd_f32( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_maximum_nd_f32( - op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_maximum2(subgraph, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} diff --git a/test/minimum2.cc b/test/minimum2.cc deleted file mode 100644 index d8ec2f1fa7c..00000000000 --- a/test/minimum2.cc +++ /dev/null @@ -1,241 
+0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include - -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/subgraph.h" -#include "subgraph-binary-tester.h" - -using Minimum2TestF16 = BinaryTest; -using Minimum2TestF32 = BinaryTest; - -TEST_F(Minimum2TestF16, define) { - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/0, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/0, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, output_dims.size(), output_dims.data(), nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_minimum2(subgraph, input1_id, input2_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_minimum2); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp16); - ASSERT_EQ(node->num_inputs, 2); - ASSERT_EQ(node->inputs[0], input1_id); - 
ASSERT_EQ(node->inputs[1], input2_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(Minimum2TestF32, define) { - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/0, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/0, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, output_dims.size(), output_dims.data(), nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_minimum2(subgraph, input1_id, input2_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_minimum2); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->num_inputs, 2); - ASSERT_EQ(node->inputs[0], input1_id); - ASSERT_EQ(node->inputs[1], input2_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(Minimum2TestF16, matches_operator_api) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator 
API. - const xnn_status status = xnn_create_minimum_nd_f16(0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - ASSERT_EQ(xnn_status_success, status); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_minimum_nd_f16( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_minimum_nd_f16(op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_minimum2(subgraph, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, 
&runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(Minimum2TestF32, matches_operator_api) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. - ASSERT_EQ(xnn_status_success, xnn_create_minimum_nd_f32(0, &op)); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_minimum_nd_f32( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_minimum_nd_f32(op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_minimum2(subgraph, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} diff --git a/test/multiply2.cc b/test/multiply2.cc deleted file mode 100644 index 4ed45f5189c..00000000000 --- a/test/multiply2.cc +++ /dev/null @@ 
-1,416 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/math.h" -#include "xnnpack/operator.h" -#include "xnnpack/requantization.h" -#include "xnnpack/subgraph.h" -#include "subgraph-binary-tester.h" - -using Multiply2TestQS8 = BinaryTest; -using Multiply2TestQU8 = BinaryTest; -using Multiply2TestF16 = BinaryTest; -using Multiply2TestF32 = BinaryTest; -using MultiplyTestS32 = BinaryTest; - -// forward declare until we know the exact interface. -extern "C" { -enum xnn_status xnn_define_multiply2_v2( - xnn_subgraph_t subgraph, - uint32_t input1_id, - uint32_t input2_id, - uint32_t output_id, - uint32_t flags); -} - -TEST_F(Multiply2TestQS8, matches_operator_api) -{ - const int32_t input1_zero_point = i8dist(rng); - const float input1_scale = scale_dist(rng); - const int32_t input2_zero_point = i8dist(rng); - const float input2_scale = scale_dist(rng); - const int32_t output_zero_point = i8dist(rng); - const float output_scale = scale_dist(rng); - const int8_t quantized_output_min = xnn_qs8_quantize(output_min, output_scale, output_zero_point); - const int8_t quantized_output_max = xnn_qs8_quantize(output_max, output_scale, output_zero_point); - - std::generate(input1.begin(), input1.end(), [&]() { return i8dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return i8dist(rng); }); - std::fill(operator_output.begin(), operator_output.end(), INT8_C(0xA5)); - std::fill(subgraph_output.begin(), subgraph_output.end(), INT8_C(0xA5)); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. 
- ASSERT_EQ( - xnn_status_success, xnn_create_multiply_nd_qs8( - input1_zero_point, input1_scale, input2_zero_point, input2_scale, output_zero_point, - output_scale, quantized_output_min, quantized_output_max, /*flags=*/0, &op)); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_multiply_nd_qs8( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_multiply_nd_qs8(op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, input1_zero_point, input1_scale, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, input2_zero_point, input2_scale, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, output_zero_point, output_scale, output_dims.size(), - output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - 
xnn_define_multiply2(subgraph, output_min, output_max, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(Multiply2TestQU8, matches_operator_api) -{ - const int32_t input1_zero_point = u8dist(rng); - const float input1_scale = scale_dist(rng); - const int32_t input2_zero_point = u8dist(rng); - const float input2_scale = scale_dist(rng); - const int32_t output_zero_point = u8dist(rng); - const float output_scale = scale_dist(rng); - const uint8_t quantized_output_min = xnn_qu8_quantize(output_min, output_scale, output_zero_point); - const uint8_t quantized_output_max = xnn_qu8_quantize(output_max, output_scale, output_zero_point); - - std::generate(input1.begin(), input1.end(), [&]() { return u8dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return u8dist(rng); }); - std::fill(operator_output.begin(), operator_output.end(), UINT8_C(0xA5)); - std::fill(subgraph_output.begin(), subgraph_output.end(), UINT8_C(0xA5)); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. 
- ASSERT_EQ( - xnn_status_success, xnn_create_multiply_nd_qu8( - input1_zero_point, input1_scale, input2_zero_point, input2_scale, output_zero_point, - output_scale, quantized_output_min, quantized_output_max, /*flags=*/0, &op)); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_multiply_nd_qu8( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_multiply_nd_qu8(op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, input1_zero_point, input1_scale, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, input2_zero_point, input2_scale, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, output_zero_point, output_scale, output_dims.size(), - output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - 
xnn_define_multiply2(subgraph, output_min, output_max, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(Multiply2TestF16, matches_operator_api) -{ - std::generate(input1.begin(), input1.end(), [&]() { return f32dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return f32dist(rng); }); - std::fill(operator_output.begin(), operator_output.end(), std::nanf("")); - std::fill(subgraph_output.begin(), subgraph_output.end(), std::nanf("")); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. - const xnn_status status = xnn_create_multiply_nd_f16(output_min, output_max, 0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - ASSERT_EQ(xnn_status_success, status); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_multiply_nd_f16( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_multiply_nd_f16(op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_multiply2(subgraph, output_min, output_max, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(Multiply2TestF32, matches_operator_api) -{ - std::generate(input1.begin(), input1.end(), [&]() { return f32dist(rng); 
}); - std::generate(input2.begin(), input2.end(), [&]() { return f32dist(rng); }); - std::fill(operator_output.begin(), operator_output.end(), nanf("")); - std::fill(subgraph_output.begin(), subgraph_output.end(), nanf("")); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. - ASSERT_EQ(xnn_status_success, xnn_create_multiply_nd_f32(output_min, output_max, 0, &op)); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_multiply_nd_f32( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_multiply_nd_f32(op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, 
XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_multiply2(subgraph, output_min, output_max, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(MultiplyTestS32, matches_operator_api) -{ - std::generate(input1.begin(), input1.end(), [&]() { return s32dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return s32dist(rng); }); - std::fill(operator_output.begin(), operator_output.end(), - std::numeric_limits::max()); - std::fill(subgraph_output.begin(), subgraph_output.end(), - std::numeric_limits::max()); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. - ASSERT_EQ( - xnn_status_success, xnn_create_multiply_nd_s32( - /*flags=*/0, &op)); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_multiply_nd_s32( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ(xnn_status_success, xnn_setup_multiply_nd_s32(op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_int32, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_int32, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_int32, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_multiply2_v2(subgraph, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} diff --git a/test/operator-size.c b/test/operator-size.c index 99abe6296c3..7c4c490489b 100644 --- a/test/operator-size.c +++ 
b/test/operator-size.c @@ -26,12 +26,11 @@ int main(int argc, char** argv) { xnn_run_operator(op, NULL); break; case 1: - xnn_create_add_nd_f32( - 0.0f, 0.0f, - 0, &op); + xnn_create_binary_elementwise_nd( + xnn_binary_add, xnn_datatype_fp32, NULL, NULL, NULL, 0, &op); break; case 2: - xnn_setup_add_nd_f32( + xnn_setup_binary_elementwise_nd( op, NULL, NULL, NULL); break; case 3: @@ -97,14 +96,6 @@ int main(int argc, char** argv) { xnn_setup_deconvolution2d_nhwc_f32( op, NULL, NULL); break; - case 13: - xnn_create_divide_nd_f32( - 0.0f, 0.0f, 0, &op); - break; - case 14: - xnn_setup_divide_nd_f32( - op, NULL, NULL, NULL); - break; case 15: xnn_create_fully_connected_nc_f32( 0, 0, 0, 0, @@ -147,31 +138,6 @@ int main(int argc, char** argv) { xnn_setup_max_pooling2d_nhwc_f32( op, NULL, NULL); break; - case 23: - xnn_create_maximum_nd_f32( - 0, &op); - break; - case 24: - xnn_setup_maximum_nd_f32( - op, NULL, NULL, NULL); - break; - case 25: - xnn_create_minimum_nd_f32( - 0, &op); - break; - case 26: - xnn_setup_minimum_nd_f32( - op, NULL, NULL, NULL); - break; - case 27: - xnn_create_multiply_nd_f32( - 0.0f, 0.0f, - 0, &op); - break; - case 28: - xnn_setup_multiply_nd_f32( - op, NULL, NULL, NULL); - break; case 29: xnn_create_prelu_nc_f32( 0, 0, 0, 0, @@ -208,15 +174,6 @@ int main(int argc, char** argv) { xnn_setup_softmax_nc_f32( op, NULL, NULL); break; - case 37: - xnn_create_subtract_nd_f32( - 0.0f, 0.0f, - 0, &op); - break; - case 38: - xnn_setup_subtract_nd_f32( - op, NULL, NULL, NULL); - break; case 39: xnn_create_channel_shuffle_nc_x32( 0, 0, 0, 0, diff --git a/test/squared-difference.cc b/test/squared-difference.cc deleted file mode 100644 index fb5093a66f8..00000000000 --- a/test/squared-difference.cc +++ /dev/null @@ -1,259 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include "xnnpack.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/subgraph.h" -#include "subgraph-binary-tester.h" - -using SquaredDifferenceTestF16 = BinaryTest; -using SquaredDifferenceTestF32 = BinaryTest; - -TEST_F(SquaredDifferenceTestF16, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/0, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/0, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, output_dims.size(), output_dims.data(), nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_squared_difference(subgraph, input1_id, input2_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_squared_difference); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp16); - ASSERT_EQ(node->num_inputs, 2); - ASSERT_EQ(node->inputs[0], input1_id); - ASSERT_EQ(node->inputs[1], input2_id); - ASSERT_EQ(node->num_outputs, 1); - 
ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(SquaredDifferenceTestF32, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/0, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/0, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, output_dims.size(), output_dims.data(), nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_squared_difference(subgraph, input1_id, input2_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_squared_difference); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->num_inputs, 2); - ASSERT_EQ(node->inputs[0], input1_id); - ASSERT_EQ(node->inputs[1], input2_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(SquaredDifferenceTestF16, matches_operator_api) -{ - std::generate(input1.begin(), input1.end(), [&]() { return f32dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return f32dist(rng); }); - 
std::fill(operator_output.begin(), operator_output.end(), std::nanf("")); - std::fill(subgraph_output.begin(), subgraph_output.end(), std::nanf("")); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. - const xnn_status status = xnn_create_squared_difference_nd_f16(0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - ASSERT_EQ(xnn_status_success, status); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_squared_difference_nd_f16( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_squared_difference_nd_f16(op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, 
&output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_squared_difference(subgraph, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - // Check outputs match. - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(SquaredDifferenceTestF32, matches_operator_api) -{ - std::generate(input1.begin(), input1.end(), [&]() { return f32dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return f32dist(rng); }); - std::fill(operator_output.begin(), operator_output.end(), nanf("")); - std::fill(subgraph_output.begin(), subgraph_output.end(), nanf("")); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. - ASSERT_EQ(xnn_status_success, xnn_create_squared_difference_nd_f32(0, &op)); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_squared_difference_nd_f32( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_squared_difference_nd_f32(op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_squared_difference(subgraph, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - // Check outputs match. 
- ASSERT_EQ(subgraph_output, operator_output); -} diff --git a/test/subgraph-binary-tester.h b/test/subgraph-binary-tester.h deleted file mode 100644 index c024b551693..00000000000 --- a/test/subgraph-binary-tester.h +++ /dev/null @@ -1,161 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/requantization.h" -#include "xnnpack/subgraph.h" -#include "replicable_random_device.h" - -template -class NumericLimits { -public: - static constexpr T min() { return std::numeric_limits::min(); } - static constexpr T max() { return std::numeric_limits::max(); } -}; - -template <> -class NumericLimits { -public: - static xnn_float16 min() { - return -std::numeric_limits::infinity(); - } - static xnn_float16 max() { - return +std::numeric_limits::infinity(); - } -}; - -template class BinaryTest : public ::testing::Test { - protected: - BinaryTest() { - shape_dist = std::uniform_int_distribution(0, XNN_MAX_TENSOR_DIMS); - dim_dist = std::uniform_int_distribution(1, 9); - f32dist = std::uniform_real_distribution(0.01f, 1.0f); - i8dist = - std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()); - u8dist = - std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()); - scale_dist = std::uniform_real_distribution(0.1f, 5.0f); - s32dist = std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()); - } - - void SetUp() override - { - std::vector input1_shape = RandomShape(); - std::vector input2_shape; - std::vector output_shape; - // Create input dimensions. - // Create input 2 with an equal or larger number of dimensions. 
- const size_t input2_num_dims = std::uniform_int_distribution(input1_shape.size(), XNN_MAX_TENSOR_DIMS)(rng); - input2_shape = RandomShape(input2_num_dims); - // Ensure that the inputs dimensions match. - std::copy_backward(input1_shape.begin(), input1_shape.end(), input2_shape.end()); - - // Choose a random dimension to broadcast for each input. - const size_t input1_broadcast_dim = std::uniform_int_distribution(0, input1_shape.size())(rng); - if (input1_broadcast_dim < input1_shape.size()) { - input1_shape[input1_broadcast_dim] = 1; - } - const size_t input2_broadcast_dim = std::uniform_int_distribution(0, input2_shape.size())(rng); - if (input2_broadcast_dim < input2_shape.size()) { - input2_shape[input2_broadcast_dim] = 1; - } - input1_dims.resize(XNN_MAX_TENSOR_DIMS); - input2_dims.resize(XNN_MAX_TENSOR_DIMS); - output_dims.resize(XNN_MAX_TENSOR_DIMS); - - // Calculate generalized shapes. - std::fill(input1_dims.begin(), input1_dims.end(), 1); - std::fill(input2_dims.begin(), input2_dims.end(), 1); - std::copy_backward(input1_shape.cbegin(), input1_shape.cend(), input1_dims.end()); - std::copy_backward(input2_shape.cbegin(), input2_shape.cend(), input2_dims.end()); - for (size_t i = 0; i < XNN_MAX_TENSOR_DIMS; i++) { - if (input1_dims[i] != 1 && input2_dims[i] != 1) { - ASSERT_EQ(input1_dims[i], input2_dims[i]) << "i: " << i; - } - output_dims[i] = std::max(input1_dims[i], input2_dims[i]); - } - - if (f32dist(rng) < 0.5f) { - RemoveLeadingOnes(input1_dims); - } - if (f32dist(rng) < 0.5f) { - RemoveLeadingOnes(input2_dims); - } - while (output_dims.size() > std::max(input1_dims.size(), input2_dims.size())) { - output_dims.erase(output_dims.begin()); - } - - input1 = std::vector(XNN_EXTRA_BYTES / sizeof(T) + NumElements(input1_shape)); - input2 = std::vector(XNN_EXTRA_BYTES / sizeof(T) + NumElements(input2_shape)); - operator_output = std::vector(NumElements(output_dims)); - subgraph_output = std::vector(operator_output.size()); - } - - std::vector 
RandomShape(size_t num_dims) - { - std::vector dims(num_dims); - std::generate(dims.begin(), dims.end(), [&] { return dim_dist(rng); }); - return dims; - } - - std::vector RandomShape() { return RandomShape(shape_dist(rng)); } - - size_t NumElements(std::vector& dims) - { - return std::accumulate(dims.begin(), dims.end(), size_t(1), std::multiplies()); - } - - size_t NumElements(std::array& dims) - { - return std::accumulate(dims.begin(), dims.end(), size_t(1), std::multiplies()); - } - - void RemoveLeadingOnes(std::vector& dims) { - while (!dims.empty()) { - if (dims.front() == 1) { - dims.erase(dims.begin()); - } else { - break; - } - } - } - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution shape_dist; - std::uniform_int_distribution dim_dist; - std::uniform_real_distribution f32dist; - std::uniform_real_distribution scale_dist; - std::uniform_int_distribution i8dist; - std::uniform_int_distribution u8dist; - std::uniform_int_distribution s32dist; - - - T output_min = NumericLimits::min(); - T output_max = NumericLimits::max(); - - std::vector input1_dims; - std::vector input2_dims; - std::vector output_dims; - - std::vector input1; - std::vector input2; - std::vector operator_output; - std::vector subgraph_output; -}; diff --git a/test/subgraph-size.c b/test/subgraph-size.c index dba8ed6ca8f..1dcbdb50176 100644 --- a/test/subgraph-size.c +++ b/test/subgraph-size.c @@ -69,10 +69,7 @@ int main(int argc, char** argv) { 0, 0, 0); break; case 7: - xnn_define_add2(NULL, 0.0f, 0.0f, 0, 0, 0, 0); - break; - case 8: - xnn_define_multiply2(NULL, 0.0f, 0.0f, 0, 0, 0, 0); + xnn_define_binary(NULL, xnn_binary_add, NULL, 0, 0, 0, 0); break; case 9: xnn_define_prelu(NULL, 0, 0, 0, 0); diff --git a/test/subtract2.cc b/test/subtract2.cc deleted file mode 100644 index e57fde788d5..00000000000 --- a/test/subtract2.cc +++ /dev/null @@ -1,339 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// 
LICENSE file in the root directory of this source tree. - -#include -#include - -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/requantization.h" -#include "xnnpack/subgraph.h" -#include "subgraph-binary-tester.h" - -using Subtract2TestQS8 = BinaryTest; -using Subtract2TestQU8 = BinaryTest; -using Subtract2TestF16 = BinaryTest; -using Subtract2TestF32 = BinaryTest; - -TEST_F(Subtract2TestQS8, matches_operator_api) -{ - const int32_t input1_zero_point = i8dist(rng); - const float input1_scale = scale_dist(rng); - const int32_t input2_zero_point = i8dist(rng); - const float input2_scale = scale_dist(rng); - const int32_t output_zero_point = i8dist(rng); - const float output_scale = scale_dist(rng); - const int8_t quantized_output_min = xnn_qs8_quantize(output_min, output_scale, output_zero_point); - const int8_t quantized_output_max = xnn_qs8_quantize(output_max, output_scale, output_zero_point); - - std::generate(input1.begin(), input1.end(), [&]() { return i8dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return i8dist(rng); }); - std::fill(operator_output.begin(), operator_output.end(), INT8_C(0xA5)); - std::fill(subgraph_output.begin(), subgraph_output.end(), INT8_C(0xA5)); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. 
- ASSERT_EQ( - xnn_status_success, xnn_create_subtract_nd_qs8( - input1_zero_point, input1_scale, input2_zero_point, input2_scale, output_zero_point, - output_scale, quantized_output_min, quantized_output_max, /*flags=*/0, &op)); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_subtract_nd_qs8( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_subtract_nd_qs8( - op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, input1_zero_point, input1_scale, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, input2_zero_point, input2_scale, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, output_zero_point, output_scale, output_dims.size(), - output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - 
xnn_define_subtract(subgraph, output_min, output_max, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(Subtract2TestQU8, matches_operator_api) -{ - const int32_t input1_zero_point = u8dist(rng); - const float input1_scale = scale_dist(rng); - const int32_t input2_zero_point = u8dist(rng); - const float input2_scale = scale_dist(rng); - const int32_t output_zero_point = u8dist(rng); - const float output_scale = scale_dist(rng); - const uint8_t quantized_output_min = xnn_qu8_quantize(output_min, output_scale, output_zero_point); - const uint8_t quantized_output_max = xnn_qu8_quantize(output_max, output_scale, output_zero_point); - - std::generate(input1.begin(), input1.end(), [&]() { return u8dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return u8dist(rng); }); - std::fill(operator_output.begin(), operator_output.end(), UINT8_C(0xA5)); - std::fill(subgraph_output.begin(), subgraph_output.end(), UINT8_C(0xA5)); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. 
- ASSERT_EQ( - xnn_status_success, xnn_create_subtract_nd_qu8( - input1_zero_point, input1_scale, input2_zero_point, input2_scale, output_zero_point, - output_scale, quantized_output_min, quantized_output_max, /*flags=*/0, &op)); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_subtract_nd_qu8( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_subtract_nd_qu8( - op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, input1_zero_point, input1_scale, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, input2_zero_point, input2_scale, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, output_zero_point, output_scale, output_dims.size(), - output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - 
xnn_define_subtract(subgraph, output_min, output_max, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(Subtract2TestF16, matches_operator_api) -{ - std::generate(input1.begin(), input1.end(), [&]() { return f32dist(rng); }); - std::generate(input2.begin(), input2.end(), [&]() { return f32dist(rng); }); - std::fill(operator_output.begin(), operator_output.end(), std::nanf("")); - std::fill(subgraph_output.begin(), subgraph_output.end(), std::nanf("")); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. - const xnn_status status = xnn_create_subtract_nd_f16(output_min, output_max, 0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - ASSERT_EQ(xnn_status_success, status); - - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_subtract_nd_f16( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_subtract_nd_f16( - op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_subtract(subgraph, output_min, output_max, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(Subtract2TestF32, matches_operator_api) -{ - std::generate(input1.begin(), input1.end(), [&]() { return f32dist(rng); 
}); - std::generate(input2.begin(), input2.end(), [&]() { return f32dist(rng); }); - std::fill(operator_output.begin(), operator_output.end(), nanf("")); - std::fill(subgraph_output.begin(), subgraph_output.end(), nanf("")); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_operator_t op = nullptr; - - // Call operator API. - ASSERT_EQ(xnn_status_success, xnn_create_subtract_nd_f32(output_min, output_max, 0, &op)); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, xnn_reshape_subtract_nd_f32( - op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), - /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, xnn_setup_subtract_nd_f32( - op, input1.data(), input2.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input1_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input1_dims.size(), input1_dims.data(), nullptr, - /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); - ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); - - uint32_t input2_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input2_dims.size(), input2_dims.data(), nullptr, - /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); - ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, 
XNN_INVALID_NODE_ID); - - ASSERT_EQ( - xnn_status_success, - xnn_define_subtract(subgraph, output_min, output_max, input1_id, input2_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} diff --git a/test/vbinary-microkernel-tester.cc b/test/vbinary-microkernel-tester.cc index 6bd2a04499d..00515ad5713 100644 --- a/test/vbinary-microkernel-tester.cc +++ b/test/vbinary-microkernel-tester.cc @@ -288,15 +288,11 @@ void VBinaryMicrokernelTester::Test( const size_t stride_b = broadcast_b() ? 0 : 1; // Prepare parameters. - xnn_qu8_add_minmax_params quantization_params; - init_params(&quantization_params, a_zero_point(), b_zero_point(), - y_zero_point(), a_scale() / y_scale(), b_scale() / y_scale(), - qmin(), qmax()); - xnn_qu8_add_minmax_params scalar_quantization_params; - xnn_init_qu8_add_minmax_scalar_params( - &scalar_quantization_params, a_zero_point(), b_zero_point(), - y_zero_point(), a_scale() / y_scale(), b_scale() / y_scale(), qmin(), - qmax()); + xnn_qu8_add_minmax_params params; + struct xnn_quantization_params a_quantization = {a_zero_point(), a_scale()}; + struct xnn_quantization_params b_quantization = {b_zero_point(), b_scale()}; + struct xnn_quantization_params y_quantization = {y_zero_point(), y_scale()}; + init_params(¶ms, &a_quantization, &b_quantization, &y_quantization); // Compute reference results. 
for (size_t i = 0; i < batch_size(); i++) { @@ -307,21 +303,16 @@ void VBinaryMicrokernelTester::Test( static_cast(static_cast(b_data[i * stride_b]) - static_cast(b_zero_point())) * (b_scale() / y_scale()); - y_fp[i] = std::min(y_fp[i], static_cast(qmax())); - y_fp[i] = std::max(y_fp[i], static_cast(qmin())); - y_ref[i] = xnn_qu8_quantize_add(a_data[i], b_data[i * stride_b], - scalar_quantization_params); + y_fp[i] = std::min(y_fp[i], static_cast(UINT8_MAX)); + y_fp[i] = std::max(y_fp[i], static_cast(0)); + y_ref[i] = xnn_qu8_quantize_add(a_data[i], b_data[i * stride_b], params); } // Call optimized micro-kernel. - vadd_minmax(batch_size(), a_data, b_data, y.data(), &quantization_params); + vadd_minmax(batch_size(), a_data, b_data, y.data(), ¶ms); // Verify results. for (size_t i = 0; i < batch_size(); i++) { - EXPECT_LE(static_cast(y[i]), static_cast(qmax())) - << "at element " << i << " / " << batch_size(); - EXPECT_GE(static_cast(y[i]), static_cast(qmin())) - << "at element " << i << " / " << batch_size(); EXPECT_NEAR(static_cast(static_cast(y[i])), y_fp[i], 1.0f) << "at element " << i << " / " << batch_size(); EXPECT_EQ(static_cast(y_ref[i]), static_cast(y[i])) @@ -361,9 +352,11 @@ void VBinaryMicrokernelTester::Test( // Prepare parameters. const float product_scale = a_scale() * b_scale(); const float product_output_scale = product_scale / y_scale(); - xnn_qu8_mul_minmax_params quantization_params; - init_params(&quantization_params, a_zero_point(), b_zero_point(), - y_zero_point(), product_output_scale, qmin(), qmax()); + xnn_qu8_mul_minmax_params params; + struct xnn_quantization_params a_quantization = {a_zero_point(), a_scale()}; + struct xnn_quantization_params b_quantization = {b_zero_point(), b_scale()}; + struct xnn_quantization_params y_quantization = {y_zero_point(), y_scale()}; + init_params(¶ms, &a_quantization, &b_quantization, &y_quantization); // Compute reference results. 
for (size_t i = 0; i < batch_size(); i++) { @@ -373,22 +366,16 @@ void VBinaryMicrokernelTester::Test( static_cast(b_zero_point())); y_fp[i] = static_cast(y_zero_point()) + product_output_scale * static_cast(acc); - y_fp[i] = std::min( - y_fp[i], static_cast(static_cast(qmax()))); - y_fp[i] = std::max( - y_fp[i], static_cast(static_cast(qmin()))); - y_ref[i] = xnn_qu8_requantize_fp32(acc, product_output_scale, y_zero_point(), qmin(), qmax()); + y_fp[i] = std::min(y_fp[i], static_cast(UINT8_MAX)); + y_fp[i] = std::max(y_fp[i], static_cast(0)); + y_ref[i] = xnn_qu8_requantize_fp32(acc, product_output_scale, y_zero_point(), 0, UINT8_MAX); } // Call optimized micro-kernel. - vmul_minmax(batch_size(), a_data, b_data, y.data(), &quantization_params); + vmul_minmax(batch_size(), a_data, b_data, y.data(), ¶ms); // Verify results. for (size_t i = 0; i < batch_size(); i++) { - EXPECT_LE(static_cast(y[i]), static_cast(qmax())) - << "at element " << i << " / " << batch_size(); - EXPECT_GE(static_cast(y[i]), static_cast(qmin())) - << "at element " << i << " / " << batch_size(); EXPECT_NEAR(static_cast(static_cast(y[i])), y_fp[i], 1.0f) << "at element " << i << " / " << batch_size(); EXPECT_NEAR(static_cast(y[i]), static_cast(y_ref[i]), 1) @@ -427,20 +414,14 @@ void VBinaryMicrokernelTester::Test( const size_t stride_b = broadcast_b() ? 0 : 1; // Prepare parameters. 
- xnn_qs8_add_minmax_params quantization_params; - init_params( - &quantization_params, static_cast(a_zero_point() - 0x80), - static_cast(b_zero_point() - 0x80), - static_cast(y_zero_point() - 0x80), a_scale() / y_scale(), - b_scale() / y_scale(), static_cast(qmin() - 0x80), - static_cast(qmax() - 0x80)); - xnn_qs8_add_minmax_params scalar_quantization_params; - xnn_init_qs8_add_minmax_scalar_params( - &scalar_quantization_params, static_cast(a_zero_point() - 0x80), - static_cast(b_zero_point() - 0x80), - static_cast(y_zero_point() - 0x80), a_scale() / y_scale(), - b_scale() / y_scale(), static_cast(qmin() - 0x80), - static_cast(qmax() - 0x80)); + xnn_qs8_add_minmax_params params; + struct xnn_quantization_params a_quantization = {a_zero_point() - 0x80, + a_scale()}; + struct xnn_quantization_params b_quantization = {b_zero_point() - 0x80, + b_scale()}; + struct xnn_quantization_params y_quantization = {y_zero_point() - 0x80, + y_scale()}; + init_params(¶ms, &a_quantization, &b_quantization, &y_quantization); // Compute reference results. for (size_t i = 0; i < batch_size(); i++) { @@ -452,23 +433,16 @@ void VBinaryMicrokernelTester::Test( static_cast(static_cast(b_data[i * stride_b]) - static_cast(b_zero_point() - 0x80)) * (b_scale() / y_scale()); - y_fp[i] = std::min( - y_fp[i], static_cast(static_cast(qmax() - 0x80))); - y_fp[i] = std::max( - y_fp[i], static_cast(static_cast(qmin() - 0x80))); - y_ref[i] = xnn_qs8_quantize_add(a_data[i], b_data[i * stride_b], - scalar_quantization_params); + y_fp[i] = std::min(y_fp[i], static_cast(INT8_MAX)); + y_fp[i] = std::max(y_fp[i], static_cast(INT8_MIN)); + y_ref[i] = xnn_qs8_quantize_add(a_data[i], b_data[i * stride_b], params); } // Call optimized micro-kernel. - vadd_minmax(batch_size(), a_data, b_data, y.data(), &quantization_params); + vadd_minmax(batch_size(), a_data, b_data, y.data(), ¶ms); // Verify results. 
for (size_t i = 0; i < batch_size(); i++) { - EXPECT_LE(static_cast(y[i]), static_cast(qmax() - 0x80)) - << "at element " << i << " / " << batch_size(); - EXPECT_GE(static_cast(y[i]), static_cast(qmin() - 0x80)) - << "at element " << i << " / " << batch_size(); EXPECT_EQ(static_cast(y_ref[i]), static_cast(y[i])) << "at element " << i << " / " << batch_size(); EXPECT_NEAR(static_cast(static_cast(y[i])), y_fp[i], 1.0f) @@ -507,17 +481,19 @@ void VBinaryMicrokernelTester::Test( const size_t stride_b = broadcast_b() ? 0 : 1; // Prepare parameters. + xnn_qs8_mul_minmax_params params; + struct xnn_quantization_params a_quantization = {a_zero_point() - 0x80, + a_scale()}; + struct xnn_quantization_params b_quantization = {b_zero_point() - 0x80, + b_scale()}; + struct xnn_quantization_params y_quantization = {y_zero_point() - 0x80, + y_scale()}; + init_params(¶ms, &a_quantization, &b_quantization, &y_quantization); + + // Compute reference results. const float product_scale = a_scale() * b_scale(); const float product_output_scale = product_scale / y_scale(); EXPECT_GE(product_output_scale, 0x1.0p-32f); - xnn_qs8_mul_minmax_params quantization_params; - init_params( - &quantization_params, static_cast(a_zero_point() - 0x80), - static_cast(b_zero_point() - 0x80), - static_cast(y_zero_point() - 0x80), product_output_scale, - static_cast(qmin() - 0x80), static_cast(qmax() - 0x80)); - - // Compute reference results. 
for (size_t i = 0; i < batch_size(); i++) { const int32_t acc = (static_cast(a_data[i]) - static_cast(a_zero_point() - 0x80)) * @@ -525,25 +501,18 @@ void VBinaryMicrokernelTester::Test( static_cast(b_zero_point() - 0x80)); y_fp[i] = static_cast(y_zero_point() - 0x80) + product_output_scale * static_cast(acc); - y_fp[i] = std::min( - y_fp[i], static_cast(static_cast(qmax() - 0x80))); - y_fp[i] = std::max( - y_fp[i], static_cast(static_cast(qmin() - 0x80))); - y_ref[i] = xnn_qs8_requantize_fp32(acc, product_output_scale, - static_cast(y_zero_point() - 0x80), - static_cast(qmin() - 0x80), - static_cast(qmax() - 0x80)); + y_fp[i] = std::min(y_fp[i], static_cast(INT8_MAX)); + y_fp[i] = std::max(y_fp[i], static_cast(INT8_MIN)); + y_ref[i] = xnn_qs8_requantize_fp32( + acc, product_output_scale, static_cast(y_zero_point() - 0x80), + INT8_MIN, INT8_MAX); } // Call optimized micro-kernel. - vmul_minmax(batch_size(), a_data, b_data, y.data(), &quantization_params); + vmul_minmax(batch_size(), a_data, b_data, y.data(), ¶ms); // Verify results. 
for (size_t i = 0; i < batch_size(); i++) { - EXPECT_LE(static_cast(y[i]), static_cast(qmax() - 0x80)) - << "at element " << i << " / " << batch_size(); - EXPECT_GE(static_cast(y[i]), static_cast(qmin() - 0x80)) - << "at element " << i << " / " << batch_size(); EXPECT_NEAR(static_cast(y_ref[i]), static_cast(y[i]), 1) << "at element " << i << " / " << batch_size(); EXPECT_NEAR(static_cast(static_cast(y[i])), y_fp[i], 1.0f) From 61c23c32afc1840c1004de7b1e1be21f4e1a3592 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Tue, 24 Sep 2024 04:47:13 -0700 Subject: [PATCH 37/50] Remove now-unused minmax vbinary microkernels PiperOrigin-RevId: 678192206 --- BUILD.bazel | 30 +- CMakeLists.txt | 30 +- bench/f32-softmax.cc | 17 +- bench/vbinary.cc | 30 +- cmake/gen/avx512f_microkernels.cmake | 40 +- cmake/gen/avx512fp16_microkernels.cmake | 40 +- cmake/gen/avx_microkernels.cmake | 40 +- cmake/gen/f16c_microkernels.cmake | 40 +- cmake/gen/fp16arith_microkernels.cmake | 60 +- cmake/gen/hvx_microkernels.cmake | 42 +- cmake/gen/neon_aarch64_microkernels.cmake | 12 +- cmake/gen/neon_microkernels.cmake | 28 +- .../neonfp16arith_aarch64_microkernels.cmake | 12 +- cmake/gen/neonfp16arith_microkernels.cmake | 28 +- cmake/gen/rvv_microkernels.cmake | 40 +- cmake/gen/scalar_microkernels.cmake | 60 +- cmake/gen/sse_microkernels.cmake | 40 +- cmake/gen/wasm_microkernels.cmake | 80 +- cmake/gen/wasmsimd_microkernels.cmake | 60 - gen/avx512f_microkernels.bzl | 40 +- gen/avx512fp16_microkernels.bzl | 40 +- gen/avx_microkernels.bzl | 40 +- gen/f16c_microkernels.bzl | 40 +- gen/fp16arith_microkernels.bzl | 60 +- gen/hvx_microkernels.bzl | 42 +- gen/neon_aarch64_microkernels.bzl | 12 +- gen/neon_microkernels.bzl | 28 +- gen/neonfp16arith_aarch64_microkernels.bzl | 12 +- gen/neonfp16arith_microkernels.bzl | 28 +- gen/rvv_microkernels.bzl | 40 +- gen/scalar_microkernels.bzl | 60 +- gen/sse_microkernels.bzl | 40 +- gen/wasm_microkernels.bzl | 80 +- gen/wasmsimd_microkernels.bzl | 60 - 
include/xnnpack.h | 14 - scripts/generate-f16-vbinary.sh | 324 ++-- scripts/generate-f32-vbinary.sh | 1014 ++++++------- scripts/generate-tests.sh | 30 +- src/configs/binary-elementwise-config.c | 1352 ++++++++--------- .../{f16-vadd-minmax.h => f16-vadd.h} | 18 +- .../{f16-vaddc-minmax.h => f16-vaddc.h} | 18 +- src/f16-vbinary/f16-vdiv-minmax.h | 49 - src/f16-vbinary/f16-vdiv.h | 49 + src/f16-vbinary/f16-vdivc-minmax.h | 49 - src/f16-vbinary/f16-vdivc.h | 49 + .../{f16-vmul-minmax.h => f16-vmul.h} | 18 +- .../{f16-vmulc-minmax.h => f16-vmulc.h} | 18 +- .../{f16-vrdivc-minmax.h => f16-vrdivc.h} | 18 +- .../{f16-vrsubc-minmax.h => f16-vrsubc.h} | 18 +- .../{f16-vsub-minmax.h => f16-vsub.h} | 18 +- .../{f16-vsubc-minmax.h => f16-vsubc.h} | 18 +- src/f16-vbinary/gen/f16-vadd-avx512fp16-u32.c | 63 + ...12fp16-u32.c => f16-vadd-avx512fp16-u64.c} | 24 +- ...-minmax-f16c-u16.c => f16-vadd-f16c-u16.c} | 21 +- ...dd-minmax-f16c-u8.c => f16-vadd-f16c-u8.c} | 15 +- ...fp16arith-u1.c => f16-vadd-fp16arith-u1.c} | 11 +- ...fp16arith-u2.c => f16-vadd-fp16arith-u2.c} | 17 +- ...fp16arith-u4.c => f16-vadd-fp16arith-u4.c} | 21 +- .../gen/f16-vadd-minmax-avx512fp16-u64.c | 91 -- ...ith-u16.c => f16-vadd-neonfp16arith-u16.c} | 17 +- ...arith-u8.c => f16-vadd-neonfp16arith-u8.c} | 11 +- .../gen/f16-vaddc-avx512fp16-u32.c | 64 + ...2fp16-u32.c => f16-vaddc-avx512fp16-u64.c} | 26 +- ...minmax-f16c-u16.c => f16-vaddc-f16c-u16.c} | 21 +- ...c-minmax-f16c-u8.c => f16-vaddc-f16c-u8.c} | 15 +- ...p16arith-u1.c => f16-vaddc-fp16arith-u1.c} | 11 +- ...p16arith-u2.c => f16-vaddc-fp16arith-u2.c} | 17 +- ...p16arith-u4.c => f16-vaddc-fp16arith-u4.c} | 21 +- .../gen/f16-vaddc-minmax-avx512fp16-u64.c | 89 -- ...th-u16.c => f16-vaddc-neonfp16arith-u16.c} | 17 +- ...rith-u8.c => f16-vaddc-neonfp16arith-u8.c} | 11 +- ...c => f16-vdiv-aarch64-neonfp16arith-u16.c} | 17 +- ....c => f16-vdiv-aarch64-neonfp16arith-u8.c} | 11 +- src/f16-vbinary/gen/f16-vdiv-avx512fp16-u32.c | 63 + ...12fp16-u32.c => 
f16-vdiv-avx512fp16-u64.c} | 24 +- ...-minmax-f16c-u16.c => f16-vdiv-f16c-u16.c} | 21 +- ...iv-minmax-f16c-u8.c => f16-vdiv-f16c-u8.c} | 15 +- ...fp16arith-u1.c => f16-vdiv-fp16arith-u1.c} | 11 +- ...fp16arith-u2.c => f16-vdiv-fp16arith-u2.c} | 17 +- ...fp16arith-u4.c => f16-vdiv-fp16arith-u4.c} | 21 +- .../gen/f16-vdiv-minmax-avx512fp16-u64.c | 91 -- ... => f16-vdivc-aarch64-neonfp16arith-u16.c} | 17 +- ...c => f16-vdivc-aarch64-neonfp16arith-u8.c} | 11 +- .../gen/f16-vdivc-avx512fp16-u32.c | 64 + ...2fp16-u32.c => f16-vdivc-avx512fp16-u64.c} | 26 +- ...minmax-f16c-u16.c => f16-vdivc-f16c-u16.c} | 21 +- ...c-minmax-f16c-u8.c => f16-vdivc-f16c-u8.c} | 15 +- ...p16arith-u1.c => f16-vdivc-fp16arith-u1.c} | 11 +- ...p16arith-u2.c => f16-vdivc-fp16arith-u2.c} | 17 +- ...p16arith-u4.c => f16-vdivc-fp16arith-u4.c} | 21 +- .../gen/f16-vdivc-minmax-avx512fp16-u64.c | 89 -- src/f16-vbinary/gen/f16-vmax-avx512fp16-u32.c | 2 - src/f16-vbinary/gen/f16-vmax-avx512fp16-u64.c | 3 - src/f16-vbinary/gen/f16-vmax-f16c-u16.c | 4 - src/f16-vbinary/gen/f16-vmax-f16c-u8.c | 3 - src/f16-vbinary/gen/f16-vmax-fp16arith-u1.c | 1 - src/f16-vbinary/gen/f16-vmax-fp16arith-u2.c | 2 - src/f16-vbinary/gen/f16-vmax-fp16arith-u4.c | 2 - .../gen/f16-vmax-neonfp16arith-u16.c | 2 - .../gen/f16-vmax-neonfp16arith-u8.c | 1 - .../gen/f16-vmaxc-avx512fp16-u32.c | 1 - .../gen/f16-vmaxc-avx512fp16-u64.c | 2 - src/f16-vbinary/gen/f16-vmaxc-f16c-u16.c | 4 - src/f16-vbinary/gen/f16-vmaxc-f16c-u8.c | 3 - src/f16-vbinary/gen/f16-vmaxc-fp16arith-u1.c | 1 - src/f16-vbinary/gen/f16-vmaxc-fp16arith-u2.c | 2 - src/f16-vbinary/gen/f16-vmaxc-fp16arith-u4.c | 2 - .../gen/f16-vmaxc-neonfp16arith-u16.c | 2 - .../gen/f16-vmaxc-neonfp16arith-u8.c | 1 - src/f16-vbinary/gen/f16-vmin-avx512fp16-u32.c | 2 - src/f16-vbinary/gen/f16-vmin-avx512fp16-u64.c | 3 - src/f16-vbinary/gen/f16-vmin-f16c-u16.c | 4 - src/f16-vbinary/gen/f16-vmin-f16c-u8.c | 3 - src/f16-vbinary/gen/f16-vmin-fp16arith-u1.c | 1 - 
src/f16-vbinary/gen/f16-vmin-fp16arith-u2.c | 2 - src/f16-vbinary/gen/f16-vmin-fp16arith-u4.c | 2 - .../gen/f16-vmin-neonfp16arith-u16.c | 2 - .../gen/f16-vmin-neonfp16arith-u8.c | 1 - .../gen/f16-vminc-avx512fp16-u32.c | 1 - .../gen/f16-vminc-avx512fp16-u64.c | 2 - src/f16-vbinary/gen/f16-vminc-f16c-u16.c | 4 - src/f16-vbinary/gen/f16-vminc-f16c-u8.c | 3 - src/f16-vbinary/gen/f16-vminc-fp16arith-u1.c | 1 - src/f16-vbinary/gen/f16-vminc-fp16arith-u2.c | 2 - src/f16-vbinary/gen/f16-vminc-fp16arith-u4.c | 2 - .../gen/f16-vminc-neonfp16arith-u16.c | 2 - .../gen/f16-vminc-neonfp16arith-u8.c | 1 - src/f16-vbinary/gen/f16-vmul-avx512fp16-u32.c | 63 + ...12fp16-u32.c => f16-vmul-avx512fp16-u64.c} | 24 +- ...-minmax-f16c-u16.c => f16-vmul-f16c-u16.c} | 21 +- ...ul-minmax-f16c-u8.c => f16-vmul-f16c-u8.c} | 15 +- ...fp16arith-u1.c => f16-vmul-fp16arith-u1.c} | 11 +- ...fp16arith-u2.c => f16-vmul-fp16arith-u2.c} | 17 +- ...fp16arith-u4.c => f16-vmul-fp16arith-u4.c} | 21 +- .../gen/f16-vmul-minmax-avx512fp16-u64.c | 91 -- ...ith-u16.c => f16-vmul-neonfp16arith-u16.c} | 17 +- ...arith-u8.c => f16-vmul-neonfp16arith-u8.c} | 11 +- .../gen/f16-vmulc-avx512fp16-u32.c | 64 + ...2fp16-u32.c => f16-vmulc-avx512fp16-u64.c} | 26 +- ...minmax-f16c-u16.c => f16-vmulc-f16c-u16.c} | 21 +- ...c-minmax-f16c-u8.c => f16-vmulc-f16c-u8.c} | 15 +- ...p16arith-u1.c => f16-vmulc-fp16arith-u1.c} | 11 +- ...p16arith-u2.c => f16-vmulc-fp16arith-u2.c} | 17 +- ...p16arith-u4.c => f16-vmulc-fp16arith-u4.c} | 21 +- .../gen/f16-vmulc-minmax-avx512fp16-u64.c | 89 -- ...th-u16.c => f16-vmulc-neonfp16arith-u16.c} | 17 +- ...rith-u8.c => f16-vmulc-neonfp16arith-u8.c} | 11 +- .../gen/f16-vprelu-avx512fp16-u32.c | 2 - .../gen/f16-vprelu-avx512fp16-u64.c | 3 - src/f16-vbinary/gen/f16-vprelu-f16c-u16.c | 4 - src/f16-vbinary/gen/f16-vprelu-f16c-u8.c | 3 - .../gen/f16-vprelu-neonfp16arith-u16.c | 2 - .../gen/f16-vprelu-neonfp16arith-u8.c | 1 - .../gen/f16-vpreluc-avx512fp16-u32.c | 1 - 
.../gen/f16-vpreluc-avx512fp16-u64.c | 2 - src/f16-vbinary/gen/f16-vpreluc-f16c-u16.c | 4 - src/f16-vbinary/gen/f16-vpreluc-f16c-u8.c | 3 - .../gen/f16-vpreluc-neonfp16arith-u16.c | 2 - .../gen/f16-vpreluc-neonfp16arith-u8.c | 1 - ...=> f16-vrdivc-aarch64-neonfp16arith-u16.c} | 17 +- ... => f16-vrdivc-aarch64-neonfp16arith-u8.c} | 11 +- .../gen/f16-vrdivc-avx512fp16-u32.c | 64 + ...fp16-u32.c => f16-vrdivc-avx512fp16-u64.c} | 26 +- ...inmax-f16c-u16.c => f16-vrdivc-f16c-u16.c} | 21 +- ...-minmax-f16c-u8.c => f16-vrdivc-f16c-u8.c} | 15 +- ...16arith-u1.c => f16-vrdivc-fp16arith-u1.c} | 11 +- ...16arith-u2.c => f16-vrdivc-fp16arith-u2.c} | 17 +- ...16arith-u4.c => f16-vrdivc-fp16arith-u4.c} | 21 +- .../gen/f16-vrdivc-minmax-avx512fp16-u64.c | 89 -- .../gen/f16-vrpreluc-avx512fp16-u32.c | 1 - .../gen/f16-vrpreluc-avx512fp16-u64.c | 2 - src/f16-vbinary/gen/f16-vrpreluc-f16c-u16.c | 4 - src/f16-vbinary/gen/f16-vrpreluc-f16c-u8.c | 3 - .../gen/f16-vrpreluc-neonfp16arith-u16.c | 2 - .../gen/f16-vrpreluc-neonfp16arith-u8.c | 1 - .../gen/f16-vrsubc-avx512fp16-u32.c | 64 + ...fp16-u32.c => f16-vrsubc-avx512fp16-u64.c} | 26 +- ...inmax-f16c-u16.c => f16-vrsubc-f16c-u16.c} | 21 +- ...-minmax-f16c-u8.c => f16-vrsubc-f16c-u8.c} | 15 +- ...16arith-u1.c => f16-vrsubc-fp16arith-u1.c} | 11 +- ...16arith-u2.c => f16-vrsubc-fp16arith-u2.c} | 17 +- ...16arith-u4.c => f16-vrsubc-fp16arith-u4.c} | 21 +- .../gen/f16-vrsubc-minmax-avx512fp16-u64.c | 89 -- ...h-u16.c => f16-vrsubc-neonfp16arith-u16.c} | 17 +- ...ith-u8.c => f16-vrsubc-neonfp16arith-u8.c} | 11 +- .../gen/f16-vsqrdiff-avx512fp16-u32.c | 2 - .../gen/f16-vsqrdiff-avx512fp16-u64.c | 3 - src/f16-vbinary/gen/f16-vsqrdiff-f16c-u16.c | 4 - src/f16-vbinary/gen/f16-vsqrdiff-f16c-u8.c | 3 - .../gen/f16-vsqrdiff-fp16arith-u1.c | 1 - .../gen/f16-vsqrdiff-fp16arith-u2.c | 2 - .../gen/f16-vsqrdiff-fp16arith-u4.c | 2 - .../gen/f16-vsqrdiff-neonfp16arith-u16.c | 2 - .../gen/f16-vsqrdiff-neonfp16arith-u8.c | 1 - 
.../gen/f16-vsqrdiffc-avx512fp16-u32.c | 1 - .../gen/f16-vsqrdiffc-avx512fp16-u64.c | 2 - src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u16.c | 4 - src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u8.c | 3 - .../gen/f16-vsqrdiffc-fp16arith-u1.c | 1 - .../gen/f16-vsqrdiffc-fp16arith-u2.c | 2 - .../gen/f16-vsqrdiffc-fp16arith-u4.c | 2 - .../gen/f16-vsqrdiffc-neonfp16arith-u16.c | 2 - .../gen/f16-vsqrdiffc-neonfp16arith-u8.c | 1 - src/f16-vbinary/gen/f16-vsub-avx512fp16-u32.c | 63 + ...12fp16-u32.c => f16-vsub-avx512fp16-u64.c} | 24 +- ...-minmax-f16c-u16.c => f16-vsub-f16c-u16.c} | 21 +- ...ub-minmax-f16c-u8.c => f16-vsub-f16c-u8.c} | 15 +- ...fp16arith-u1.c => f16-vsub-fp16arith-u1.c} | 11 +- ...fp16arith-u2.c => f16-vsub-fp16arith-u2.c} | 17 +- ...fp16arith-u4.c => f16-vsub-fp16arith-u4.c} | 21 +- .../gen/f16-vsub-minmax-avx512fp16-u64.c | 91 -- ...ith-u16.c => f16-vsub-neonfp16arith-u16.c} | 17 +- ...arith-u8.c => f16-vsub-neonfp16arith-u8.c} | 11 +- .../gen/f16-vsubc-avx512fp16-u32.c | 64 + ...2fp16-u32.c => f16-vsubc-avx512fp16-u64.c} | 26 +- ...minmax-f16c-u16.c => f16-vsubc-f16c-u16.c} | 21 +- ...c-minmax-f16c-u8.c => f16-vsubc-f16c-u8.c} | 15 +- ...p16arith-u1.c => f16-vsubc-fp16arith-u1.c} | 11 +- ...p16arith-u2.c => f16-vsubc-fp16arith-u2.c} | 17 +- ...p16arith-u4.c => f16-vsubc-fp16arith-u4.c} | 21 +- .../gen/f16-vsubc-minmax-avx512fp16-u64.c | 89 -- ...th-u16.c => f16-vsubc-neonfp16arith-u16.c} | 17 +- ...rith-u8.c => f16-vsubc-neonfp16arith-u8.c} | 11 +- src/f16-vbinary/vop-avx512fp16.c.in | 25 +- src/f16-vbinary/vop-f16c.c.in | 28 +- src/f16-vbinary/vop-fp16arith.c.in | 30 +- src/f16-vbinary/vop-neonfp16arith.c.in | 24 +- src/f16-vbinary/vopc-avx512fp16.c.in | 24 +- src/f16-vbinary/vopc-f16c.c.in | 28 +- src/f16-vbinary/vopc-fp16arith.c.in | 30 +- src/f16-vbinary/vopc-neonfp16arith.c.in | 24 +- src/f32-vbinary/f32-vadd-minmax.h | 73 - src/f32-vbinary/f32-vadd.h | 32 + src/f32-vbinary/f32-vaddc-minmax.h | 73 - src/f32-vbinary/f32-vaddc.h | 32 + 
src/f32-vbinary/f32-vdiv-minmax.h | 67 - src/f32-vbinary/f32-vdiv.h | 26 + src/f32-vbinary/f32-vdivc-minmax.h | 67 - src/f32-vbinary/f32-vdivc.h | 26 + src/f32-vbinary/f32-vmul-minmax.h | 73 - src/f32-vbinary/f32-vmul.h | 32 + src/f32-vbinary/f32-vmulc-minmax.h | 73 - src/f32-vbinary/f32-vmulc.h | 38 +- src/f32-vbinary/f32-vrdivc-minmax.h | 67 - src/f32-vbinary/f32-vrdivc.h | 26 + src/f32-vbinary/f32-vrsubc-minmax.h | 73 - src/f32-vbinary/f32-vrsubc.h | 32 + src/f32-vbinary/f32-vsub-minmax.h | 73 - src/f32-vbinary/f32-vsub.h | 32 + src/f32-vbinary/f32-vsubc-minmax.h | 73 - src/f32-vbinary/f32-vsubc.h | 32 + ...dd-minmax-avx-u16.c => f32-vadd-avx-u16.c} | 19 +- ...vadd-minmax-avx-u8.c => f32-vadd-avx-u8.c} | 13 +- ...x-avx512f-u16.c => f32-vadd-avx512f-u16.c} | 12 +- ...x-avx512f-u32.c => f32-vadd-avx512f-u32.c} | 18 +- ...-minmax-hvx-u128.c => f32-vadd-hvx-u128.c} | 23 +- ...dd-minmax-hvx-u32.c => f32-vadd-hvx-u32.c} | 13 +- ...dd-minmax-hvx-u64.c => f32-vadd-hvx-u64.c} | 19 +- src/f32-vbinary/gen/f32-vadd-minmax-wasm-u1.c | 41 - src/f32-vbinary/gen/f32-vadd-minmax-wasm-u2.c | 65 - src/f32-vbinary/gen/f32-vadd-minmax-wasm-u4.c | 79 - src/f32-vbinary/gen/f32-vadd-minmax-wasm-u8.c | 103 -- .../gen/f32-vadd-minmax-wasmsimd-arm-u16.c | 103 -- .../gen/f32-vadd-minmax-wasmsimd-arm-u4.c | 69 - .../gen/f32-vadd-minmax-wasmsimd-arm-u8.c | 91 -- .../gen/f32-vadd-minmax-wasmsimd-x86-u16.c | 103 -- .../gen/f32-vadd-minmax-wasmsimd-x86-u4.c | 69 - .../gen/f32-vadd-minmax-wasmsimd-x86-u8.c | 91 -- ...dd-minmax-neon-u4.c => f32-vadd-neon-u4.c} | 13 +- ...dd-minmax-neon-u8.c => f32-vadd-neon-u8.c} | 19 +- ...dd-minmax-rvv-u4v.c => f32-vadd-rvv-u4v.c} | 8 +- ...dd-minmax-rvv-u8v.c => f32-vadd-rvv-u8v.c} | 8 +- src/f32-vbinary/gen/f32-vadd-scalar-u1.c | 1 - src/f32-vbinary/gen/f32-vadd-scalar-u2.c | 2 - src/f32-vbinary/gen/f32-vadd-scalar-u4.c | 2 - src/f32-vbinary/gen/f32-vadd-scalar-u8.c | 2 - ...vadd-minmax-sse-u4.c => f32-vadd-sse-u4.c} | 13 +- ...vadd-minmax-sse-u8.c => 
f32-vadd-sse-u8.c} | 19 +- ...-minmax-scalar-u1.c => f32-vadd-wasm-u1.c} | 9 +- ...-minmax-scalar-u2.c => f32-vadd-wasm-u2.c} | 15 +- ...-minmax-scalar-u4.c => f32-vadd-wasm-u4.c} | 19 +- ...-minmax-scalar-u8.c => f32-vadd-wasm-u8.c} | 27 +- src/f32-vbinary/gen/f32-vadd-wasmsimd-u16.c | 3 - src/f32-vbinary/gen/f32-vadd-wasmsimd-u4.c | 3 - src/f32-vbinary/gen/f32-vadd-wasmsimd-u8.c | 3 - ...dc-minmax-avx-u8.c => f32-vaddc-avx-u16.c} | 25 +- src/f32-vbinary/gen/f32-vaddc-avx-u8.c | 67 + ...-avx512f-u16.c => f32-vaddc-avx512f-u16.c} | 13 +- ...-avx512f-u32.c => f32-vaddc-avx512f-u32.c} | 18 +- ...minmax-hvx-u128.c => f32-vaddc-hvx-u128.c} | 20 +- ...c-minmax-hvx-u32.c => f32-vaddc-hvx-u32.c} | 10 +- ...c-minmax-hvx-u64.c => f32-vaddc-hvx-u64.c} | 16 +- .../gen/f32-vaddc-minmax-avx-u16.c | 94 -- .../gen/f32-vaddc-minmax-wasm-u1.c | 41 - .../gen/f32-vaddc-minmax-wasm-u2.c | 61 - .../gen/f32-vaddc-minmax-wasm-u4.c | 73 - .../gen/f32-vaddc-minmax-wasm-u8.c | 93 -- .../gen/f32-vaddc-minmax-wasmsimd-arm-u16.c | 95 -- .../gen/f32-vaddc-minmax-wasmsimd-arm-u4.c | 66 - .../gen/f32-vaddc-minmax-wasmsimd-arm-u8.c | 85 -- .../gen/f32-vaddc-minmax-wasmsimd-x86-u16.c | 95 -- .../gen/f32-vaddc-minmax-wasmsimd-x86-u4.c | 66 - .../gen/f32-vaddc-minmax-wasmsimd-x86-u8.c | 85 -- ...c-minmax-neon-u4.c => f32-vaddc-neon-u4.c} | 12 +- ...c-minmax-neon-u8.c => f32-vaddc-neon-u8.c} | 18 +- ...c-minmax-rvv-u4v.c => f32-vaddc-rvv-u4v.c} | 8 +- ...c-minmax-rvv-u8v.c => f32-vaddc-rvv-u8v.c} | 8 +- src/f32-vbinary/gen/f32-vaddc-scalar-u2.c | 1 - src/f32-vbinary/gen/f32-vaddc-scalar-u4.c | 1 - src/f32-vbinary/gen/f32-vaddc-scalar-u8.c | 1 - ...ddc-minmax-sse-u4.c => f32-vaddc-sse-u4.c} | 12 +- ...ddc-minmax-sse-u8.c => f32-vaddc-sse-u8.c} | 18 +- ...minmax-scalar-u1.c => f32-vaddc-wasm-u1.c} | 8 +- ...minmax-scalar-u2.c => f32-vaddc-wasm-u2.c} | 14 +- ...minmax-scalar-u4.c => f32-vaddc-wasm-u4.c} | 18 +- ...minmax-scalar-u8.c => f32-vaddc-wasm-u8.c} | 26 +- 
src/f32-vbinary/gen/f32-vaddc-wasmsimd-u16.c | 3 - src/f32-vbinary/gen/f32-vaddc-wasmsimd-u4.c | 2 - src/f32-vbinary/gen/f32-vaddc-wasmsimd-u8.c | 3 - ...4-neon-u4.c => f32-vdiv-aarch64-neon-u4.c} | 13 +- ...4-neon-u8.c => f32-vdiv-aarch64-neon-u8.c} | 19 +- ...iv-minmax-avx-u16.c => f32-vdiv-avx-u16.c} | 19 +- ...vdiv-minmax-avx-u8.c => f32-vdiv-avx-u8.c} | 13 +- ...x-avx512f-u16.c => f32-vdiv-avx512f-u16.c} | 12 +- ...x-avx512f-u32.c => f32-vdiv-avx512f-u32.c} | 18 +- src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u1.c | 41 - src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u2.c | 65 - src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u4.c | 79 - src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u8.c | 103 -- .../gen/f32-vdiv-minmax-wasmsimd-arm-u16.c | 103 -- .../gen/f32-vdiv-minmax-wasmsimd-arm-u4.c | 69 - .../gen/f32-vdiv-minmax-wasmsimd-arm-u8.c | 91 -- .../gen/f32-vdiv-minmax-wasmsimd-x86-u16.c | 103 -- .../gen/f32-vdiv-minmax-wasmsimd-x86-u4.c | 69 - .../gen/f32-vdiv-minmax-wasmsimd-x86-u8.c | 91 -- ...iv-minmax-rvv-u4v.c => f32-vdiv-rvv-u4v.c} | 8 +- ...iv-minmax-rvv-u8v.c => f32-vdiv-rvv-u8v.c} | 8 +- src/f32-vbinary/gen/f32-vdiv-scalar-u1.c | 1 - src/f32-vbinary/gen/f32-vdiv-scalar-u2.c | 2 - src/f32-vbinary/gen/f32-vdiv-scalar-u4.c | 2 - src/f32-vbinary/gen/f32-vdiv-scalar-u8.c | 2 - ...vdiv-minmax-sse-u4.c => f32-vdiv-sse-u4.c} | 13 +- ...vdiv-minmax-sse-u8.c => f32-vdiv-sse-u8.c} | 19 +- ...-minmax-scalar-u1.c => f32-vdiv-wasm-u1.c} | 9 +- ...-minmax-scalar-u2.c => f32-vdiv-wasm-u2.c} | 15 +- ...-minmax-scalar-u4.c => f32-vdiv-wasm-u4.c} | 19 +- ...-minmax-scalar-u8.c => f32-vdiv-wasm-u8.c} | 27 +- src/f32-vbinary/gen/f32-vdiv-wasmsimd-u16.c | 3 - src/f32-vbinary/gen/f32-vdiv-wasmsimd-u4.c | 3 - src/f32-vbinary/gen/f32-vdiv-wasmsimd-u8.c | 3 - ...-neon-u4.c => f32-vdivc-aarch64-neon-u4.c} | 12 +- ...-neon-u8.c => f32-vdivc-aarch64-neon-u8.c} | 18 +- ...vc-minmax-avx-u8.c => f32-vdivc-avx-u16.c} | 25 +- src/f32-vbinary/gen/f32-vdivc-avx-u8.c | 67 + ...-avx512f-u16.c => 
f32-vdivc-avx512f-u16.c} | 13 +- ...-avx512f-u32.c => f32-vdivc-avx512f-u32.c} | 18 +- .../gen/f32-vdivc-minmax-avx-u16.c | 94 -- .../gen/f32-vdivc-minmax-wasm-u1.c | 41 - .../gen/f32-vdivc-minmax-wasm-u2.c | 61 - .../gen/f32-vdivc-minmax-wasm-u4.c | 73 - .../gen/f32-vdivc-minmax-wasm-u8.c | 93 -- .../gen/f32-vdivc-minmax-wasmsimd-arm-u16.c | 95 -- .../gen/f32-vdivc-minmax-wasmsimd-arm-u4.c | 66 - .../gen/f32-vdivc-minmax-wasmsimd-arm-u8.c | 85 -- .../gen/f32-vdivc-minmax-wasmsimd-x86-u16.c | 95 -- .../gen/f32-vdivc-minmax-wasmsimd-x86-u4.c | 66 - .../gen/f32-vdivc-minmax-wasmsimd-x86-u8.c | 85 -- ...c-minmax-rvv-u4v.c => f32-vdivc-rvv-u4v.c} | 8 +- ...c-minmax-rvv-u8v.c => f32-vdivc-rvv-u8v.c} | 8 +- src/f32-vbinary/gen/f32-vdivc-scalar-u2.c | 1 - src/f32-vbinary/gen/f32-vdivc-scalar-u4.c | 1 - src/f32-vbinary/gen/f32-vdivc-scalar-u8.c | 1 - ...ivc-minmax-sse-u4.c => f32-vdivc-sse-u4.c} | 12 +- ...ivc-minmax-sse-u8.c => f32-vdivc-sse-u8.c} | 18 +- ...minmax-scalar-u1.c => f32-vdivc-wasm-u1.c} | 8 +- ...minmax-scalar-u2.c => f32-vdivc-wasm-u2.c} | 14 +- ...minmax-scalar-u4.c => f32-vdivc-wasm-u4.c} | 18 +- ...minmax-scalar-u8.c => f32-vdivc-wasm-u8.c} | 26 +- src/f32-vbinary/gen/f32-vdivc-wasmsimd-u16.c | 3 - src/f32-vbinary/gen/f32-vdivc-wasmsimd-u4.c | 2 - src/f32-vbinary/gen/f32-vdivc-wasmsimd-u8.c | 3 - src/f32-vbinary/gen/f32-vmax-avx-u16.c | 2 - src/f32-vbinary/gen/f32-vmax-avx-u8.c | 1 - src/f32-vbinary/gen/f32-vmax-avx512f-u16.c | 2 - src/f32-vbinary/gen/f32-vmax-avx512f-u32.c | 3 - src/f32-vbinary/gen/f32-vmax-hvx-u128.c | 4 +- src/f32-vbinary/gen/f32-vmax-hvx-u32.c | 3 +- src/f32-vbinary/gen/f32-vmax-hvx-u64.c | 4 +- src/f32-vbinary/gen/f32-vmax-neon-u4.c | 3 - src/f32-vbinary/gen/f32-vmax-neon-u8.c | 4 - src/f32-vbinary/gen/f32-vmax-scalar-u1.c | 1 - src/f32-vbinary/gen/f32-vmax-scalar-u2.c | 2 - src/f32-vbinary/gen/f32-vmax-scalar-u4.c | 2 - src/f32-vbinary/gen/f32-vmax-scalar-u8.c | 2 - src/f32-vbinary/gen/f32-vmax-sse-u4.c | 1 - 
src/f32-vbinary/gen/f32-vmax-sse-u8.c | 2 - src/f32-vbinary/gen/f32-vmax-wasm-u1.c | 1 - src/f32-vbinary/gen/f32-vmax-wasm-u2.c | 2 - src/f32-vbinary/gen/f32-vmax-wasm-u4.c | 2 - src/f32-vbinary/gen/f32-vmax-wasm-u8.c | 2 - .../gen/f32-vmax-wasmsimd-arm-u16.c | 3 - .../gen/f32-vmax-wasmsimd-arm-u4.c | 3 - .../gen/f32-vmax-wasmsimd-arm-u8.c | 3 - .../gen/f32-vmax-wasmsimd-x86-u16.c | 3 - .../gen/f32-vmax-wasmsimd-x86-u4.c | 3 - .../gen/f32-vmax-wasmsimd-x86-u8.c | 3 - src/f32-vbinary/gen/f32-vmaxc-avx-u16.c | 1 - src/f32-vbinary/gen/f32-vmaxc-avx512f-u16.c | 2 +- src/f32-vbinary/gen/f32-vmaxc-avx512f-u32.c | 3 +- src/f32-vbinary/gen/f32-vmaxc-hvx-u128.c | 1 - src/f32-vbinary/gen/f32-vmaxc-hvx-u64.c | 1 - src/f32-vbinary/gen/f32-vmaxc-neon-u4.c | 2 - src/f32-vbinary/gen/f32-vmaxc-neon-u8.c | 3 - src/f32-vbinary/gen/f32-vmaxc-scalar-u2.c | 1 - src/f32-vbinary/gen/f32-vmaxc-scalar-u4.c | 1 - src/f32-vbinary/gen/f32-vmaxc-scalar-u8.c | 1 - src/f32-vbinary/gen/f32-vmaxc-sse-u8.c | 1 - src/f32-vbinary/gen/f32-vmaxc-wasm-u2.c | 1 - src/f32-vbinary/gen/f32-vmaxc-wasm-u4.c | 1 - src/f32-vbinary/gen/f32-vmaxc-wasm-u8.c | 1 - .../gen/f32-vmaxc-wasmsimd-arm-u16.c | 3 - .../gen/f32-vmaxc-wasmsimd-arm-u4.c | 2 - .../gen/f32-vmaxc-wasmsimd-arm-u8.c | 3 - .../gen/f32-vmaxc-wasmsimd-x86-u16.c | 3 - .../gen/f32-vmaxc-wasmsimd-x86-u4.c | 2 - .../gen/f32-vmaxc-wasmsimd-x86-u8.c | 3 - src/f32-vbinary/gen/f32-vmin-avx-u16.c | 2 - src/f32-vbinary/gen/f32-vmin-avx-u8.c | 1 - src/f32-vbinary/gen/f32-vmin-avx512f-u16.c | 2 - src/f32-vbinary/gen/f32-vmin-avx512f-u32.c | 3 - src/f32-vbinary/gen/f32-vmin-hvx-u128.c | 4 +- src/f32-vbinary/gen/f32-vmin-hvx-u32.c | 3 +- src/f32-vbinary/gen/f32-vmin-hvx-u64.c | 4 +- src/f32-vbinary/gen/f32-vmin-neon-u4.c | 3 - src/f32-vbinary/gen/f32-vmin-neon-u8.c | 4 - src/f32-vbinary/gen/f32-vmin-scalar-u1.c | 1 - src/f32-vbinary/gen/f32-vmin-scalar-u2.c | 2 - src/f32-vbinary/gen/f32-vmin-scalar-u4.c | 2 - src/f32-vbinary/gen/f32-vmin-scalar-u8.c | 2 - 
src/f32-vbinary/gen/f32-vmin-sse-u4.c | 1 - src/f32-vbinary/gen/f32-vmin-sse-u8.c | 2 - src/f32-vbinary/gen/f32-vmin-wasm-u1.c | 1 - src/f32-vbinary/gen/f32-vmin-wasm-u2.c | 2 - src/f32-vbinary/gen/f32-vmin-wasm-u4.c | 2 - src/f32-vbinary/gen/f32-vmin-wasm-u8.c | 2 - .../gen/f32-vmin-wasmsimd-arm-u16.c | 3 - .../gen/f32-vmin-wasmsimd-arm-u4.c | 3 - .../gen/f32-vmin-wasmsimd-arm-u8.c | 3 - .../gen/f32-vmin-wasmsimd-x86-u16.c | 3 - .../gen/f32-vmin-wasmsimd-x86-u4.c | 3 - .../gen/f32-vmin-wasmsimd-x86-u8.c | 3 - src/f32-vbinary/gen/f32-vminc-avx-u16.c | 1 - src/f32-vbinary/gen/f32-vminc-avx512f-u16.c | 2 +- src/f32-vbinary/gen/f32-vminc-avx512f-u32.c | 3 +- src/f32-vbinary/gen/f32-vminc-hvx-u128.c | 1 - src/f32-vbinary/gen/f32-vminc-hvx-u64.c | 1 - src/f32-vbinary/gen/f32-vminc-neon-u4.c | 2 - src/f32-vbinary/gen/f32-vminc-neon-u8.c | 3 - src/f32-vbinary/gen/f32-vminc-scalar-u2.c | 1 - src/f32-vbinary/gen/f32-vminc-scalar-u4.c | 1 - src/f32-vbinary/gen/f32-vminc-scalar-u8.c | 1 - src/f32-vbinary/gen/f32-vminc-sse-u8.c | 1 - src/f32-vbinary/gen/f32-vminc-wasm-u2.c | 1 - src/f32-vbinary/gen/f32-vminc-wasm-u4.c | 1 - src/f32-vbinary/gen/f32-vminc-wasm-u8.c | 1 - .../gen/f32-vminc-wasmsimd-arm-u16.c | 3 - .../gen/f32-vminc-wasmsimd-arm-u4.c | 2 - .../gen/f32-vminc-wasmsimd-arm-u8.c | 3 - .../gen/f32-vminc-wasmsimd-x86-u16.c | 3 - .../gen/f32-vminc-wasmsimd-x86-u4.c | 2 - .../gen/f32-vminc-wasmsimd-x86-u8.c | 3 - ...ul-minmax-avx-u16.c => f32-vmul-avx-u16.c} | 19 +- ...vmul-minmax-avx-u8.c => f32-vmul-avx-u8.c} | 13 +- ...x-avx512f-u16.c => f32-vmul-avx512f-u16.c} | 12 +- ...x-avx512f-u32.c => f32-vmul-avx512f-u32.c} | 18 +- ...-minmax-hvx-u128.c => f32-vmul-hvx-u128.c} | 23 +- ...ul-minmax-hvx-u32.c => f32-vmul-hvx-u32.c} | 13 +- ...ul-minmax-hvx-u64.c => f32-vmul-hvx-u64.c} | 19 +- src/f32-vbinary/gen/f32-vmul-minmax-wasm-u1.c | 41 - src/f32-vbinary/gen/f32-vmul-minmax-wasm-u2.c | 65 - src/f32-vbinary/gen/f32-vmul-minmax-wasm-u4.c | 79 - 
src/f32-vbinary/gen/f32-vmul-minmax-wasm-u8.c | 103 -- .../gen/f32-vmul-minmax-wasmsimd-arm-u16.c | 103 -- .../gen/f32-vmul-minmax-wasmsimd-arm-u4.c | 69 - .../gen/f32-vmul-minmax-wasmsimd-arm-u8.c | 91 -- .../gen/f32-vmul-minmax-wasmsimd-x86-u16.c | 103 -- .../gen/f32-vmul-minmax-wasmsimd-x86-u4.c | 69 - .../gen/f32-vmul-minmax-wasmsimd-x86-u8.c | 91 -- ...ul-minmax-neon-u4.c => f32-vmul-neon-u4.c} | 13 +- ...ul-minmax-neon-u8.c => f32-vmul-neon-u8.c} | 19 +- ...ul-minmax-rvv-u4v.c => f32-vmul-rvv-u4v.c} | 8 +- ...ul-minmax-rvv-u8v.c => f32-vmul-rvv-u8v.c} | 8 +- src/f32-vbinary/gen/f32-vmul-scalar-u1.c | 1 - src/f32-vbinary/gen/f32-vmul-scalar-u2.c | 2 - src/f32-vbinary/gen/f32-vmul-scalar-u4.c | 2 - src/f32-vbinary/gen/f32-vmul-scalar-u8.c | 2 - ...vmul-minmax-sse-u4.c => f32-vmul-sse-u4.c} | 13 +- ...vmul-minmax-sse-u8.c => f32-vmul-sse-u8.c} | 19 +- ...-minmax-scalar-u1.c => f32-vmul-wasm-u1.c} | 9 +- ...-minmax-scalar-u2.c => f32-vmul-wasm-u2.c} | 15 +- ...-minmax-scalar-u4.c => f32-vmul-wasm-u4.c} | 19 +- ...-minmax-scalar-u8.c => f32-vmul-wasm-u8.c} | 27 +- src/f32-vbinary/gen/f32-vmul-wasmsimd-u16.c | 3 - src/f32-vbinary/gen/f32-vmul-wasmsimd-u4.c | 3 - src/f32-vbinary/gen/f32-vmul-wasmsimd-u8.c | 3 - ...lc-minmax-avx-u8.c => f32-vmulc-avx-u16.c} | 25 +- src/f32-vbinary/gen/f32-vmulc-avx-u8.c | 67 + ...-avx512f-u16.c => f32-vmulc-avx512f-u16.c} | 13 +- ...-avx512f-u32.c => f32-vmulc-avx512f-u32.c} | 18 +- ...minmax-hvx-u128.c => f32-vmulc-hvx-u128.c} | 20 +- ...c-minmax-hvx-u32.c => f32-vmulc-hvx-u32.c} | 10 +- ...c-minmax-hvx-u64.c => f32-vmulc-hvx-u64.c} | 16 +- .../gen/f32-vmulc-minmax-avx-u16.c | 94 -- .../gen/f32-vmulc-minmax-rvv-u4v.c | 47 - .../gen/f32-vmulc-minmax-rvv-u8v.c | 47 - .../gen/f32-vmulc-minmax-wasm-u1.c | 41 - .../gen/f32-vmulc-minmax-wasm-u2.c | 61 - .../gen/f32-vmulc-minmax-wasm-u4.c | 73 - .../gen/f32-vmulc-minmax-wasm-u8.c | 93 -- .../gen/f32-vmulc-minmax-wasmsimd-arm-u16.c | 95 -- .../gen/f32-vmulc-minmax-wasmsimd-arm-u4.c | 66 - 
.../gen/f32-vmulc-minmax-wasmsimd-arm-u8.c | 85 -- .../gen/f32-vmulc-minmax-wasmsimd-x86-u16.c | 95 -- .../gen/f32-vmulc-minmax-wasmsimd-x86-u4.c | 66 - .../gen/f32-vmulc-minmax-wasmsimd-x86-u8.c | 85 -- ...c-minmax-neon-u4.c => f32-vmulc-neon-u4.c} | 12 +- ...c-minmax-neon-u8.c => f32-vmulc-neon-u8.c} | 18 +- src/f32-vbinary/gen/f32-vmulc-scalar-u2.c | 1 - src/f32-vbinary/gen/f32-vmulc-scalar-u4.c | 1 - src/f32-vbinary/gen/f32-vmulc-scalar-u8.c | 1 - ...ulc-minmax-sse-u4.c => f32-vmulc-sse-u4.c} | 12 +- ...ulc-minmax-sse-u8.c => f32-vmulc-sse-u8.c} | 18 +- ...minmax-scalar-u1.c => f32-vmulc-wasm-u1.c} | 8 +- ...minmax-scalar-u2.c => f32-vmulc-wasm-u2.c} | 14 +- ...minmax-scalar-u4.c => f32-vmulc-wasm-u4.c} | 18 +- ...minmax-scalar-u8.c => f32-vmulc-wasm-u8.c} | 26 +- src/f32-vbinary/gen/f32-vmulc-wasmsimd-u16.c | 3 - src/f32-vbinary/gen/f32-vmulc-wasmsimd-u4.c | 2 - src/f32-vbinary/gen/f32-vmulc-wasmsimd-u8.c | 3 - src/f32-vbinary/gen/f32-vprelu-avx-u16.c | 2 - src/f32-vbinary/gen/f32-vprelu-avx-u8.c | 1 - src/f32-vbinary/gen/f32-vprelu-avx512f-u16.c | 2 - src/f32-vbinary/gen/f32-vprelu-avx512f-u32.c | 3 - src/f32-vbinary/gen/f32-vprelu-neon-u4.c | 3 - src/f32-vbinary/gen/f32-vprelu-neon-u8.c | 4 - src/f32-vbinary/gen/f32-vprelu-scalar-u1.c | 1 - src/f32-vbinary/gen/f32-vprelu-scalar-u2.c | 2 - src/f32-vbinary/gen/f32-vprelu-scalar-u4.c | 2 - src/f32-vbinary/gen/f32-vprelu-scalar-u8.c | 2 - src/f32-vbinary/gen/f32-vprelu-sse2-u4.c | 1 - src/f32-vbinary/gen/f32-vprelu-sse2-u8.c | 2 - src/f32-vbinary/gen/f32-vprelu-sse41-u4.c | 1 - src/f32-vbinary/gen/f32-vprelu-sse41-u8.c | 2 - src/f32-vbinary/gen/f32-vprelu-wasm-u1.c | 1 - src/f32-vbinary/gen/f32-vprelu-wasm-u2.c | 2 - src/f32-vbinary/gen/f32-vprelu-wasm-u4.c | 2 - src/f32-vbinary/gen/f32-vprelu-wasm-u8.c | 2 - .../gen/f32-vprelu-wasmrelaxedsimd-u16.c | 3 - .../gen/f32-vprelu-wasmrelaxedsimd-u4.c | 3 - .../gen/f32-vprelu-wasmrelaxedsimd-u8.c | 3 - src/f32-vbinary/gen/f32-vprelu-wasmsimd-u16.c | 3 - 
src/f32-vbinary/gen/f32-vprelu-wasmsimd-u4.c | 3 - src/f32-vbinary/gen/f32-vprelu-wasmsimd-u8.c | 3 - src/f32-vbinary/gen/f32-vpreluc-avx-u16.c | 1 - src/f32-vbinary/gen/f32-vpreluc-avx512f-u16.c | 2 +- src/f32-vbinary/gen/f32-vpreluc-avx512f-u32.c | 3 +- src/f32-vbinary/gen/f32-vpreluc-neon-u4.c | 2 - src/f32-vbinary/gen/f32-vpreluc-neon-u8.c | 3 - src/f32-vbinary/gen/f32-vpreluc-scalar-u2.c | 1 - src/f32-vbinary/gen/f32-vpreluc-scalar-u4.c | 1 - src/f32-vbinary/gen/f32-vpreluc-scalar-u8.c | 1 - src/f32-vbinary/gen/f32-vpreluc-sse2-u8.c | 1 - src/f32-vbinary/gen/f32-vpreluc-sse41-u8.c | 1 - src/f32-vbinary/gen/f32-vpreluc-wasm-u2.c | 1 - src/f32-vbinary/gen/f32-vpreluc-wasm-u4.c | 1 - src/f32-vbinary/gen/f32-vpreluc-wasm-u8.c | 1 - .../gen/f32-vpreluc-wasmrelaxedsimd-u16.c | 3 - .../gen/f32-vpreluc-wasmrelaxedsimd-u4.c | 2 - .../gen/f32-vpreluc-wasmrelaxedsimd-u8.c | 3 - .../gen/f32-vpreluc-wasmsimd-u16.c | 3 - src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u4.c | 2 - src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u8.c | 3 - ...neon-u4.c => f32-vrdivc-aarch64-neon-u4.c} | 12 +- ...neon-u8.c => f32-vrdivc-aarch64-neon-u8.c} | 18 +- ...c-minmax-avx-u8.c => f32-vrdivc-avx-u16.c} | 25 +- src/f32-vbinary/gen/f32-vrdivc-avx-u8.c | 67 + ...avx512f-u16.c => f32-vrdivc-avx512f-u16.c} | 13 +- ...avx512f-u32.c => f32-vrdivc-avx512f-u32.c} | 18 +- .../gen/f32-vrdivc-minmax-avx-u16.c | 94 -- .../gen/f32-vrdivc-minmax-wasm-u1.c | 41 - .../gen/f32-vrdivc-minmax-wasm-u2.c | 61 - .../gen/f32-vrdivc-minmax-wasm-u4.c | 73 - .../gen/f32-vrdivc-minmax-wasm-u8.c | 93 -- .../gen/f32-vrdivc-minmax-wasmsimd-arm-u16.c | 95 -- .../gen/f32-vrdivc-minmax-wasmsimd-arm-u4.c | 66 - .../gen/f32-vrdivc-minmax-wasmsimd-arm-u8.c | 85 -- .../gen/f32-vrdivc-minmax-wasmsimd-x86-u16.c | 95 -- .../gen/f32-vrdivc-minmax-wasmsimd-x86-u4.c | 66 - .../gen/f32-vrdivc-minmax-wasmsimd-x86-u8.c | 85 -- ...-minmax-rvv-u4v.c => f32-vrdivc-rvv-u4v.c} | 8 +- ...-minmax-rvv-u8v.c => f32-vrdivc-rvv-u8v.c} | 8 +- 
src/f32-vbinary/gen/f32-vrdivc-scalar-u2.c | 1 - src/f32-vbinary/gen/f32-vrdivc-scalar-u4.c | 1 - src/f32-vbinary/gen/f32-vrdivc-scalar-u8.c | 1 - ...vc-minmax-sse-u4.c => f32-vrdivc-sse-u4.c} | 12 +- ...vc-minmax-sse-u8.c => f32-vrdivc-sse-u8.c} | 18 +- ...inmax-scalar-u1.c => f32-vrdivc-wasm-u1.c} | 8 +- ...inmax-scalar-u2.c => f32-vrdivc-wasm-u2.c} | 14 +- ...inmax-scalar-u4.c => f32-vrdivc-wasm-u4.c} | 18 +- ...inmax-scalar-u8.c => f32-vrdivc-wasm-u8.c} | 26 +- src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u16.c | 3 - src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u4.c | 2 - src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u8.c | 3 - src/f32-vbinary/gen/f32-vrpreluc-avx-u16.c | 1 - .../gen/f32-vrpreluc-avx512f-u16.c | 2 +- .../gen/f32-vrpreluc-avx512f-u32.c | 3 +- src/f32-vbinary/gen/f32-vrpreluc-neon-u4.c | 2 - src/f32-vbinary/gen/f32-vrpreluc-neon-u8.c | 3 - src/f32-vbinary/gen/f32-vrpreluc-scalar-u2.c | 1 - src/f32-vbinary/gen/f32-vrpreluc-scalar-u4.c | 1 - src/f32-vbinary/gen/f32-vrpreluc-scalar-u8.c | 1 - src/f32-vbinary/gen/f32-vrpreluc-sse2-u8.c | 1 - src/f32-vbinary/gen/f32-vrpreluc-sse41-u8.c | 1 - src/f32-vbinary/gen/f32-vrpreluc-wasm-u2.c | 1 - src/f32-vbinary/gen/f32-vrpreluc-wasm-u4.c | 1 - src/f32-vbinary/gen/f32-vrpreluc-wasm-u8.c | 1 - .../gen/f32-vrpreluc-wasmrelaxedsimd-u16.c | 3 - .../gen/f32-vrpreluc-wasmrelaxedsimd-u4.c | 2 - .../gen/f32-vrpreluc-wasmrelaxedsimd-u8.c | 3 - .../gen/f32-vrpreluc-wasmsimd-u16.c | 3 - .../gen/f32-vrpreluc-wasmsimd-u4.c | 2 - .../gen/f32-vrpreluc-wasmsimd-u8.c | 3 - ...c-minmax-avx-u8.c => f32-vrsubc-avx-u16.c} | 25 +- src/f32-vbinary/gen/f32-vrsubc-avx-u8.c | 67 + ...avx512f-u16.c => f32-vrsubc-avx512f-u16.c} | 13 +- ...avx512f-u32.c => f32-vrsubc-avx512f-u32.c} | 18 +- ...inmax-hvx-u128.c => f32-vrsubc-hvx-u128.c} | 20 +- ...-minmax-hvx-u32.c => f32-vrsubc-hvx-u32.c} | 10 +- ...-minmax-hvx-u64.c => f32-vrsubc-hvx-u64.c} | 16 +- .../gen/f32-vrsubc-minmax-avx-u16.c | 94 -- .../gen/f32-vrsubc-minmax-wasm-u1.c | 41 - 
.../gen/f32-vrsubc-minmax-wasm-u2.c | 61 - .../gen/f32-vrsubc-minmax-wasm-u4.c | 73 - .../gen/f32-vrsubc-minmax-wasm-u8.c | 93 -- .../gen/f32-vrsubc-minmax-wasmsimd-arm-u16.c | 95 -- .../gen/f32-vrsubc-minmax-wasmsimd-arm-u4.c | 66 - .../gen/f32-vrsubc-minmax-wasmsimd-arm-u8.c | 85 -- .../gen/f32-vrsubc-minmax-wasmsimd-x86-u16.c | 95 -- .../gen/f32-vrsubc-minmax-wasmsimd-x86-u4.c | 66 - .../gen/f32-vrsubc-minmax-wasmsimd-x86-u8.c | 85 -- ...-minmax-neon-u4.c => f32-vrsubc-neon-u4.c} | 12 +- ...-minmax-neon-u8.c => f32-vrsubc-neon-u8.c} | 18 +- ...-minmax-rvv-u4v.c => f32-vrsubc-rvv-u4v.c} | 8 +- ...-minmax-rvv-u8v.c => f32-vrsubc-rvv-u8v.c} | 8 +- src/f32-vbinary/gen/f32-vrsubc-scalar-u2.c | 1 - src/f32-vbinary/gen/f32-vrsubc-scalar-u4.c | 1 - src/f32-vbinary/gen/f32-vrsubc-scalar-u8.c | 1 - ...bc-minmax-sse-u4.c => f32-vrsubc-sse-u4.c} | 12 +- ...bc-minmax-sse-u8.c => f32-vrsubc-sse-u8.c} | 18 +- ...inmax-scalar-u1.c => f32-vrsubc-wasm-u1.c} | 8 +- ...inmax-scalar-u2.c => f32-vrsubc-wasm-u2.c} | 14 +- ...inmax-scalar-u4.c => f32-vrsubc-wasm-u4.c} | 18 +- ...inmax-scalar-u8.c => f32-vrsubc-wasm-u8.c} | 26 +- src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u16.c | 3 - src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u4.c | 2 - src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u8.c | 3 - src/f32-vbinary/gen/f32-vsqrdiff-avx-u16.c | 2 - src/f32-vbinary/gen/f32-vsqrdiff-avx-u8.c | 1 - .../gen/f32-vsqrdiff-avx512f-u16.c | 2 - .../gen/f32-vsqrdiff-avx512f-u32.c | 3 - src/f32-vbinary/gen/f32-vsqrdiff-hvx-u128.c | 4 +- src/f32-vbinary/gen/f32-vsqrdiff-hvx-u32.c | 3 +- src/f32-vbinary/gen/f32-vsqrdiff-hvx-u64.c | 4 +- src/f32-vbinary/gen/f32-vsqrdiff-neon-u4.c | 3 - src/f32-vbinary/gen/f32-vsqrdiff-neon-u8.c | 4 - src/f32-vbinary/gen/f32-vsqrdiff-scalar-u1.c | 1 - src/f32-vbinary/gen/f32-vsqrdiff-scalar-u2.c | 2 - src/f32-vbinary/gen/f32-vsqrdiff-scalar-u4.c | 2 - src/f32-vbinary/gen/f32-vsqrdiff-scalar-u8.c | 2 - src/f32-vbinary/gen/f32-vsqrdiff-sse-u4.c | 1 - 
src/f32-vbinary/gen/f32-vsqrdiff-sse-u8.c | 2 - .../gen/f32-vsqrdiff-wasmsimd-u16.c | 3 - .../gen/f32-vsqrdiff-wasmsimd-u4.c | 3 - .../gen/f32-vsqrdiff-wasmsimd-u8.c | 3 - src/f32-vbinary/gen/f32-vsqrdiffc-avx-u16.c | 1 - .../gen/f32-vsqrdiffc-avx512f-u16.c | 2 +- .../gen/f32-vsqrdiffc-avx512f-u32.c | 3 +- src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u128.c | 1 - src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u64.c | 1 - src/f32-vbinary/gen/f32-vsqrdiffc-neon-u4.c | 2 - src/f32-vbinary/gen/f32-vsqrdiffc-neon-u8.c | 3 - src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u2.c | 1 - src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u4.c | 1 - src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u8.c | 1 - src/f32-vbinary/gen/f32-vsqrdiffc-sse-u8.c | 1 - .../gen/f32-vsqrdiffc-wasmsimd-u16.c | 3 - .../gen/f32-vsqrdiffc-wasmsimd-u4.c | 2 - .../gen/f32-vsqrdiffc-wasmsimd-u8.c | 3 - ...ub-minmax-avx-u16.c => f32-vsub-avx-u16.c} | 19 +- ...vsub-minmax-avx-u8.c => f32-vsub-avx-u8.c} | 13 +- ...x-avx512f-u16.c => f32-vsub-avx512f-u16.c} | 12 +- ...x-avx512f-u32.c => f32-vsub-avx512f-u32.c} | 18 +- ...-minmax-hvx-u128.c => f32-vsub-hvx-u128.c} | 23 +- ...ub-minmax-hvx-u32.c => f32-vsub-hvx-u32.c} | 13 +- ...ub-minmax-hvx-u64.c => f32-vsub-hvx-u64.c} | 19 +- src/f32-vbinary/gen/f32-vsub-minmax-wasm-u1.c | 41 - src/f32-vbinary/gen/f32-vsub-minmax-wasm-u2.c | 65 - src/f32-vbinary/gen/f32-vsub-minmax-wasm-u4.c | 79 - src/f32-vbinary/gen/f32-vsub-minmax-wasm-u8.c | 103 -- .../gen/f32-vsub-minmax-wasmsimd-arm-u16.c | 103 -- .../gen/f32-vsub-minmax-wasmsimd-arm-u4.c | 69 - .../gen/f32-vsub-minmax-wasmsimd-arm-u8.c | 91 -- .../gen/f32-vsub-minmax-wasmsimd-x86-u16.c | 103 -- .../gen/f32-vsub-minmax-wasmsimd-x86-u4.c | 69 - .../gen/f32-vsub-minmax-wasmsimd-x86-u8.c | 91 -- ...ub-minmax-neon-u4.c => f32-vsub-neon-u4.c} | 13 +- ...ub-minmax-neon-u8.c => f32-vsub-neon-u8.c} | 19 +- ...ub-minmax-rvv-u4v.c => f32-vsub-rvv-u4v.c} | 8 +- ...ub-minmax-rvv-u8v.c => f32-vsub-rvv-u8v.c} | 8 +- src/f32-vbinary/gen/f32-vsub-scalar-u1.c | 1 - 
src/f32-vbinary/gen/f32-vsub-scalar-u2.c | 2 - src/f32-vbinary/gen/f32-vsub-scalar-u4.c | 2 - src/f32-vbinary/gen/f32-vsub-scalar-u8.c | 2 - ...vsub-minmax-sse-u4.c => f32-vsub-sse-u4.c} | 13 +- ...vsub-minmax-sse-u8.c => f32-vsub-sse-u8.c} | 19 +- ...-minmax-scalar-u1.c => f32-vsub-wasm-u1.c} | 9 +- ...-minmax-scalar-u2.c => f32-vsub-wasm-u2.c} | 15 +- ...-minmax-scalar-u4.c => f32-vsub-wasm-u4.c} | 19 +- ...-minmax-scalar-u8.c => f32-vsub-wasm-u8.c} | 27 +- src/f32-vbinary/gen/f32-vsub-wasmsimd-u16.c | 3 - src/f32-vbinary/gen/f32-vsub-wasmsimd-u4.c | 3 - src/f32-vbinary/gen/f32-vsub-wasmsimd-u8.c | 3 - ...bc-minmax-avx-u8.c => f32-vsubc-avx-u16.c} | 25 +- src/f32-vbinary/gen/f32-vsubc-avx-u8.c | 67 + ...-avx512f-u16.c => f32-vsubc-avx512f-u16.c} | 13 +- ...-avx512f-u32.c => f32-vsubc-avx512f-u32.c} | 18 +- ...minmax-hvx-u128.c => f32-vsubc-hvx-u128.c} | 20 +- ...c-minmax-hvx-u32.c => f32-vsubc-hvx-u32.c} | 10 +- ...c-minmax-hvx-u64.c => f32-vsubc-hvx-u64.c} | 16 +- .../gen/f32-vsubc-minmax-avx-u16.c | 94 -- .../gen/f32-vsubc-minmax-wasm-u1.c | 41 - .../gen/f32-vsubc-minmax-wasm-u2.c | 61 - .../gen/f32-vsubc-minmax-wasm-u4.c | 73 - .../gen/f32-vsubc-minmax-wasm-u8.c | 93 -- .../gen/f32-vsubc-minmax-wasmsimd-arm-u16.c | 95 -- .../gen/f32-vsubc-minmax-wasmsimd-arm-u4.c | 66 - .../gen/f32-vsubc-minmax-wasmsimd-arm-u8.c | 85 -- .../gen/f32-vsubc-minmax-wasmsimd-x86-u16.c | 95 -- .../gen/f32-vsubc-minmax-wasmsimd-x86-u4.c | 66 - .../gen/f32-vsubc-minmax-wasmsimd-x86-u8.c | 85 -- ...c-minmax-neon-u4.c => f32-vsubc-neon-u4.c} | 12 +- ...c-minmax-neon-u8.c => f32-vsubc-neon-u8.c} | 18 +- ...c-minmax-rvv-u4v.c => f32-vsubc-rvv-u4v.c} | 8 +- ...c-minmax-rvv-u8v.c => f32-vsubc-rvv-u8v.c} | 8 +- src/f32-vbinary/gen/f32-vsubc-scalar-u2.c | 1 - src/f32-vbinary/gen/f32-vsubc-scalar-u4.c | 1 - src/f32-vbinary/gen/f32-vsubc-scalar-u8.c | 1 - ...ubc-minmax-sse-u4.c => f32-vsubc-sse-u4.c} | 12 +- ...ubc-minmax-sse-u8.c => f32-vsubc-sse-u8.c} | 18 +- ...minmax-scalar-u1.c => 
f32-vsubc-wasm-u1.c} | 8 +- ...minmax-scalar-u2.c => f32-vsubc-wasm-u2.c} | 14 +- ...minmax-scalar-u4.c => f32-vsubc-wasm-u4.c} | 18 +- ...minmax-scalar-u8.c => f32-vsubc-wasm-u8.c} | 26 +- src/f32-vbinary/gen/f32-vsubc-wasmsimd-u16.c | 3 - src/f32-vbinary/gen/f32-vsubc-wasmsimd-u4.c | 2 - src/f32-vbinary/gen/f32-vsubc-wasmsimd-u8.c | 3 - src/f32-vbinary/vop-avx.c.in | 26 +- src/f32-vbinary/vop-avx512f.c.in | 25 +- src/f32-vbinary/vop-hvx.c.in | 26 +- src/f32-vbinary/vop-neon.c.in | 26 +- src/f32-vbinary/vop-rvv.c.in | 13 +- src/f32-vbinary/vop-scalar.c.in | 27 +- src/f32-vbinary/vop-sse.c.in | 26 +- src/f32-vbinary/vop-wasmsimd.c.in | 44 +- src/f32-vbinary/vopc-avx.c.in | 25 +- src/f32-vbinary/vopc-avx512f.c.in | 25 +- src/f32-vbinary/vopc-hvx.c.in | 23 +- src/f32-vbinary/vopc-neon.c.in | 25 +- src/f32-vbinary/vopc-rvv.c.in | 13 +- src/f32-vbinary/vopc-scalar.c.in | 26 +- src/f32-vbinary/vopc-sse.c.in | 25 +- src/f32-vbinary/vopc-wasmsimd.c.in | 45 +- src/operators/binary-elementwise-nd.c | 38 +- .../scaled-dot-product-attention-nhtc.c | 6 +- src/operators/softmax-nc.c | 16 +- src/xnnpack/config-types.h | 9 +- src/xnnpack/operator.h | 2 +- src/xnnpack/vbinary.h | 30 +- test/BUILD.bazel | 29 +- test/{f16-vadd-minmax.cc => f16-vadd.cc} | 10 +- test/f16-vaddc-minmax.cc | 29 - test/{f32-vaddc-minmax.cc => f16-vaddc.cc} | 10 +- test/f16-vdiv-minmax.cc | 29 - test/{f32-vdiv-minmax.cc => f16-vdiv.cc} | 10 +- test/f16-vdivc-minmax.cc | 29 - test/{f32-vdivc-minmax.cc => f16-vdivc.cc} | 10 +- test/f16-vmul-minmax.cc | 29 - test/{f32-vmul-minmax.cc => f16-vmul.cc} | 10 +- test/f16-vmulc-minmax.cc | 29 - test/{f32-vmulc-minmax.cc => f16-vmulc.cc} | 10 +- test/f16-vrdivc-minmax.cc | 29 - test/{f32-vrdivc-minmax.cc => f16-vrdivc.cc} | 10 +- test/{f16-vrsubc-minmax.cc => f16-vrsubc.cc} | 10 +- test/f16-vsub-minmax.cc | 29 - test/{f32-vsub-minmax.cc => f16-vsub.cc} | 10 +- test/f16-vsubc-minmax.cc | 29 - test/{f32-vsubc-minmax.cc => f16-vsubc.cc} | 10 +- test/f32-vadd-minmax.cc | 
29 - test/f32-vrsubc-minmax.cc | 29 - 809 files changed, 4135 insertions(+), 17296 deletions(-) rename src/f16-vbinary/{f16-vadd-minmax.h => f16-vadd.h} (50%) rename src/f16-vbinary/{f16-vaddc-minmax.h => f16-vaddc.h} (50%) delete mode 100644 src/f16-vbinary/f16-vdiv-minmax.h create mode 100644 src/f16-vbinary/f16-vdiv.h delete mode 100644 src/f16-vbinary/f16-vdivc-minmax.h create mode 100644 src/f16-vbinary/f16-vdivc.h rename src/f16-vbinary/{f16-vmul-minmax.h => f16-vmul.h} (50%) rename src/f16-vbinary/{f16-vmulc-minmax.h => f16-vmulc.h} (50%) rename src/f16-vbinary/{f16-vrdivc-minmax.h => f16-vrdivc.h} (59%) rename src/f16-vbinary/{f16-vrsubc-minmax.h => f16-vrsubc.h} (60%) rename src/f16-vbinary/{f16-vsub-minmax.h => f16-vsub.h} (50%) rename src/f16-vbinary/{f16-vsubc-minmax.h => f16-vsubc.h} (50%) create mode 100644 src/f16-vbinary/gen/f16-vadd-avx512fp16-u32.c rename src/f16-vbinary/gen/{f16-vadd-minmax-avx512fp16-u32.c => f16-vadd-avx512fp16-u64.c} (75%) rename src/f16-vbinary/gen/{f16-vadd-minmax-f16c-u16.c => f16-vadd-f16c-u16.c} (79%) rename src/f16-vbinary/gen/{f16-vadd-minmax-f16c-u8.c => f16-vadd-f16c-u8.c} (78%) rename src/f16-vbinary/gen/{f16-vadd-minmax-fp16arith-u1.c => f16-vadd-fp16arith-u1.c} (74%) rename src/f16-vbinary/gen/{f16-vadd-minmax-fp16arith-u2.c => f16-vadd-fp16arith-u2.c} (71%) rename src/f16-vbinary/gen/{f16-vadd-minmax-fp16arith-u4.c => f16-vadd-fp16arith-u4.c} (70%) delete mode 100644 src/f16-vbinary/gen/f16-vadd-minmax-avx512fp16-u64.c rename src/f16-vbinary/gen/{f16-vadd-minmax-neonfp16arith-u16.c => f16-vadd-neonfp16arith-u16.c} (77%) rename src/f16-vbinary/gen/{f16-vadd-minmax-neonfp16arith-u8.c => f16-vadd-neonfp16arith-u8.c} (77%) create mode 100644 src/f16-vbinary/gen/f16-vaddc-avx512fp16-u32.c rename src/f16-vbinary/gen/{f16-vaddc-minmax-avx512fp16-u32.c => f16-vaddc-avx512fp16-u64.c} (75%) rename src/f16-vbinary/gen/{f16-vaddc-minmax-f16c-u16.c => f16-vaddc-f16c-u16.c} (77%) rename 
src/f16-vbinary/gen/{f16-vaddc-minmax-f16c-u8.c => f16-vaddc-f16c-u8.c} (77%) rename src/f16-vbinary/gen/{f16-vaddc-minmax-fp16arith-u1.c => f16-vaddc-fp16arith-u1.c} (74%) rename src/f16-vbinary/gen/{f16-vaddc-minmax-fp16arith-u2.c => f16-vaddc-fp16arith-u2.c} (70%) rename src/f16-vbinary/gen/{f16-vaddc-minmax-fp16arith-u4.c => f16-vaddc-fp16arith-u4.c} (68%) delete mode 100644 src/f16-vbinary/gen/f16-vaddc-minmax-avx512fp16-u64.c rename src/f16-vbinary/gen/{f16-vaddc-minmax-neonfp16arith-u16.c => f16-vaddc-neonfp16arith-u16.c} (75%) rename src/f16-vbinary/gen/{f16-vaddc-minmax-neonfp16arith-u8.c => f16-vaddc-neonfp16arith-u8.c} (76%) rename src/f16-vbinary/gen/{f16-vdiv-minmax-aarch64-neonfp16arith-u16.c => f16-vdiv-aarch64-neonfp16arith-u16.c} (77%) rename src/f16-vbinary/gen/{f16-vdiv-minmax-aarch64-neonfp16arith-u8.c => f16-vdiv-aarch64-neonfp16arith-u8.c} (77%) create mode 100644 src/f16-vbinary/gen/f16-vdiv-avx512fp16-u32.c rename src/f16-vbinary/gen/{f16-vdiv-minmax-avx512fp16-u32.c => f16-vdiv-avx512fp16-u64.c} (75%) rename src/f16-vbinary/gen/{f16-vdiv-minmax-f16c-u16.c => f16-vdiv-f16c-u16.c} (79%) rename src/f16-vbinary/gen/{f16-vdiv-minmax-f16c-u8.c => f16-vdiv-f16c-u8.c} (78%) rename src/f16-vbinary/gen/{f16-vdiv-minmax-fp16arith-u1.c => f16-vdiv-fp16arith-u1.c} (74%) rename src/f16-vbinary/gen/{f16-vdiv-minmax-fp16arith-u2.c => f16-vdiv-fp16arith-u2.c} (71%) rename src/f16-vbinary/gen/{f16-vdiv-minmax-fp16arith-u4.c => f16-vdiv-fp16arith-u4.c} (70%) delete mode 100644 src/f16-vbinary/gen/f16-vdiv-minmax-avx512fp16-u64.c rename src/f16-vbinary/gen/{f16-vdivc-minmax-aarch64-neonfp16arith-u16.c => f16-vdivc-aarch64-neonfp16arith-u16.c} (75%) rename src/f16-vbinary/gen/{f16-vdivc-minmax-aarch64-neonfp16arith-u8.c => f16-vdivc-aarch64-neonfp16arith-u8.c} (76%) create mode 100644 src/f16-vbinary/gen/f16-vdivc-avx512fp16-u32.c rename src/f16-vbinary/gen/{f16-vdivc-minmax-avx512fp16-u32.c => f16-vdivc-avx512fp16-u64.c} (75%) rename 
src/f16-vbinary/gen/{f16-vdivc-minmax-f16c-u16.c => f16-vdivc-f16c-u16.c} (77%) rename src/f16-vbinary/gen/{f16-vdivc-minmax-f16c-u8.c => f16-vdivc-f16c-u8.c} (77%) rename src/f16-vbinary/gen/{f16-vdivc-minmax-fp16arith-u1.c => f16-vdivc-fp16arith-u1.c} (74%) rename src/f16-vbinary/gen/{f16-vdivc-minmax-fp16arith-u2.c => f16-vdivc-fp16arith-u2.c} (70%) rename src/f16-vbinary/gen/{f16-vdivc-minmax-fp16arith-u4.c => f16-vdivc-fp16arith-u4.c} (68%) delete mode 100644 src/f16-vbinary/gen/f16-vdivc-minmax-avx512fp16-u64.c create mode 100644 src/f16-vbinary/gen/f16-vmul-avx512fp16-u32.c rename src/f16-vbinary/gen/{f16-vmul-minmax-avx512fp16-u32.c => f16-vmul-avx512fp16-u64.c} (75%) rename src/f16-vbinary/gen/{f16-vmul-minmax-f16c-u16.c => f16-vmul-f16c-u16.c} (79%) rename src/f16-vbinary/gen/{f16-vmul-minmax-f16c-u8.c => f16-vmul-f16c-u8.c} (78%) rename src/f16-vbinary/gen/{f16-vmul-minmax-fp16arith-u1.c => f16-vmul-fp16arith-u1.c} (74%) rename src/f16-vbinary/gen/{f16-vmul-minmax-fp16arith-u2.c => f16-vmul-fp16arith-u2.c} (71%) rename src/f16-vbinary/gen/{f16-vmul-minmax-fp16arith-u4.c => f16-vmul-fp16arith-u4.c} (70%) delete mode 100644 src/f16-vbinary/gen/f16-vmul-minmax-avx512fp16-u64.c rename src/f16-vbinary/gen/{f16-vmul-minmax-neonfp16arith-u16.c => f16-vmul-neonfp16arith-u16.c} (77%) rename src/f16-vbinary/gen/{f16-vmul-minmax-neonfp16arith-u8.c => f16-vmul-neonfp16arith-u8.c} (77%) create mode 100644 src/f16-vbinary/gen/f16-vmulc-avx512fp16-u32.c rename src/f16-vbinary/gen/{f16-vmulc-minmax-avx512fp16-u32.c => f16-vmulc-avx512fp16-u64.c} (75%) rename src/f16-vbinary/gen/{f16-vmulc-minmax-f16c-u16.c => f16-vmulc-f16c-u16.c} (77%) rename src/f16-vbinary/gen/{f16-vmulc-minmax-f16c-u8.c => f16-vmulc-f16c-u8.c} (77%) rename src/f16-vbinary/gen/{f16-vmulc-minmax-fp16arith-u1.c => f16-vmulc-fp16arith-u1.c} (74%) rename src/f16-vbinary/gen/{f16-vmulc-minmax-fp16arith-u2.c => f16-vmulc-fp16arith-u2.c} (70%) rename src/f16-vbinary/gen/{f16-vmulc-minmax-fp16arith-u4.c => 
f16-vmulc-fp16arith-u4.c} (68%) delete mode 100644 src/f16-vbinary/gen/f16-vmulc-minmax-avx512fp16-u64.c rename src/f16-vbinary/gen/{f16-vmulc-minmax-neonfp16arith-u16.c => f16-vmulc-neonfp16arith-u16.c} (75%) rename src/f16-vbinary/gen/{f16-vmulc-minmax-neonfp16arith-u8.c => f16-vmulc-neonfp16arith-u8.c} (76%) rename src/f16-vbinary/gen/{f16-vrdivc-minmax-aarch64-neonfp16arith-u16.c => f16-vrdivc-aarch64-neonfp16arith-u16.c} (75%) rename src/f16-vbinary/gen/{f16-vrdivc-minmax-aarch64-neonfp16arith-u8.c => f16-vrdivc-aarch64-neonfp16arith-u8.c} (76%) create mode 100644 src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u32.c rename src/f16-vbinary/gen/{f16-vrdivc-minmax-avx512fp16-u32.c => f16-vrdivc-avx512fp16-u64.c} (75%) rename src/f16-vbinary/gen/{f16-vrdivc-minmax-f16c-u16.c => f16-vrdivc-f16c-u16.c} (77%) rename src/f16-vbinary/gen/{f16-vrdivc-minmax-f16c-u8.c => f16-vrdivc-f16c-u8.c} (77%) rename src/f16-vbinary/gen/{f16-vrdivc-minmax-fp16arith-u1.c => f16-vrdivc-fp16arith-u1.c} (74%) rename src/f16-vbinary/gen/{f16-vrdivc-minmax-fp16arith-u2.c => f16-vrdivc-fp16arith-u2.c} (70%) rename src/f16-vbinary/gen/{f16-vrdivc-minmax-fp16arith-u4.c => f16-vrdivc-fp16arith-u4.c} (68%) delete mode 100644 src/f16-vbinary/gen/f16-vrdivc-minmax-avx512fp16-u64.c create mode 100644 src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u32.c rename src/f16-vbinary/gen/{f16-vrsubc-minmax-avx512fp16-u32.c => f16-vrsubc-avx512fp16-u64.c} (75%) rename src/f16-vbinary/gen/{f16-vrsubc-minmax-f16c-u16.c => f16-vrsubc-f16c-u16.c} (77%) rename src/f16-vbinary/gen/{f16-vrsubc-minmax-f16c-u8.c => f16-vrsubc-f16c-u8.c} (77%) rename src/f16-vbinary/gen/{f16-vrsubc-minmax-fp16arith-u1.c => f16-vrsubc-fp16arith-u1.c} (74%) rename src/f16-vbinary/gen/{f16-vrsubc-minmax-fp16arith-u2.c => f16-vrsubc-fp16arith-u2.c} (70%) rename src/f16-vbinary/gen/{f16-vrsubc-minmax-fp16arith-u4.c => f16-vrsubc-fp16arith-u4.c} (68%) delete mode 100644 src/f16-vbinary/gen/f16-vrsubc-minmax-avx512fp16-u64.c rename 
src/f16-vbinary/gen/{f16-vrsubc-minmax-neonfp16arith-u16.c => f16-vrsubc-neonfp16arith-u16.c} (75%) rename src/f16-vbinary/gen/{f16-vrsubc-minmax-neonfp16arith-u8.c => f16-vrsubc-neonfp16arith-u8.c} (76%) create mode 100644 src/f16-vbinary/gen/f16-vsub-avx512fp16-u32.c rename src/f16-vbinary/gen/{f16-vsub-minmax-avx512fp16-u32.c => f16-vsub-avx512fp16-u64.c} (75%) rename src/f16-vbinary/gen/{f16-vsub-minmax-f16c-u16.c => f16-vsub-f16c-u16.c} (79%) rename src/f16-vbinary/gen/{f16-vsub-minmax-f16c-u8.c => f16-vsub-f16c-u8.c} (78%) rename src/f16-vbinary/gen/{f16-vsub-minmax-fp16arith-u1.c => f16-vsub-fp16arith-u1.c} (74%) rename src/f16-vbinary/gen/{f16-vsub-minmax-fp16arith-u2.c => f16-vsub-fp16arith-u2.c} (71%) rename src/f16-vbinary/gen/{f16-vsub-minmax-fp16arith-u4.c => f16-vsub-fp16arith-u4.c} (70%) delete mode 100644 src/f16-vbinary/gen/f16-vsub-minmax-avx512fp16-u64.c rename src/f16-vbinary/gen/{f16-vsub-minmax-neonfp16arith-u16.c => f16-vsub-neonfp16arith-u16.c} (77%) rename src/f16-vbinary/gen/{f16-vsub-minmax-neonfp16arith-u8.c => f16-vsub-neonfp16arith-u8.c} (77%) create mode 100644 src/f16-vbinary/gen/f16-vsubc-avx512fp16-u32.c rename src/f16-vbinary/gen/{f16-vsubc-minmax-avx512fp16-u32.c => f16-vsubc-avx512fp16-u64.c} (75%) rename src/f16-vbinary/gen/{f16-vsubc-minmax-f16c-u16.c => f16-vsubc-f16c-u16.c} (77%) rename src/f16-vbinary/gen/{f16-vsubc-minmax-f16c-u8.c => f16-vsubc-f16c-u8.c} (77%) rename src/f16-vbinary/gen/{f16-vsubc-minmax-fp16arith-u1.c => f16-vsubc-fp16arith-u1.c} (74%) rename src/f16-vbinary/gen/{f16-vsubc-minmax-fp16arith-u2.c => f16-vsubc-fp16arith-u2.c} (70%) rename src/f16-vbinary/gen/{f16-vsubc-minmax-fp16arith-u4.c => f16-vsubc-fp16arith-u4.c} (68%) delete mode 100644 src/f16-vbinary/gen/f16-vsubc-minmax-avx512fp16-u64.c rename src/f16-vbinary/gen/{f16-vsubc-minmax-neonfp16arith-u16.c => f16-vsubc-neonfp16arith-u16.c} (75%) rename src/f16-vbinary/gen/{f16-vsubc-minmax-neonfp16arith-u8.c => f16-vsubc-neonfp16arith-u8.c} (76%) delete 
mode 100644 src/f32-vbinary/f32-vadd-minmax.h delete mode 100644 src/f32-vbinary/f32-vaddc-minmax.h delete mode 100644 src/f32-vbinary/f32-vdiv-minmax.h delete mode 100644 src/f32-vbinary/f32-vdivc-minmax.h delete mode 100644 src/f32-vbinary/f32-vmul-minmax.h delete mode 100644 src/f32-vbinary/f32-vmulc-minmax.h delete mode 100644 src/f32-vbinary/f32-vrdivc-minmax.h delete mode 100644 src/f32-vbinary/f32-vrsubc-minmax.h delete mode 100644 src/f32-vbinary/f32-vsub-minmax.h delete mode 100644 src/f32-vbinary/f32-vsubc-minmax.h rename src/f32-vbinary/gen/{f32-vadd-minmax-avx-u16.c => f32-vadd-avx-u16.c} (76%) rename src/f32-vbinary/gen/{f32-vadd-minmax-avx-u8.c => f32-vadd-avx-u8.c} (78%) rename src/f32-vbinary/gen/{f32-vadd-minmax-avx512f-u16.c => f32-vadd-avx512f-u16.c} (75%) rename src/f32-vbinary/gen/{f32-vadd-minmax-avx512f-u32.c => f32-vadd-avx512f-u32.c} (74%) rename src/f32-vbinary/gen/{f32-vadd-minmax-hvx-u128.c => f32-vadd-hvx-u128.c} (69%) rename src/f32-vbinary/gen/{f32-vadd-minmax-hvx-u32.c => f32-vadd-hvx-u32.c} (68%) rename src/f32-vbinary/gen/{f32-vadd-minmax-hvx-u64.c => f32-vadd-hvx-u64.c} (69%) delete mode 100644 src/f32-vbinary/gen/f32-vadd-minmax-wasm-u1.c delete mode 100644 src/f32-vbinary/gen/f32-vadd-minmax-wasm-u2.c delete mode 100644 src/f32-vbinary/gen/f32-vadd-minmax-wasm-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vadd-minmax-wasm-u8.c delete mode 100644 src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u8.c delete mode 100644 src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u8.c rename src/f32-vbinary/gen/{f32-vadd-minmax-neon-u4.c => f32-vadd-neon-u4.c} (75%) rename src/f32-vbinary/gen/{f32-vadd-minmax-neon-u8.c => f32-vadd-neon-u8.c} (74%) 
rename src/f32-vbinary/gen/{f32-vadd-minmax-rvv-u4v.c => f32-vadd-rvv-u4v.c} (75%) rename src/f32-vbinary/gen/{f32-vadd-minmax-rvv-u8v.c => f32-vadd-rvv-u8v.c} (75%) rename src/f32-vbinary/gen/{f32-vadd-minmax-sse-u4.c => f32-vadd-sse-u4.c} (72%) rename src/f32-vbinary/gen/{f32-vadd-minmax-sse-u8.c => f32-vadd-sse-u8.c} (72%) rename src/f32-vbinary/gen/{f32-vadd-minmax-scalar-u1.c => f32-vadd-wasm-u1.c} (72%) rename src/f32-vbinary/gen/{f32-vadd-minmax-scalar-u2.c => f32-vadd-wasm-u2.c} (70%) rename src/f32-vbinary/gen/{f32-vadd-minmax-scalar-u4.c => f32-vadd-wasm-u4.c} (68%) rename src/f32-vbinary/gen/{f32-vadd-minmax-scalar-u8.c => f32-vadd-wasm-u8.c} (64%) rename src/f32-vbinary/gen/{f32-vaddc-minmax-avx-u8.c => f32-vaddc-avx-u16.c} (77%) create mode 100644 src/f32-vbinary/gen/f32-vaddc-avx-u8.c rename src/f32-vbinary/gen/{f32-vaddc-minmax-avx512f-u16.c => f32-vaddc-avx512f-u16.c} (75%) rename src/f32-vbinary/gen/{f32-vaddc-minmax-avx512f-u32.c => f32-vaddc-avx512f-u32.c} (73%) rename src/f32-vbinary/gen/{f32-vaddc-minmax-hvx-u128.c => f32-vaddc-hvx-u128.c} (65%) rename src/f32-vbinary/gen/{f32-vaddc-minmax-hvx-u32.c => f32-vaddc-hvx-u32.c} (67%) rename src/f32-vbinary/gen/{f32-vaddc-minmax-hvx-u64.c => f32-vaddc-hvx-u64.c} (67%) delete mode 100644 src/f32-vbinary/gen/f32-vaddc-minmax-avx-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u1.c delete mode 100644 src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u2.c delete mode 100644 src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u8.c delete mode 100644 src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u8.c delete mode 100644 src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u4.c delete mode 100644 
src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u8.c rename src/f32-vbinary/gen/{f32-vaddc-minmax-neon-u4.c => f32-vaddc-neon-u4.c} (74%) rename src/f32-vbinary/gen/{f32-vaddc-minmax-neon-u8.c => f32-vaddc-neon-u8.c} (72%) rename src/f32-vbinary/gen/{f32-vaddc-minmax-rvv-u4v.c => f32-vaddc-rvv-u4v.c} (74%) rename src/f32-vbinary/gen/{f32-vaddc-minmax-rvv-u8v.c => f32-vaddc-rvv-u8v.c} (74%) rename src/f32-vbinary/gen/{f32-vaddc-minmax-sse-u4.c => f32-vaddc-sse-u4.c} (71%) rename src/f32-vbinary/gen/{f32-vaddc-minmax-sse-u8.c => f32-vaddc-sse-u8.c} (70%) rename src/f32-vbinary/gen/{f32-vaddc-minmax-scalar-u1.c => f32-vaddc-wasm-u1.c} (72%) rename src/f32-vbinary/gen/{f32-vaddc-minmax-scalar-u2.c => f32-vaddc-wasm-u2.c} (68%) rename src/f32-vbinary/gen/{f32-vaddc-minmax-scalar-u4.c => f32-vaddc-wasm-u4.c} (65%) rename src/f32-vbinary/gen/{f32-vaddc-minmax-scalar-u8.c => f32-vaddc-wasm-u8.c} (60%) rename src/f32-vbinary/gen/{f32-vdiv-minmax-aarch64-neon-u4.c => f32-vdiv-aarch64-neon-u4.c} (74%) rename src/f32-vbinary/gen/{f32-vdiv-minmax-aarch64-neon-u8.c => f32-vdiv-aarch64-neon-u8.c} (74%) rename src/f32-vbinary/gen/{f32-vdiv-minmax-avx-u16.c => f32-vdiv-avx-u16.c} (76%) rename src/f32-vbinary/gen/{f32-vdiv-minmax-avx-u8.c => f32-vdiv-avx-u8.c} (78%) rename src/f32-vbinary/gen/{f32-vdiv-minmax-avx512f-u16.c => f32-vdiv-avx512f-u16.c} (75%) rename src/f32-vbinary/gen/{f32-vdiv-minmax-avx512f-u32.c => f32-vdiv-avx512f-u32.c} (74%) delete mode 100644 src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u1.c delete mode 100644 src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u2.c delete mode 100644 src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u8.c delete mode 100644 src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u8.c delete mode 100644 
src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u8.c rename src/f32-vbinary/gen/{f32-vdiv-minmax-rvv-u4v.c => f32-vdiv-rvv-u4v.c} (75%) rename src/f32-vbinary/gen/{f32-vdiv-minmax-rvv-u8v.c => f32-vdiv-rvv-u8v.c} (75%) rename src/f32-vbinary/gen/{f32-vdiv-minmax-sse-u4.c => f32-vdiv-sse-u4.c} (72%) rename src/f32-vbinary/gen/{f32-vdiv-minmax-sse-u8.c => f32-vdiv-sse-u8.c} (72%) rename src/f32-vbinary/gen/{f32-vdiv-minmax-scalar-u1.c => f32-vdiv-wasm-u1.c} (72%) rename src/f32-vbinary/gen/{f32-vdiv-minmax-scalar-u2.c => f32-vdiv-wasm-u2.c} (70%) rename src/f32-vbinary/gen/{f32-vdiv-minmax-scalar-u4.c => f32-vdiv-wasm-u4.c} (68%) rename src/f32-vbinary/gen/{f32-vdiv-minmax-scalar-u8.c => f32-vdiv-wasm-u8.c} (64%) rename src/f32-vbinary/gen/{f32-vdivc-minmax-aarch64-neon-u4.c => f32-vdivc-aarch64-neon-u4.c} (73%) rename src/f32-vbinary/gen/{f32-vdivc-minmax-aarch64-neon-u8.c => f32-vdivc-aarch64-neon-u8.c} (72%) rename src/f32-vbinary/gen/{f32-vdivc-minmax-avx-u8.c => f32-vdivc-avx-u16.c} (77%) create mode 100644 src/f32-vbinary/gen/f32-vdivc-avx-u8.c rename src/f32-vbinary/gen/{f32-vdivc-minmax-avx512f-u16.c => f32-vdivc-avx512f-u16.c} (75%) rename src/f32-vbinary/gen/{f32-vdivc-minmax-avx512f-u32.c => f32-vdivc-avx512f-u32.c} (73%) delete mode 100644 src/f32-vbinary/gen/f32-vdivc-minmax-avx-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u1.c delete mode 100644 src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u2.c delete mode 100644 src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u8.c delete mode 100644 src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u8.c delete mode 100644 
src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u8.c rename src/f32-vbinary/gen/{f32-vdivc-minmax-rvv-u4v.c => f32-vdivc-rvv-u4v.c} (74%) rename src/f32-vbinary/gen/{f32-vdivc-minmax-rvv-u8v.c => f32-vdivc-rvv-u8v.c} (74%) rename src/f32-vbinary/gen/{f32-vdivc-minmax-sse-u4.c => f32-vdivc-sse-u4.c} (71%) rename src/f32-vbinary/gen/{f32-vdivc-minmax-sse-u8.c => f32-vdivc-sse-u8.c} (70%) rename src/f32-vbinary/gen/{f32-vdivc-minmax-scalar-u1.c => f32-vdivc-wasm-u1.c} (72%) rename src/f32-vbinary/gen/{f32-vdivc-minmax-scalar-u2.c => f32-vdivc-wasm-u2.c} (68%) rename src/f32-vbinary/gen/{f32-vdivc-minmax-scalar-u4.c => f32-vdivc-wasm-u4.c} (65%) rename src/f32-vbinary/gen/{f32-vdivc-minmax-scalar-u8.c => f32-vdivc-wasm-u8.c} (60%) rename src/f32-vbinary/gen/{f32-vmul-minmax-avx-u16.c => f32-vmul-avx-u16.c} (76%) rename src/f32-vbinary/gen/{f32-vmul-minmax-avx-u8.c => f32-vmul-avx-u8.c} (78%) rename src/f32-vbinary/gen/{f32-vmul-minmax-avx512f-u16.c => f32-vmul-avx512f-u16.c} (75%) rename src/f32-vbinary/gen/{f32-vmul-minmax-avx512f-u32.c => f32-vmul-avx512f-u32.c} (74%) rename src/f32-vbinary/gen/{f32-vmul-minmax-hvx-u128.c => f32-vmul-hvx-u128.c} (69%) rename src/f32-vbinary/gen/{f32-vmul-minmax-hvx-u32.c => f32-vmul-hvx-u32.c} (68%) rename src/f32-vbinary/gen/{f32-vmul-minmax-hvx-u64.c => f32-vmul-hvx-u64.c} (69%) delete mode 100644 src/f32-vbinary/gen/f32-vmul-minmax-wasm-u1.c delete mode 100644 src/f32-vbinary/gen/f32-vmul-minmax-wasm-u2.c delete mode 100644 src/f32-vbinary/gen/f32-vmul-minmax-wasm-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vmul-minmax-wasm-u8.c delete mode 100644 src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u8.c delete mode 100644 
src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u8.c rename src/f32-vbinary/gen/{f32-vmul-minmax-neon-u4.c => f32-vmul-neon-u4.c} (75%) rename src/f32-vbinary/gen/{f32-vmul-minmax-neon-u8.c => f32-vmul-neon-u8.c} (74%) rename src/f32-vbinary/gen/{f32-vmul-minmax-rvv-u4v.c => f32-vmul-rvv-u4v.c} (75%) rename src/f32-vbinary/gen/{f32-vmul-minmax-rvv-u8v.c => f32-vmul-rvv-u8v.c} (75%) rename src/f32-vbinary/gen/{f32-vmul-minmax-sse-u4.c => f32-vmul-sse-u4.c} (72%) rename src/f32-vbinary/gen/{f32-vmul-minmax-sse-u8.c => f32-vmul-sse-u8.c} (72%) rename src/f32-vbinary/gen/{f32-vmul-minmax-scalar-u1.c => f32-vmul-wasm-u1.c} (72%) rename src/f32-vbinary/gen/{f32-vmul-minmax-scalar-u2.c => f32-vmul-wasm-u2.c} (70%) rename src/f32-vbinary/gen/{f32-vmul-minmax-scalar-u4.c => f32-vmul-wasm-u4.c} (68%) rename src/f32-vbinary/gen/{f32-vmul-minmax-scalar-u8.c => f32-vmul-wasm-u8.c} (64%) rename src/f32-vbinary/gen/{f32-vmulc-minmax-avx-u8.c => f32-vmulc-avx-u16.c} (77%) create mode 100644 src/f32-vbinary/gen/f32-vmulc-avx-u8.c rename src/f32-vbinary/gen/{f32-vmulc-minmax-avx512f-u16.c => f32-vmulc-avx512f-u16.c} (75%) rename src/f32-vbinary/gen/{f32-vmulc-minmax-avx512f-u32.c => f32-vmulc-avx512f-u32.c} (73%) rename src/f32-vbinary/gen/{f32-vmulc-minmax-hvx-u128.c => f32-vmulc-hvx-u128.c} (65%) rename src/f32-vbinary/gen/{f32-vmulc-minmax-hvx-u32.c => f32-vmulc-hvx-u32.c} (67%) rename src/f32-vbinary/gen/{f32-vmulc-minmax-hvx-u64.c => f32-vmulc-hvx-u64.c} (67%) delete mode 100644 src/f32-vbinary/gen/f32-vmulc-minmax-avx-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vmulc-minmax-rvv-u4v.c delete mode 100644 src/f32-vbinary/gen/f32-vmulc-minmax-rvv-u8v.c delete mode 100644 src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u1.c delete mode 100644 src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u2.c delete mode 100644 
src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u8.c delete mode 100644 src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u8.c delete mode 100644 src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u8.c rename src/f32-vbinary/gen/{f32-vmulc-minmax-neon-u4.c => f32-vmulc-neon-u4.c} (74%) rename src/f32-vbinary/gen/{f32-vmulc-minmax-neon-u8.c => f32-vmulc-neon-u8.c} (72%) rename src/f32-vbinary/gen/{f32-vmulc-minmax-sse-u4.c => f32-vmulc-sse-u4.c} (71%) rename src/f32-vbinary/gen/{f32-vmulc-minmax-sse-u8.c => f32-vmulc-sse-u8.c} (70%) rename src/f32-vbinary/gen/{f32-vmulc-minmax-scalar-u1.c => f32-vmulc-wasm-u1.c} (72%) rename src/f32-vbinary/gen/{f32-vmulc-minmax-scalar-u2.c => f32-vmulc-wasm-u2.c} (68%) rename src/f32-vbinary/gen/{f32-vmulc-minmax-scalar-u4.c => f32-vmulc-wasm-u4.c} (65%) rename src/f32-vbinary/gen/{f32-vmulc-minmax-scalar-u8.c => f32-vmulc-wasm-u8.c} (60%) rename src/f32-vbinary/gen/{f32-vrdivc-minmax-aarch64-neon-u4.c => f32-vrdivc-aarch64-neon-u4.c} (73%) rename src/f32-vbinary/gen/{f32-vrdivc-minmax-aarch64-neon-u8.c => f32-vrdivc-aarch64-neon-u8.c} (72%) rename src/f32-vbinary/gen/{f32-vrdivc-minmax-avx-u8.c => f32-vrdivc-avx-u16.c} (77%) create mode 100644 src/f32-vbinary/gen/f32-vrdivc-avx-u8.c rename src/f32-vbinary/gen/{f32-vrdivc-minmax-avx512f-u16.c => f32-vrdivc-avx512f-u16.c} (75%) rename src/f32-vbinary/gen/{f32-vrdivc-minmax-avx512f-u32.c => f32-vrdivc-avx512f-u32.c} (73%) delete mode 100644 src/f32-vbinary/gen/f32-vrdivc-minmax-avx-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u1.c delete mode 100644 src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u2.c delete mode 
100644 src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u8.c delete mode 100644 src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u8.c delete mode 100644 src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u8.c rename src/f32-vbinary/gen/{f32-vrdivc-minmax-rvv-u4v.c => f32-vrdivc-rvv-u4v.c} (74%) rename src/f32-vbinary/gen/{f32-vrdivc-minmax-rvv-u8v.c => f32-vrdivc-rvv-u8v.c} (74%) rename src/f32-vbinary/gen/{f32-vrdivc-minmax-sse-u4.c => f32-vrdivc-sse-u4.c} (71%) rename src/f32-vbinary/gen/{f32-vrdivc-minmax-sse-u8.c => f32-vrdivc-sse-u8.c} (70%) rename src/f32-vbinary/gen/{f32-vrdivc-minmax-scalar-u1.c => f32-vrdivc-wasm-u1.c} (72%) rename src/f32-vbinary/gen/{f32-vrdivc-minmax-scalar-u2.c => f32-vrdivc-wasm-u2.c} (68%) rename src/f32-vbinary/gen/{f32-vrdivc-minmax-scalar-u4.c => f32-vrdivc-wasm-u4.c} (65%) rename src/f32-vbinary/gen/{f32-vrdivc-minmax-scalar-u8.c => f32-vrdivc-wasm-u8.c} (60%) rename src/f32-vbinary/gen/{f32-vrsubc-minmax-avx-u8.c => f32-vrsubc-avx-u16.c} (77%) create mode 100644 src/f32-vbinary/gen/f32-vrsubc-avx-u8.c rename src/f32-vbinary/gen/{f32-vrsubc-minmax-avx512f-u16.c => f32-vrsubc-avx512f-u16.c} (75%) rename src/f32-vbinary/gen/{f32-vrsubc-minmax-avx512f-u32.c => f32-vrsubc-avx512f-u32.c} (73%) rename src/f32-vbinary/gen/{f32-vrsubc-minmax-hvx-u128.c => f32-vrsubc-hvx-u128.c} (65%) rename src/f32-vbinary/gen/{f32-vrsubc-minmax-hvx-u32.c => f32-vrsubc-hvx-u32.c} (67%) rename src/f32-vbinary/gen/{f32-vrsubc-minmax-hvx-u64.c => f32-vrsubc-hvx-u64.c} (67%) delete mode 100644 src/f32-vbinary/gen/f32-vrsubc-minmax-avx-u16.c delete mode 100644 
src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u1.c delete mode 100644 src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u2.c delete mode 100644 src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u8.c delete mode 100644 src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u8.c delete mode 100644 src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u8.c rename src/f32-vbinary/gen/{f32-vrsubc-minmax-neon-u4.c => f32-vrsubc-neon-u4.c} (74%) rename src/f32-vbinary/gen/{f32-vrsubc-minmax-neon-u8.c => f32-vrsubc-neon-u8.c} (72%) rename src/f32-vbinary/gen/{f32-vrsubc-minmax-rvv-u4v.c => f32-vrsubc-rvv-u4v.c} (74%) rename src/f32-vbinary/gen/{f32-vrsubc-minmax-rvv-u8v.c => f32-vrsubc-rvv-u8v.c} (74%) rename src/f32-vbinary/gen/{f32-vrsubc-minmax-sse-u4.c => f32-vrsubc-sse-u4.c} (71%) rename src/f32-vbinary/gen/{f32-vrsubc-minmax-sse-u8.c => f32-vrsubc-sse-u8.c} (70%) rename src/f32-vbinary/gen/{f32-vrsubc-minmax-scalar-u1.c => f32-vrsubc-wasm-u1.c} (72%) rename src/f32-vbinary/gen/{f32-vrsubc-minmax-scalar-u2.c => f32-vrsubc-wasm-u2.c} (68%) rename src/f32-vbinary/gen/{f32-vrsubc-minmax-scalar-u4.c => f32-vrsubc-wasm-u4.c} (65%) rename src/f32-vbinary/gen/{f32-vrsubc-minmax-scalar-u8.c => f32-vrsubc-wasm-u8.c} (60%) rename src/f32-vbinary/gen/{f32-vsub-minmax-avx-u16.c => f32-vsub-avx-u16.c} (76%) rename src/f32-vbinary/gen/{f32-vsub-minmax-avx-u8.c => f32-vsub-avx-u8.c} (78%) rename src/f32-vbinary/gen/{f32-vsub-minmax-avx512f-u16.c => f32-vsub-avx512f-u16.c} (75%) rename src/f32-vbinary/gen/{f32-vsub-minmax-avx512f-u32.c => f32-vsub-avx512f-u32.c} (74%) rename src/f32-vbinary/gen/{f32-vsub-minmax-hvx-u128.c => f32-vsub-hvx-u128.c} 
(69%) rename src/f32-vbinary/gen/{f32-vsub-minmax-hvx-u32.c => f32-vsub-hvx-u32.c} (68%) rename src/f32-vbinary/gen/{f32-vsub-minmax-hvx-u64.c => f32-vsub-hvx-u64.c} (69%) delete mode 100644 src/f32-vbinary/gen/f32-vsub-minmax-wasm-u1.c delete mode 100644 src/f32-vbinary/gen/f32-vsub-minmax-wasm-u2.c delete mode 100644 src/f32-vbinary/gen/f32-vsub-minmax-wasm-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vsub-minmax-wasm-u8.c delete mode 100644 src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u8.c delete mode 100644 src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u8.c rename src/f32-vbinary/gen/{f32-vsub-minmax-neon-u4.c => f32-vsub-neon-u4.c} (75%) rename src/f32-vbinary/gen/{f32-vsub-minmax-neon-u8.c => f32-vsub-neon-u8.c} (74%) rename src/f32-vbinary/gen/{f32-vsub-minmax-rvv-u4v.c => f32-vsub-rvv-u4v.c} (75%) rename src/f32-vbinary/gen/{f32-vsub-minmax-rvv-u8v.c => f32-vsub-rvv-u8v.c} (75%) rename src/f32-vbinary/gen/{f32-vsub-minmax-sse-u4.c => f32-vsub-sse-u4.c} (72%) rename src/f32-vbinary/gen/{f32-vsub-minmax-sse-u8.c => f32-vsub-sse-u8.c} (72%) rename src/f32-vbinary/gen/{f32-vsub-minmax-scalar-u1.c => f32-vsub-wasm-u1.c} (72%) rename src/f32-vbinary/gen/{f32-vsub-minmax-scalar-u2.c => f32-vsub-wasm-u2.c} (70%) rename src/f32-vbinary/gen/{f32-vsub-minmax-scalar-u4.c => f32-vsub-wasm-u4.c} (68%) rename src/f32-vbinary/gen/{f32-vsub-minmax-scalar-u8.c => f32-vsub-wasm-u8.c} (64%) rename src/f32-vbinary/gen/{f32-vsubc-minmax-avx-u8.c => f32-vsubc-avx-u16.c} (77%) create mode 100644 src/f32-vbinary/gen/f32-vsubc-avx-u8.c rename src/f32-vbinary/gen/{f32-vsubc-minmax-avx512f-u16.c => f32-vsubc-avx512f-u16.c} (75%) rename src/f32-vbinary/gen/{f32-vsubc-minmax-avx512f-u32.c 
=> f32-vsubc-avx512f-u32.c} (73%) rename src/f32-vbinary/gen/{f32-vsubc-minmax-hvx-u128.c => f32-vsubc-hvx-u128.c} (65%) rename src/f32-vbinary/gen/{f32-vsubc-minmax-hvx-u32.c => f32-vsubc-hvx-u32.c} (67%) rename src/f32-vbinary/gen/{f32-vsubc-minmax-hvx-u64.c => f32-vsubc-hvx-u64.c} (67%) delete mode 100644 src/f32-vbinary/gen/f32-vsubc-minmax-avx-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u1.c delete mode 100644 src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u2.c delete mode 100644 src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u8.c delete mode 100644 src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u8.c delete mode 100644 src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u16.c delete mode 100644 src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u4.c delete mode 100644 src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u8.c rename src/f32-vbinary/gen/{f32-vsubc-minmax-neon-u4.c => f32-vsubc-neon-u4.c} (74%) rename src/f32-vbinary/gen/{f32-vsubc-minmax-neon-u8.c => f32-vsubc-neon-u8.c} (72%) rename src/f32-vbinary/gen/{f32-vsubc-minmax-rvv-u4v.c => f32-vsubc-rvv-u4v.c} (74%) rename src/f32-vbinary/gen/{f32-vsubc-minmax-rvv-u8v.c => f32-vsubc-rvv-u8v.c} (74%) rename src/f32-vbinary/gen/{f32-vsubc-minmax-sse-u4.c => f32-vsubc-sse-u4.c} (71%) rename src/f32-vbinary/gen/{f32-vsubc-minmax-sse-u8.c => f32-vsubc-sse-u8.c} (70%) rename src/f32-vbinary/gen/{f32-vsubc-minmax-scalar-u1.c => f32-vsubc-wasm-u1.c} (72%) rename src/f32-vbinary/gen/{f32-vsubc-minmax-scalar-u2.c => f32-vsubc-wasm-u2.c} (68%) rename src/f32-vbinary/gen/{f32-vsubc-minmax-scalar-u4.c => f32-vsubc-wasm-u4.c} (65%) rename src/f32-vbinary/gen/{f32-vsubc-minmax-scalar-u8.c => f32-vsubc-wasm-u8.c} (60%) rename test/{f16-vadd-minmax.cc => f16-vadd.cc} (71%) delete mode 100644 
test/f16-vaddc-minmax.cc rename test/{f32-vaddc-minmax.cc => f16-vaddc.cc} (71%) delete mode 100644 test/f16-vdiv-minmax.cc rename test/{f32-vdiv-minmax.cc => f16-vdiv.cc} (71%) delete mode 100644 test/f16-vdivc-minmax.cc rename test/{f32-vdivc-minmax.cc => f16-vdivc.cc} (71%) delete mode 100644 test/f16-vmul-minmax.cc rename test/{f32-vmul-minmax.cc => f16-vmul.cc} (71%) delete mode 100644 test/f16-vmulc-minmax.cc rename test/{f32-vmulc-minmax.cc => f16-vmulc.cc} (71%) delete mode 100644 test/f16-vrdivc-minmax.cc rename test/{f32-vrdivc-minmax.cc => f16-vrdivc.cc} (71%) rename test/{f16-vrsubc-minmax.cc => f16-vrsubc.cc} (71%) delete mode 100644 test/f16-vsub-minmax.cc rename test/{f32-vsub-minmax.cc => f16-vsub.cc} (71%) delete mode 100644 test/f16-vsubc-minmax.cc rename test/{f32-vsubc-minmax.cc => f16-vsubc.cc} (71%) delete mode 100644 test/f32-vadd-minmax.cc delete mode 100644 test/f32-vrsubc-minmax.cc diff --git a/BUILD.bazel b/BUILD.bazel index 46a01dff727..758501e673d 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -154,26 +154,26 @@ MICROKERNEL_DEFS = [ "src/f16-maxpool/f16-maxpool-minmax.h", "src/f16-pavgpool/f16-pavgpool-minmax.h", "src/f16-vabs/f16-vabs.h", - "src/f16-vbinary/f16-vadd-minmax.h", - "src/f16-vbinary/f16-vaddc-minmax.h", + "src/f16-vbinary/f16-vadd.h", + "src/f16-vbinary/f16-vaddc.h", "src/f16-vbinary/f16-vcmul.h", - "src/f16-vbinary/f16-vdiv-minmax.h", - "src/f16-vbinary/f16-vdivc-minmax.h", + "src/f16-vbinary/f16-vdiv.h", + "src/f16-vbinary/f16-vdivc.h", "src/f16-vbinary/f16-vmax.h", "src/f16-vbinary/f16-vmaxc.h", "src/f16-vbinary/f16-vmin.h", "src/f16-vbinary/f16-vminc.h", - "src/f16-vbinary/f16-vmul-minmax.h", - "src/f16-vbinary/f16-vmulc-minmax.h", + "src/f16-vbinary/f16-vmul.h", + "src/f16-vbinary/f16-vmulc.h", "src/f16-vbinary/f16-vprelu.h", "src/f16-vbinary/f16-vpreluc.h", - "src/f16-vbinary/f16-vrdivc-minmax.h", + "src/f16-vbinary/f16-vrdivc.h", "src/f16-vbinary/f16-vrpreluc.h", - "src/f16-vbinary/f16-vrsubc-minmax.h", + 
"src/f16-vbinary/f16-vrsubc.h", "src/f16-vbinary/f16-vsqrdiff.h", "src/f16-vbinary/f16-vsqrdiffc.h", - "src/f16-vbinary/f16-vsub-minmax.h", - "src/f16-vbinary/f16-vsubc-minmax.h", + "src/f16-vbinary/f16-vsub.h", + "src/f16-vbinary/f16-vsubc.h", "src/f16-vclamp/f16-vclamp.h", "src/f16-velu/f16-velu.h", "src/f16-vhswish/f16-vhswish.h", @@ -192,38 +192,28 @@ MICROKERNEL_DEFS = [ "src/f32-maxpool/f32-maxpool-minmax.h", "src/f32-pavgpool/f32-pavgpool-minmax.h", "src/f32-vabs/f32-vabs.h", - "src/f32-vbinary/f32-vadd-minmax.h", "src/f32-vbinary/f32-vadd.h", - "src/f32-vbinary/f32-vaddc-minmax.h", "src/f32-vbinary/f32-vaddc.h", "src/f32-vbinary/f32-vcmul.h", "src/f32-vbinary/f32-vcopysign.h", "src/f32-vbinary/f32-vcopysignc.h", - "src/f32-vbinary/f32-vdiv-minmax.h", "src/f32-vbinary/f32-vdiv.h", - "src/f32-vbinary/f32-vdivc-minmax.h", "src/f32-vbinary/f32-vdivc.h", "src/f32-vbinary/f32-vmax.h", "src/f32-vbinary/f32-vmaxc.h", "src/f32-vbinary/f32-vmin.h", "src/f32-vbinary/f32-vminc.h", - "src/f32-vbinary/f32-vmul-minmax.h", "src/f32-vbinary/f32-vmul.h", - "src/f32-vbinary/f32-vmulc-minmax.h", "src/f32-vbinary/f32-vmulc.h", "src/f32-vbinary/f32-vprelu.h", "src/f32-vbinary/f32-vpreluc.h", "src/f32-vbinary/f32-vrcopysignc.h", - "src/f32-vbinary/f32-vrdivc-minmax.h", "src/f32-vbinary/f32-vrdivc.h", "src/f32-vbinary/f32-vrpreluc.h", - "src/f32-vbinary/f32-vrsubc-minmax.h", "src/f32-vbinary/f32-vrsubc.h", "src/f32-vbinary/f32-vsqrdiff.h", "src/f32-vbinary/f32-vsqrdiffc.h", - "src/f32-vbinary/f32-vsub-minmax.h", "src/f32-vbinary/f32-vsub.h", - "src/f32-vbinary/f32-vsubc-minmax.h", "src/f32-vbinary/f32-vsubc.h", "src/f32-vclamp/f32-vclamp.h", "src/f32-velu/f32-velu.h", diff --git a/CMakeLists.txt b/CMakeLists.txt index 0fbe62f50ae..dfda24ca714 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1643,57 +1643,47 @@ IF(XNNPACK_BUILD_TESTS) ENDFOREACH() SET(MICROKERNEL_VBINARY_UNIT_TESTS - f16-vadd-minmax - f16-vaddc-minmax - f16-vdiv-minmax - f16-vdivc-minmax + f16-vadd + f16-vaddc 
+ f16-vdiv + f16-vdivc f16-vmax f16-vmaxc f16-vmin f16-vminc - f16-vmul-minmax - f16-vmulc-minmax + f16-vmul + f16-vmulc f16-vprelu f16-vpreluc f16-vrpreluc - f16-vrdivc-minmax - f16-vrsubc-minmax + f16-vrdivc + f16-vrsubc f16-vsqrdiff f16-vsqrdiffc - f16-vsub-minmax - f16-vsubc-minmax + f16-vsub + f16-vsubc f32-vadd - f32-vadd-minmax f32-vaddc - f32-vaddc-minmax f32-vcopysign f32-vcopysignc f32-vdiv - f32-vdiv-minmax f32-vdivc - f32-vdivc-minmax f32-vmax f32-vmaxc f32-vmin f32-vminc f32-vmul - f32-vmul-minmax f32-vmulc - f32-vmulc-minmax f32-vprelu f32-vpreluc f32-vrpreluc f32-vrcopysignc f32-vrdivc - f32-vrdivc-minmax f32-vrsubc - f32-vrsubc-minmax f32-vsqrdiff f32-vsqrdiffc f32-vsub - f32-vsub-minmax f32-vsubc - f32-vsubc-minmax qs8-vadd-minmax qs8-vaddc-minmax qs8-vmul-minmax-fp32 diff --git a/bench/f32-softmax.cc b/bench/f32-softmax.cc index 78f562e7062..6f9752bdc6c 100644 --- a/bench/f32-softmax.cc +++ b/bench/f32-softmax.cc @@ -275,8 +275,7 @@ static void ThreePassSoftMaxWithReloading( xnn_init_f32_default_params_fn init_rmax_params, xnn_f32_raddstoreexpminusmax_ukernel_fn raddstoreexpminusmax, xnn_init_f32_expminus_params_fn init_expminus_params, - xnn_f32_vbinary_minmax_ukernel_fn vmulc, - xnn_init_f32_minmax_params_fn init_minmax_params, + xnn_f32_vbinary_ukernel_fn vmulc, benchmark::utils::IsaCheckFunction isa_check = nullptr) { if (isa_check != nullptr && !isa_check(state)) { @@ -302,15 +301,12 @@ static void ThreePassSoftMaxWithReloading( xnn_f32_default_params rmax_params; xnn_f32_expminus_params expminus_params; - xnn_f32_minmax_params minmax_params; if (init_rmax_params) { init_rmax_params(&rmax_params); } if (init_expminus_params) { init_expminus_params(&expminus_params); } - assert(init_minmax_params); - init_minmax_params(&minmax_params, -INFINITY, INFINITY); size_t buffer_index = 0; for (auto _ : state) { @@ -325,7 +321,7 @@ static void ThreePassSoftMaxWithReloading( float y_sum = nanf(""); raddstoreexpminusmax(elements * sizeof(float), 
x.data(), &x_max, y.data() + packed_elements * buffer_index, &y_sum, &expminus_params); const float inv_y_sum = 1.0f / y_sum; - vmulc(elements * sizeof(float), y.data() + packed_elements * buffer_index, &inv_y_sum, y.data() + packed_elements * buffer_index, &minmax_params); + vmulc(elements * sizeof(float), y.data() + packed_elements * buffer_index, &inv_y_sum, y.data() + packed_elements * buffer_index, nullptr); const auto end = std::chrono::high_resolution_clock::now(); const auto elapsed_seconds = @@ -438,8 +434,7 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { (xnn_init_f32_default_params_fn) nullptr, xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc2, nullptr, - xnn_f32_vmulc_minmax_ukernel__avx_u16, - xnn_init_f32_minmax_scalar_params, + xnn_f32_vmulc_ukernel__avx_u16, benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseManualTime(); BENCHMARK_CAPTURE(TwoPassSoftMax, avx512f_p5_scalef, @@ -457,8 +452,7 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { (xnn_init_f32_default_params_fn) nullptr, xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128_acc2, nullptr, - xnn_f32_vmulc_minmax_ukernel__avx512f_u32, - xnn_init_f32_minmax_scalar_params, + xnn_f32_vmulc_ukernel__avx512f_u32, benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime(); #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -468,8 +462,7 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { (xnn_init_f32_default_params_fn) nullptr, xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u4v, nullptr, - xnn_f32_vmulc_minmax_ukernel__rvv_u8v, - xnn_init_f32_minmax_scalar_params, + xnn_f32_vmulc_ukernel__rvv_u8v, benchmark::utils::CheckRVV)->Apply(CharacteristicArguments)->UseManualTime(); #endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV diff --git a/bench/vbinary.cc b/bench/vbinary.cc index 3e7f6332502..29c955477d7 100644 --- a/bench/vbinary.cc +++ b/bench/vbinary.cc @@ -156,56 
+156,46 @@ static void vbinary(benchmark::State& state, uint64_t arch_flags, ->Apply( \ benchmark::utils::BinaryElementwiseParameters) \ ->UseRealTime(); -#include "src/f16-vbinary/f16-vadd-minmax.h" -#include "src/f16-vbinary/f16-vaddc-minmax.h" -#include "src/f16-vbinary/f16-vdiv-minmax.h" -#include "src/f16-vbinary/f16-vdivc-minmax.h" +#include "src/f16-vbinary/f16-vadd.h" +#include "src/f16-vbinary/f16-vaddc.h" +#include "src/f16-vbinary/f16-vdiv.h" +#include "src/f16-vbinary/f16-vdivc.h" #include "src/f16-vbinary/f16-vmax.h" #include "src/f16-vbinary/f16-vmaxc.h" #include "src/f16-vbinary/f16-vmin.h" #include "src/f16-vbinary/f16-vminc.h" -#include "src/f16-vbinary/f16-vmul-minmax.h" -#include "src/f16-vbinary/f16-vmulc-minmax.h" +#include "src/f16-vbinary/f16-vmul.h" +#include "src/f16-vbinary/f16-vmulc.h" #include "src/f16-vbinary/f16-vprelu.h" #include "src/f16-vbinary/f16-vpreluc.h" -#include "src/f16-vbinary/f16-vrdivc-minmax.h" +#include "src/f16-vbinary/f16-vrdivc.h" #include "src/f16-vbinary/f16-vrpreluc.h" -#include "src/f16-vbinary/f16-vrsubc-minmax.h" +#include "src/f16-vbinary/f16-vrsubc.h" #include "src/f16-vbinary/f16-vsqrdiff.h" #include "src/f16-vbinary/f16-vsqrdiffc.h" -#include "src/f16-vbinary/f16-vsub-minmax.h" -#include "src/f16-vbinary/f16-vsubc-minmax.h" -#include "src/f32-vbinary/f32-vadd-minmax.h" +#include "src/f16-vbinary/f16-vsub.h" +#include "src/f16-vbinary/f16-vsubc.h" #include "src/f32-vbinary/f32-vadd.h" -#include "src/f32-vbinary/f32-vaddc-minmax.h" #include "src/f32-vbinary/f32-vaddc.h" #include "src/f32-vbinary/f32-vcopysign.h" #include "src/f32-vbinary/f32-vcopysignc.h" -#include "src/f32-vbinary/f32-vdiv-minmax.h" #include "src/f32-vbinary/f32-vdiv.h" -#include "src/f32-vbinary/f32-vdivc-minmax.h" #include "src/f32-vbinary/f32-vdivc.h" #include "src/f32-vbinary/f32-vmax.h" #include "src/f32-vbinary/f32-vmaxc.h" #include "src/f32-vbinary/f32-vmin.h" #include "src/f32-vbinary/f32-vminc.h" -#include 
"src/f32-vbinary/f32-vmul-minmax.h" #include "src/f32-vbinary/f32-vmul.h" -#include "src/f32-vbinary/f32-vmulc-minmax.h" #include "src/f32-vbinary/f32-vmulc.h" #include "src/f32-vbinary/f32-vprelu.h" #include "src/f32-vbinary/f32-vpreluc.h" #include "src/f32-vbinary/f32-vrcopysignc.h" -#include "src/f32-vbinary/f32-vrdivc-minmax.h" #include "src/f32-vbinary/f32-vrdivc.h" #include "src/f32-vbinary/f32-vrpreluc.h" -#include "src/f32-vbinary/f32-vrsubc-minmax.h" #include "src/f32-vbinary/f32-vrsubc.h" #include "src/f32-vbinary/f32-vsqrdiff.h" #include "src/f32-vbinary/f32-vsqrdiffc.h" -#include "src/f32-vbinary/f32-vsub-minmax.h" #include "src/f32-vbinary/f32-vsub.h" -#include "src/f32-vbinary/f32-vsubc-minmax.h" #include "src/f32-vbinary/f32-vsubc.h" #include "src/qs8-vadd/qs8-vadd-minmax.h" #include "src/qs8-vaddc/qs8-vaddc-minmax.h" diff --git a/cmake/gen/avx512f_microkernels.cmake b/cmake/gen/avx512f_microkernels.cmake index 6ac4c88c5ed..41598bdc2aa 100644 --- a/cmake/gen/avx512f_microkernels.cmake +++ b/cmake/gen/avx512f_microkernels.cmake @@ -24,22 +24,22 @@ SET(PROD_AVX512F_MICROKERNEL_SRCS src/f32-rminmax/gen/f32-rmax-avx512f-u64-acc4.c src/f32-rminmax/gen/f32-rminmax-avx512f-u64-acc4.c src/f32-rsum/gen/f32-rsum-avx512f-u64-acc4.c - src/f32-vbinary/gen/f32-vadd-minmax-avx512f-u32.c - src/f32-vbinary/gen/f32-vaddc-minmax-avx512f-u32.c - src/f32-vbinary/gen/f32-vdiv-minmax-avx512f-u32.c - src/f32-vbinary/gen/f32-vdivc-minmax-avx512f-u32.c + src/f32-vbinary/gen/f32-vadd-avx512f-u32.c + src/f32-vbinary/gen/f32-vaddc-avx512f-u32.c + src/f32-vbinary/gen/f32-vdiv-avx512f-u32.c + src/f32-vbinary/gen/f32-vdivc-avx512f-u32.c src/f32-vbinary/gen/f32-vmax-avx512f-u32.c src/f32-vbinary/gen/f32-vmaxc-avx512f-u32.c src/f32-vbinary/gen/f32-vmin-avx512f-u32.c src/f32-vbinary/gen/f32-vminc-avx512f-u32.c - src/f32-vbinary/gen/f32-vmul-minmax-avx512f-u32.c - src/f32-vbinary/gen/f32-vmulc-minmax-avx512f-u32.c - src/f32-vbinary/gen/f32-vrdivc-minmax-avx512f-u32.c - 
src/f32-vbinary/gen/f32-vrsubc-minmax-avx512f-u32.c + src/f32-vbinary/gen/f32-vmul-avx512f-u32.c + src/f32-vbinary/gen/f32-vmulc-avx512f-u32.c + src/f32-vbinary/gen/f32-vrdivc-avx512f-u32.c + src/f32-vbinary/gen/f32-vrsubc-avx512f-u32.c src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u32.c src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u32.c - src/f32-vbinary/gen/f32-vsub-minmax-avx512f-u32.c - src/f32-vbinary/gen/f32-vsubc-minmax-avx512f-u32.c + src/f32-vbinary/gen/f32-vsub-avx512f-u32.c + src/f32-vbinary/gen/f32-vsubc-avx512f-u32.c src/f32-vclamp/gen/f32-vclamp-avx512f-u16.c src/f32-vcmul/gen/f32-vcmul-avx512f-u32.c src/f32-vcopysign/gen/f32-vcopysign-avx512f.c @@ -163,28 +163,28 @@ SET(NON_PROD_AVX512F_MICROKERNEL_SRCS src/f32-rsum/gen/f32-rsum-avx512f-u32-acc2.c src/f32-rsum/gen/f32-rsum-avx512f-u48-acc3.c src/f32-rsum/gen/f32-rsum-avx512f-u64-acc2.c - src/f32-vbinary/gen/f32-vadd-minmax-avx512f-u16.c - src/f32-vbinary/gen/f32-vaddc-minmax-avx512f-u16.c - src/f32-vbinary/gen/f32-vdiv-minmax-avx512f-u16.c - src/f32-vbinary/gen/f32-vdivc-minmax-avx512f-u16.c + src/f32-vbinary/gen/f32-vadd-avx512f-u16.c + src/f32-vbinary/gen/f32-vaddc-avx512f-u16.c + src/f32-vbinary/gen/f32-vdiv-avx512f-u16.c + src/f32-vbinary/gen/f32-vdivc-avx512f-u16.c src/f32-vbinary/gen/f32-vmax-avx512f-u16.c src/f32-vbinary/gen/f32-vmaxc-avx512f-u16.c src/f32-vbinary/gen/f32-vmin-avx512f-u16.c src/f32-vbinary/gen/f32-vminc-avx512f-u16.c - src/f32-vbinary/gen/f32-vmul-minmax-avx512f-u16.c - src/f32-vbinary/gen/f32-vmulc-minmax-avx512f-u16.c + src/f32-vbinary/gen/f32-vmul-avx512f-u16.c + src/f32-vbinary/gen/f32-vmulc-avx512f-u16.c src/f32-vbinary/gen/f32-vprelu-avx512f-u16.c src/f32-vbinary/gen/f32-vprelu-avx512f-u32.c src/f32-vbinary/gen/f32-vpreluc-avx512f-u16.c src/f32-vbinary/gen/f32-vpreluc-avx512f-u32.c - src/f32-vbinary/gen/f32-vrdivc-minmax-avx512f-u16.c + src/f32-vbinary/gen/f32-vrdivc-avx512f-u16.c src/f32-vbinary/gen/f32-vrpreluc-avx512f-u16.c src/f32-vbinary/gen/f32-vrpreluc-avx512f-u32.c - 
src/f32-vbinary/gen/f32-vrsubc-minmax-avx512f-u16.c + src/f32-vbinary/gen/f32-vrsubc-avx512f-u16.c src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u16.c src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u16.c - src/f32-vbinary/gen/f32-vsub-minmax-avx512f-u16.c - src/f32-vbinary/gen/f32-vsubc-minmax-avx512f-u16.c + src/f32-vbinary/gen/f32-vsub-avx512f-u16.c + src/f32-vbinary/gen/f32-vsubc-avx512f-u16.c src/f32-vclamp/gen/f32-vclamp-avx512f-u32.c src/f32-vcmul/gen/f32-vcmul-avx512f-u16.c src/f32-vcmul/gen/f32-vcmul-avx512f-u64.c diff --git a/cmake/gen/avx512fp16_microkernels.cmake b/cmake/gen/avx512fp16_microkernels.cmake index e2e4ab5f0ee..addfb1edaf1 100644 --- a/cmake/gen/avx512fp16_microkernels.cmake +++ b/cmake/gen/avx512fp16_microkernels.cmake @@ -16,22 +16,22 @@ SET(PROD_AVX512FP16_MICROKERNEL_SRCS src/f16-igemm/gen/f16-igemm-7x64-minmax-avx512fp16-broadcast.c src/f16-rminmax/gen/f16-rmax-avx512fp16-u128-acc4.c src/f16-rminmax/gen/f16-rminmax-avx512fp16-u128-acc4.c - src/f16-vbinary/gen/f16-vadd-minmax-avx512fp16-u64.c - src/f16-vbinary/gen/f16-vaddc-minmax-avx512fp16-u64.c - src/f16-vbinary/gen/f16-vdiv-minmax-avx512fp16-u64.c - src/f16-vbinary/gen/f16-vdivc-minmax-avx512fp16-u64.c + src/f16-vbinary/gen/f16-vadd-avx512fp16-u64.c + src/f16-vbinary/gen/f16-vaddc-avx512fp16-u64.c + src/f16-vbinary/gen/f16-vdiv-avx512fp16-u64.c + src/f16-vbinary/gen/f16-vdivc-avx512fp16-u64.c src/f16-vbinary/gen/f16-vmax-avx512fp16-u64.c src/f16-vbinary/gen/f16-vmaxc-avx512fp16-u64.c src/f16-vbinary/gen/f16-vmin-avx512fp16-u64.c src/f16-vbinary/gen/f16-vminc-avx512fp16-u64.c - src/f16-vbinary/gen/f16-vmul-minmax-avx512fp16-u64.c - src/f16-vbinary/gen/f16-vmulc-minmax-avx512fp16-u64.c - src/f16-vbinary/gen/f16-vrdivc-minmax-avx512fp16-u64.c - src/f16-vbinary/gen/f16-vrsubc-minmax-avx512fp16-u64.c + src/f16-vbinary/gen/f16-vmul-avx512fp16-u64.c + src/f16-vbinary/gen/f16-vmulc-avx512fp16-u64.c + src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u64.c + src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u64.c 
src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u64.c src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u64.c - src/f16-vbinary/gen/f16-vsub-minmax-avx512fp16-u64.c - src/f16-vbinary/gen/f16-vsubc-minmax-avx512fp16-u64.c) + src/f16-vbinary/gen/f16-vsub-avx512fp16-u64.c + src/f16-vbinary/gen/f16-vsubc-avx512fp16-u64.c) SET(NON_PROD_AVX512FP16_MICROKERNEL_SRCS src/f16-gemm/gen/f16-gemm-1x32-minmax-avx512fp16-broadcast.c @@ -72,28 +72,28 @@ SET(NON_PROD_AVX512FP16_MICROKERNEL_SRCS src/f16-rsum/gen/f16-rsum-avx512fp16-u96-acc3.c src/f16-rsum/gen/f16-rsum-avx512fp16-u128-acc2.c src/f16-rsum/gen/f16-rsum-avx512fp16-u128-acc4.c - src/f16-vbinary/gen/f16-vadd-minmax-avx512fp16-u32.c - src/f16-vbinary/gen/f16-vaddc-minmax-avx512fp16-u32.c - src/f16-vbinary/gen/f16-vdiv-minmax-avx512fp16-u32.c - src/f16-vbinary/gen/f16-vdivc-minmax-avx512fp16-u32.c + src/f16-vbinary/gen/f16-vadd-avx512fp16-u32.c + src/f16-vbinary/gen/f16-vaddc-avx512fp16-u32.c + src/f16-vbinary/gen/f16-vdiv-avx512fp16-u32.c + src/f16-vbinary/gen/f16-vdivc-avx512fp16-u32.c src/f16-vbinary/gen/f16-vmax-avx512fp16-u32.c src/f16-vbinary/gen/f16-vmaxc-avx512fp16-u32.c src/f16-vbinary/gen/f16-vmin-avx512fp16-u32.c src/f16-vbinary/gen/f16-vminc-avx512fp16-u32.c - src/f16-vbinary/gen/f16-vmul-minmax-avx512fp16-u32.c - src/f16-vbinary/gen/f16-vmulc-minmax-avx512fp16-u32.c + src/f16-vbinary/gen/f16-vmul-avx512fp16-u32.c + src/f16-vbinary/gen/f16-vmulc-avx512fp16-u32.c src/f16-vbinary/gen/f16-vprelu-avx512fp16-u32.c src/f16-vbinary/gen/f16-vprelu-avx512fp16-u64.c src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u32.c src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u64.c - src/f16-vbinary/gen/f16-vrdivc-minmax-avx512fp16-u32.c + src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u32.c src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u32.c src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u64.c - src/f16-vbinary/gen/f16-vrsubc-minmax-avx512fp16-u32.c + src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u32.c src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u32.c 
src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u32.c - src/f16-vbinary/gen/f16-vsub-minmax-avx512fp16-u32.c - src/f16-vbinary/gen/f16-vsubc-minmax-avx512fp16-u32.c + src/f16-vbinary/gen/f16-vsub-avx512fp16-u32.c + src/f16-vbinary/gen/f16-vsubc-avx512fp16-u32.c src/f16-vsqrt/gen/f16-vsqrt-avx512fp16-sqrt-u32.c src/f16-vsqrt/gen/f16-vsqrt-avx512fp16-sqrt-u64.c src/f16-vsqrt/gen/f16-vsqrt-avx512fp16-sqrt-u128.c) diff --git a/cmake/gen/avx_microkernels.cmake b/cmake/gen/avx_microkernels.cmake index 4b54801c46f..0ec12965ee1 100644 --- a/cmake/gen/avx_microkernels.cmake +++ b/cmake/gen/avx_microkernels.cmake @@ -32,22 +32,22 @@ SET(PROD_AVX_MICROKERNEL_SRCS src/f32-rminmax/gen/f32-rmax-avx-u32-acc4.c src/f32-rminmax/gen/f32-rminmax-avx-u32-acc4.c src/f32-rsum/gen/f32-rsum-avx-u32-acc4.c - src/f32-vbinary/gen/f32-vadd-minmax-avx-u16.c - src/f32-vbinary/gen/f32-vaddc-minmax-avx-u16.c - src/f32-vbinary/gen/f32-vdiv-minmax-avx-u16.c - src/f32-vbinary/gen/f32-vdivc-minmax-avx-u16.c + src/f32-vbinary/gen/f32-vadd-avx-u16.c + src/f32-vbinary/gen/f32-vaddc-avx-u16.c + src/f32-vbinary/gen/f32-vdiv-avx-u16.c + src/f32-vbinary/gen/f32-vdivc-avx-u16.c src/f32-vbinary/gen/f32-vmax-avx-u16.c src/f32-vbinary/gen/f32-vmaxc-avx-u16.c src/f32-vbinary/gen/f32-vmin-avx-u16.c src/f32-vbinary/gen/f32-vminc-avx-u16.c - src/f32-vbinary/gen/f32-vmul-minmax-avx-u16.c - src/f32-vbinary/gen/f32-vmulc-minmax-avx-u16.c - src/f32-vbinary/gen/f32-vrdivc-minmax-avx-u16.c - src/f32-vbinary/gen/f32-vrsubc-minmax-avx-u16.c + src/f32-vbinary/gen/f32-vmul-avx-u16.c + src/f32-vbinary/gen/f32-vmulc-avx-u16.c + src/f32-vbinary/gen/f32-vrdivc-avx-u16.c + src/f32-vbinary/gen/f32-vrsubc-avx-u16.c src/f32-vbinary/gen/f32-vsqrdiff-avx-u16.c src/f32-vbinary/gen/f32-vsqrdiffc-avx-u16.c - src/f32-vbinary/gen/f32-vsub-minmax-avx-u16.c - src/f32-vbinary/gen/f32-vsubc-minmax-avx-u16.c + src/f32-vbinary/gen/f32-vsub-avx-u16.c + src/f32-vbinary/gen/f32-vsubc-avx-u16.c src/f32-vclamp/gen/f32-vclamp-avx-u16.c 
src/f32-vcopysign/gen/f32-vcopysign-avx.c src/f32-vcopysign/gen/f32-vcopysignc-avx.c @@ -215,28 +215,28 @@ SET(NON_PROD_AVX_MICROKERNEL_SRCS src/f32-rsum/gen/f32-rsum-avx-u16-acc2.c src/f32-rsum/gen/f32-rsum-avx-u24-acc3.c src/f32-rsum/gen/f32-rsum-avx-u32-acc2.c - src/f32-vbinary/gen/f32-vadd-minmax-avx-u8.c - src/f32-vbinary/gen/f32-vaddc-minmax-avx-u8.c - src/f32-vbinary/gen/f32-vdiv-minmax-avx-u8.c - src/f32-vbinary/gen/f32-vdivc-minmax-avx-u8.c + src/f32-vbinary/gen/f32-vadd-avx-u8.c + src/f32-vbinary/gen/f32-vaddc-avx-u8.c + src/f32-vbinary/gen/f32-vdiv-avx-u8.c + src/f32-vbinary/gen/f32-vdivc-avx-u8.c src/f32-vbinary/gen/f32-vmax-avx-u8.c src/f32-vbinary/gen/f32-vmaxc-avx-u8.c src/f32-vbinary/gen/f32-vmin-avx-u8.c src/f32-vbinary/gen/f32-vminc-avx-u8.c - src/f32-vbinary/gen/f32-vmul-minmax-avx-u8.c - src/f32-vbinary/gen/f32-vmulc-minmax-avx-u8.c + src/f32-vbinary/gen/f32-vmul-avx-u8.c + src/f32-vbinary/gen/f32-vmulc-avx-u8.c src/f32-vbinary/gen/f32-vprelu-avx-u8.c src/f32-vbinary/gen/f32-vprelu-avx-u16.c src/f32-vbinary/gen/f32-vpreluc-avx-u8.c src/f32-vbinary/gen/f32-vpreluc-avx-u16.c - src/f32-vbinary/gen/f32-vrdivc-minmax-avx-u8.c + src/f32-vbinary/gen/f32-vrdivc-avx-u8.c src/f32-vbinary/gen/f32-vrpreluc-avx-u8.c src/f32-vbinary/gen/f32-vrpreluc-avx-u16.c - src/f32-vbinary/gen/f32-vrsubc-minmax-avx-u8.c + src/f32-vbinary/gen/f32-vrsubc-avx-u8.c src/f32-vbinary/gen/f32-vsqrdiff-avx-u8.c src/f32-vbinary/gen/f32-vsqrdiffc-avx-u8.c - src/f32-vbinary/gen/f32-vsub-minmax-avx-u8.c - src/f32-vbinary/gen/f32-vsubc-minmax-avx-u8.c + src/f32-vbinary/gen/f32-vsub-avx-u8.c + src/f32-vbinary/gen/f32-vsubc-avx-u8.c src/f32-vclamp/gen/f32-vclamp-avx-u8.c src/f32-velu/gen/f32-velu-avx-rr2-lut4-p4-perm-u8.c src/f32-velu/gen/f32-velu-avx-rr2-lut4-p4-perm-u16.c diff --git a/cmake/gen/f16c_microkernels.cmake b/cmake/gen/f16c_microkernels.cmake index 0eb478d67f1..e478a8abc1d 100644 --- a/cmake/gen/f16c_microkernels.cmake +++ b/cmake/gen/f16c_microkernels.cmake @@ -20,22 +20,22 
@@ SET(PROD_F16C_MICROKERNEL_SRCS src/f16-maxpool/f16-maxpool-9p8x-minmax-f16c-c8.c src/f16-prelu/gen/f16-prelu-f16c-2x16.c src/f16-rminmax/f16-rmax-f16c-u32.c - src/f16-vbinary/gen/f16-vadd-minmax-f16c-u16.c - src/f16-vbinary/gen/f16-vaddc-minmax-f16c-u16.c - src/f16-vbinary/gen/f16-vdiv-minmax-f16c-u8.c - src/f16-vbinary/gen/f16-vdivc-minmax-f16c-u8.c + src/f16-vbinary/gen/f16-vadd-f16c-u16.c + src/f16-vbinary/gen/f16-vaddc-f16c-u16.c + src/f16-vbinary/gen/f16-vdiv-f16c-u8.c + src/f16-vbinary/gen/f16-vdivc-f16c-u8.c src/f16-vbinary/gen/f16-vmax-f16c-u16.c src/f16-vbinary/gen/f16-vmaxc-f16c-u16.c src/f16-vbinary/gen/f16-vmin-f16c-u16.c src/f16-vbinary/gen/f16-vminc-f16c-u16.c - src/f16-vbinary/gen/f16-vmul-minmax-f16c-u16.c - src/f16-vbinary/gen/f16-vmulc-minmax-f16c-u16.c - src/f16-vbinary/gen/f16-vrdivc-minmax-f16c-u8.c - src/f16-vbinary/gen/f16-vrsubc-minmax-f16c-u16.c + src/f16-vbinary/gen/f16-vmul-f16c-u16.c + src/f16-vbinary/gen/f16-vmulc-f16c-u16.c + src/f16-vbinary/gen/f16-vrdivc-f16c-u8.c + src/f16-vbinary/gen/f16-vrsubc-f16c-u16.c src/f16-vbinary/gen/f16-vsqrdiff-f16c-u16.c src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u16.c - src/f16-vbinary/gen/f16-vsub-minmax-f16c-u16.c - src/f16-vbinary/gen/f16-vsubc-minmax-f16c-u16.c + src/f16-vbinary/gen/f16-vsub-f16c-u16.c + src/f16-vbinary/gen/f16-vsubc-f16c-u16.c src/f16-vclamp/gen/f16-vclamp-f16c-u16.c src/f16-vhswish/gen/f16-vhswish-f16c-u16.c src/f16-vlrelu/gen/f16-vlrelu-f16c-u16.c @@ -65,28 +65,28 @@ SET(NON_PROD_F16C_MICROKERNEL_SRCS src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c24.c src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c32.c src/f16-prelu/gen/f16-prelu-f16c-2x8.c - src/f16-vbinary/gen/f16-vadd-minmax-f16c-u8.c - src/f16-vbinary/gen/f16-vaddc-minmax-f16c-u8.c - src/f16-vbinary/gen/f16-vdiv-minmax-f16c-u16.c - src/f16-vbinary/gen/f16-vdivc-minmax-f16c-u16.c + src/f16-vbinary/gen/f16-vadd-f16c-u8.c + src/f16-vbinary/gen/f16-vaddc-f16c-u8.c + src/f16-vbinary/gen/f16-vdiv-f16c-u16.c + 
src/f16-vbinary/gen/f16-vdivc-f16c-u16.c src/f16-vbinary/gen/f16-vmax-f16c-u8.c src/f16-vbinary/gen/f16-vmaxc-f16c-u8.c src/f16-vbinary/gen/f16-vmin-f16c-u8.c src/f16-vbinary/gen/f16-vminc-f16c-u8.c - src/f16-vbinary/gen/f16-vmul-minmax-f16c-u8.c - src/f16-vbinary/gen/f16-vmulc-minmax-f16c-u8.c + src/f16-vbinary/gen/f16-vmul-f16c-u8.c + src/f16-vbinary/gen/f16-vmulc-f16c-u8.c src/f16-vbinary/gen/f16-vprelu-f16c-u8.c src/f16-vbinary/gen/f16-vprelu-f16c-u16.c src/f16-vbinary/gen/f16-vpreluc-f16c-u8.c src/f16-vbinary/gen/f16-vpreluc-f16c-u16.c - src/f16-vbinary/gen/f16-vrdivc-minmax-f16c-u16.c + src/f16-vbinary/gen/f16-vrdivc-f16c-u16.c src/f16-vbinary/gen/f16-vrpreluc-f16c-u8.c src/f16-vbinary/gen/f16-vrpreluc-f16c-u16.c - src/f16-vbinary/gen/f16-vrsubc-minmax-f16c-u8.c + src/f16-vbinary/gen/f16-vrsubc-f16c-u8.c src/f16-vbinary/gen/f16-vsqrdiff-f16c-u8.c src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u8.c - src/f16-vbinary/gen/f16-vsub-minmax-f16c-u8.c - src/f16-vbinary/gen/f16-vsubc-minmax-f16c-u8.c + src/f16-vbinary/gen/f16-vsub-f16c-u8.c + src/f16-vbinary/gen/f16-vsubc-f16c-u8.c src/f16-vclamp/gen/f16-vclamp-f16c-u8.c src/f16-vhswish/gen/f16-vhswish-f16c-u8.c src/f16-vlrelu/gen/f16-vlrelu-f16c-u8.c diff --git a/cmake/gen/fp16arith_microkernels.cmake b/cmake/gen/fp16arith_microkernels.cmake index 1f44b9ac1aa..b139f5f75a2 100644 --- a/cmake/gen/fp16arith_microkernels.cmake +++ b/cmake/gen/fp16arith_microkernels.cmake @@ -10,21 +10,21 @@ SET(PROD_FP16ARITH_MICROKERNEL_SRCS - src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-u2.c - src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-u2.c - src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-u2.c) + src/f16-vbinary/gen/f16-vdiv-fp16arith-u2.c + src/f16-vbinary/gen/f16-vdivc-fp16arith-u2.c + src/f16-vbinary/gen/f16-vrdivc-fp16arith-u2.c) SET(NON_PROD_FP16ARITH_MICROKERNEL_SRCS - src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-u1.c - src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-u2.c - src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-u4.c - 
src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-u1.c - src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-u2.c - src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-u4.c - src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-u1.c - src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-u4.c - src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-u1.c - src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-u4.c + src/f16-vbinary/gen/f16-vadd-fp16arith-u1.c + src/f16-vbinary/gen/f16-vadd-fp16arith-u2.c + src/f16-vbinary/gen/f16-vadd-fp16arith-u4.c + src/f16-vbinary/gen/f16-vaddc-fp16arith-u1.c + src/f16-vbinary/gen/f16-vaddc-fp16arith-u2.c + src/f16-vbinary/gen/f16-vaddc-fp16arith-u4.c + src/f16-vbinary/gen/f16-vdiv-fp16arith-u1.c + src/f16-vbinary/gen/f16-vdiv-fp16arith-u4.c + src/f16-vbinary/gen/f16-vdivc-fp16arith-u1.c + src/f16-vbinary/gen/f16-vdivc-fp16arith-u4.c src/f16-vbinary/gen/f16-vmax-fp16arith-u1.c src/f16-vbinary/gen/f16-vmax-fp16arith-u2.c src/f16-vbinary/gen/f16-vmax-fp16arith-u4.c @@ -37,29 +37,29 @@ SET(NON_PROD_FP16ARITH_MICROKERNEL_SRCS src/f16-vbinary/gen/f16-vminc-fp16arith-u1.c src/f16-vbinary/gen/f16-vminc-fp16arith-u2.c src/f16-vbinary/gen/f16-vminc-fp16arith-u4.c - src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-u1.c - src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-u2.c - src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-u4.c - src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-u1.c - src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-u2.c - src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-u4.c - src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-u1.c - src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-u4.c - src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-u1.c - src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-u2.c - src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-u4.c + src/f16-vbinary/gen/f16-vmul-fp16arith-u1.c + src/f16-vbinary/gen/f16-vmul-fp16arith-u2.c + src/f16-vbinary/gen/f16-vmul-fp16arith-u4.c + src/f16-vbinary/gen/f16-vmulc-fp16arith-u1.c + src/f16-vbinary/gen/f16-vmulc-fp16arith-u2.c + 
src/f16-vbinary/gen/f16-vmulc-fp16arith-u4.c + src/f16-vbinary/gen/f16-vrdivc-fp16arith-u1.c + src/f16-vbinary/gen/f16-vrdivc-fp16arith-u4.c + src/f16-vbinary/gen/f16-vrsubc-fp16arith-u1.c + src/f16-vbinary/gen/f16-vrsubc-fp16arith-u2.c + src/f16-vbinary/gen/f16-vrsubc-fp16arith-u4.c src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u1.c src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u2.c src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u4.c src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u1.c src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u2.c src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u4.c - src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-u1.c - src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-u2.c - src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-u4.c - src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-u1.c - src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-u2.c - src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-u4.c + src/f16-vbinary/gen/f16-vsub-fp16arith-u1.c + src/f16-vbinary/gen/f16-vsub-fp16arith-u2.c + src/f16-vbinary/gen/f16-vsub-fp16arith-u4.c + src/f16-vbinary/gen/f16-vsubc-fp16arith-u1.c + src/f16-vbinary/gen/f16-vsubc-fp16arith-u2.c + src/f16-vbinary/gen/f16-vsubc-fp16arith-u4.c src/f16-vsqrt/gen/f16-vsqrt-fp16arith-sqrt-u1.c src/f16-vsqrt/gen/f16-vsqrt-fp16arith-sqrt-u2.c src/f16-vsqrt/gen/f16-vsqrt-fp16arith-sqrt-u4.c) diff --git a/cmake/gen/hvx_microkernels.cmake b/cmake/gen/hvx_microkernels.cmake index 5906acd912f..4724ce14b7c 100644 --- a/cmake/gen/hvx_microkernels.cmake +++ b/cmake/gen/hvx_microkernels.cmake @@ -66,12 +66,12 @@ SET(NON_PROD_HVX_MICROKERNEL_SRCS src/f32-spmm/gen/f32-spmm-128x1-minmax-hvx-x2.c src/f32-spmm/gen/f32-spmm-128x1-minmax-hvx-x4.c src/f32-spmm/gen/f32-spmm-128x1-minmax-hvx.c - src/f32-vbinary/gen/f32-vadd-minmax-hvx-u32.c - src/f32-vbinary/gen/f32-vadd-minmax-hvx-u64.c - src/f32-vbinary/gen/f32-vadd-minmax-hvx-u128.c - src/f32-vbinary/gen/f32-vaddc-minmax-hvx-u32.c - src/f32-vbinary/gen/f32-vaddc-minmax-hvx-u64.c - 
src/f32-vbinary/gen/f32-vaddc-minmax-hvx-u128.c + src/f32-vbinary/gen/f32-vadd-hvx-u32.c + src/f32-vbinary/gen/f32-vadd-hvx-u64.c + src/f32-vbinary/gen/f32-vadd-hvx-u128.c + src/f32-vbinary/gen/f32-vaddc-hvx-u32.c + src/f32-vbinary/gen/f32-vaddc-hvx-u64.c + src/f32-vbinary/gen/f32-vaddc-hvx-u128.c src/f32-vbinary/gen/f32-vmax-hvx-u32.c src/f32-vbinary/gen/f32-vmax-hvx-u64.c src/f32-vbinary/gen/f32-vmax-hvx-u128.c @@ -84,27 +84,27 @@ SET(NON_PROD_HVX_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vminc-hvx-u32.c src/f32-vbinary/gen/f32-vminc-hvx-u64.c src/f32-vbinary/gen/f32-vminc-hvx-u128.c - src/f32-vbinary/gen/f32-vmul-minmax-hvx-u32.c - src/f32-vbinary/gen/f32-vmul-minmax-hvx-u64.c - src/f32-vbinary/gen/f32-vmul-minmax-hvx-u128.c - src/f32-vbinary/gen/f32-vmulc-minmax-hvx-u32.c - src/f32-vbinary/gen/f32-vmulc-minmax-hvx-u64.c - src/f32-vbinary/gen/f32-vmulc-minmax-hvx-u128.c - src/f32-vbinary/gen/f32-vrsubc-minmax-hvx-u32.c - src/f32-vbinary/gen/f32-vrsubc-minmax-hvx-u64.c - src/f32-vbinary/gen/f32-vrsubc-minmax-hvx-u128.c + src/f32-vbinary/gen/f32-vmul-hvx-u32.c + src/f32-vbinary/gen/f32-vmul-hvx-u64.c + src/f32-vbinary/gen/f32-vmul-hvx-u128.c + src/f32-vbinary/gen/f32-vmulc-hvx-u32.c + src/f32-vbinary/gen/f32-vmulc-hvx-u64.c + src/f32-vbinary/gen/f32-vmulc-hvx-u128.c + src/f32-vbinary/gen/f32-vrsubc-hvx-u32.c + src/f32-vbinary/gen/f32-vrsubc-hvx-u64.c + src/f32-vbinary/gen/f32-vrsubc-hvx-u128.c src/f32-vbinary/gen/f32-vsqrdiff-hvx-u32.c src/f32-vbinary/gen/f32-vsqrdiff-hvx-u64.c src/f32-vbinary/gen/f32-vsqrdiff-hvx-u128.c src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u32.c src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u64.c src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u128.c - src/f32-vbinary/gen/f32-vsub-minmax-hvx-u32.c - src/f32-vbinary/gen/f32-vsub-minmax-hvx-u64.c - src/f32-vbinary/gen/f32-vsub-minmax-hvx-u128.c - src/f32-vbinary/gen/f32-vsubc-minmax-hvx-u32.c - src/f32-vbinary/gen/f32-vsubc-minmax-hvx-u64.c - src/f32-vbinary/gen/f32-vsubc-minmax-hvx-u128.c + 
src/f32-vbinary/gen/f32-vsub-hvx-u32.c + src/f32-vbinary/gen/f32-vsub-hvx-u64.c + src/f32-vbinary/gen/f32-vsub-hvx-u128.c + src/f32-vbinary/gen/f32-vsubc-hvx-u32.c + src/f32-vbinary/gen/f32-vsubc-hvx-u64.c + src/f32-vbinary/gen/f32-vsubc-hvx-u128.c src/f32-vgelu/gen/f32-vgelu-hvx-rational-12-10-div.c src/f32-vunary/gen/f32-vabs-hvx.c src/f32-vunary/gen/f32-vneg-hvx.c diff --git a/cmake/gen/neon_aarch64_microkernels.cmake b/cmake/gen/neon_aarch64_microkernels.cmake index df0e417ac14..0668feddcf5 100644 --- a/cmake/gen/neon_aarch64_microkernels.cmake +++ b/cmake/gen/neon_aarch64_microkernels.cmake @@ -10,9 +10,9 @@ SET(PROD_NEON_AARCH64_MICROKERNEL_SRCS - src/f32-vbinary/gen/f32-vdiv-minmax-aarch64-neon-u8.c - src/f32-vbinary/gen/f32-vdivc-minmax-aarch64-neon-u8.c - src/f32-vbinary/gen/f32-vrdivc-minmax-aarch64-neon-u8.c + src/f32-vbinary/gen/f32-vdiv-aarch64-neon-u8.c + src/f32-vbinary/gen/f32-vdivc-aarch64-neon-u8.c + src/f32-vbinary/gen/f32-vrdivc-aarch64-neon-u8.c src/f32-vsqrt/gen/f32-vsqrt-aarch64-neon-sqrt-u4.c src/x8-lut/gen/x8-lut-aarch64-neon-tbx128x4-u64.c src/x8-packq/x8-packq-aarch64-neon-f32qp8-u2.c @@ -20,9 +20,9 @@ SET(PROD_NEON_AARCH64_MICROKERNEL_SRCS src/x32-transposec/x32-transposec-4x4-aarch64-neon-tbl128.c) SET(NON_PROD_NEON_AARCH64_MICROKERNEL_SRCS - src/f32-vbinary/gen/f32-vdiv-minmax-aarch64-neon-u4.c - src/f32-vbinary/gen/f32-vdivc-minmax-aarch64-neon-u4.c - src/f32-vbinary/gen/f32-vrdivc-minmax-aarch64-neon-u4.c + src/f32-vbinary/gen/f32-vdiv-aarch64-neon-u4.c + src/f32-vbinary/gen/f32-vdivc-aarch64-neon-u4.c + src/f32-vbinary/gen/f32-vrdivc-aarch64-neon-u4.c src/f32-vsqrt/gen/f32-vsqrt-aarch64-neon-sqrt-u8.c src/f32-vsqrt/gen/f32-vsqrt-aarch64-neon-sqrt-u16.c src/x8-lut/gen/x8-lut-aarch64-neon-tbx128x4-u16.c diff --git a/cmake/gen/neon_microkernels.cmake b/cmake/gen/neon_microkernels.cmake index 44e2884daa3..7f0597c722e 100644 --- a/cmake/gen/neon_microkernels.cmake +++ b/cmake/gen/neon_microkernels.cmake @@ -54,19 +54,19 @@ 
SET(PROD_NEON_MICROKERNEL_SRCS src/f32-rminmax/gen/f32-rminmax-neon-u16-acc4.c src/f32-rsum/gen/f32-rsum-neon-u16-acc4.c src/f32-spmm/gen/f32-spmm-32x1-minmax-neon.c - src/f32-vbinary/gen/f32-vadd-minmax-neon-u8.c - src/f32-vbinary/gen/f32-vaddc-minmax-neon-u8.c + src/f32-vbinary/gen/f32-vadd-neon-u8.c + src/f32-vbinary/gen/f32-vaddc-neon-u8.c src/f32-vbinary/gen/f32-vmax-neon-u8.c src/f32-vbinary/gen/f32-vmaxc-neon-u8.c src/f32-vbinary/gen/f32-vmin-neon-u8.c src/f32-vbinary/gen/f32-vminc-neon-u8.c - src/f32-vbinary/gen/f32-vmul-minmax-neon-u8.c - src/f32-vbinary/gen/f32-vmulc-minmax-neon-u8.c - src/f32-vbinary/gen/f32-vrsubc-minmax-neon-u8.c + src/f32-vbinary/gen/f32-vmul-neon-u8.c + src/f32-vbinary/gen/f32-vmulc-neon-u8.c + src/f32-vbinary/gen/f32-vrsubc-neon-u8.c src/f32-vbinary/gen/f32-vsqrdiff-neon-u8.c src/f32-vbinary/gen/f32-vsqrdiffc-neon-u8.c - src/f32-vbinary/gen/f32-vsub-minmax-neon-u8.c - src/f32-vbinary/gen/f32-vsubc-minmax-neon-u8.c + src/f32-vbinary/gen/f32-vsub-neon-u8.c + src/f32-vbinary/gen/f32-vsubc-neon-u8.c src/f32-vclamp/gen/f32-vclamp-neon-u16.c src/f32-vcmul/gen/f32-vcmul-neon-u8.c src/f32-vcopysign/gen/f32-vcopysign-neon.c @@ -416,25 +416,25 @@ SET(NON_PROD_NEON_MICROKERNEL_SRCS src/f32-spmm/gen/f32-spmm-16x1-minmax-neon.c src/f32-spmm/gen/f32-spmm-32x1-minmax-neon-pipelined.c src/f32-spmm/gen/f32-spmm-32x1-minmax-neon-x2.c - src/f32-vbinary/gen/f32-vadd-minmax-neon-u4.c - src/f32-vbinary/gen/f32-vaddc-minmax-neon-u4.c + src/f32-vbinary/gen/f32-vadd-neon-u4.c + src/f32-vbinary/gen/f32-vaddc-neon-u4.c src/f32-vbinary/gen/f32-vmax-neon-u4.c src/f32-vbinary/gen/f32-vmaxc-neon-u4.c src/f32-vbinary/gen/f32-vmin-neon-u4.c src/f32-vbinary/gen/f32-vminc-neon-u4.c - src/f32-vbinary/gen/f32-vmul-minmax-neon-u4.c - src/f32-vbinary/gen/f32-vmulc-minmax-neon-u4.c + src/f32-vbinary/gen/f32-vmul-neon-u4.c + src/f32-vbinary/gen/f32-vmulc-neon-u4.c src/f32-vbinary/gen/f32-vprelu-neon-u4.c src/f32-vbinary/gen/f32-vprelu-neon-u8.c 
src/f32-vbinary/gen/f32-vpreluc-neon-u4.c src/f32-vbinary/gen/f32-vpreluc-neon-u8.c src/f32-vbinary/gen/f32-vrpreluc-neon-u4.c src/f32-vbinary/gen/f32-vrpreluc-neon-u8.c - src/f32-vbinary/gen/f32-vrsubc-minmax-neon-u4.c + src/f32-vbinary/gen/f32-vrsubc-neon-u4.c src/f32-vbinary/gen/f32-vsqrdiff-neon-u4.c src/f32-vbinary/gen/f32-vsqrdiffc-neon-u4.c - src/f32-vbinary/gen/f32-vsub-minmax-neon-u4.c - src/f32-vbinary/gen/f32-vsubc-minmax-neon-u4.c + src/f32-vbinary/gen/f32-vsub-neon-u4.c + src/f32-vbinary/gen/f32-vsubc-neon-u4.c src/f32-vclamp/gen/f32-vclamp-neon-u4.c src/f32-vclamp/gen/f32-vclamp-neon-u8.c src/f32-vcmul/gen/f32-vcmul-neon-u4.c diff --git a/cmake/gen/neonfp16arith_aarch64_microkernels.cmake b/cmake/gen/neonfp16arith_aarch64_microkernels.cmake index e410fddfd4f..b5d22b0665e 100644 --- a/cmake/gen/neonfp16arith_aarch64_microkernels.cmake +++ b/cmake/gen/neonfp16arith_aarch64_microkernels.cmake @@ -10,16 +10,16 @@ SET(PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS - src/f16-vbinary/gen/f16-vdiv-minmax-aarch64-neonfp16arith-u8.c - src/f16-vbinary/gen/f16-vdivc-minmax-aarch64-neonfp16arith-u8.c - src/f16-vbinary/gen/f16-vrdivc-minmax-aarch64-neonfp16arith-u8.c + src/f16-vbinary/gen/f16-vdiv-aarch64-neonfp16arith-u8.c + src/f16-vbinary/gen/f16-vdivc-aarch64-neonfp16arith-u8.c + src/f16-vbinary/gen/f16-vrdivc-aarch64-neonfp16arith-u8.c src/f16-vsqrt/gen/f16-vsqrt-aarch64-neonfp16arith-sqrt-u8.c src/f16-vtanh/gen/f16-vtanh-aarch64-neonfp16arith-expm1minus-rr1-p3h2ts-div-u32.c) SET(NON_PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS - src/f16-vbinary/gen/f16-vdiv-minmax-aarch64-neonfp16arith-u16.c - src/f16-vbinary/gen/f16-vdivc-minmax-aarch64-neonfp16arith-u16.c - src/f16-vbinary/gen/f16-vrdivc-minmax-aarch64-neonfp16arith-u16.c + src/f16-vbinary/gen/f16-vdiv-aarch64-neonfp16arith-u16.c + src/f16-vbinary/gen/f16-vdivc-aarch64-neonfp16arith-u16.c + src/f16-vbinary/gen/f16-vrdivc-aarch64-neonfp16arith-u16.c 
src/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-u8.c src/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-u16.c src/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-u24.c diff --git a/cmake/gen/neonfp16arith_microkernels.cmake b/cmake/gen/neonfp16arith_microkernels.cmake index c51f105c454..d9d1b3bd9e9 100644 --- a/cmake/gen/neonfp16arith_microkernels.cmake +++ b/cmake/gen/neonfp16arith_microkernels.cmake @@ -47,19 +47,19 @@ SET(PROD_NEONFP16ARITH_MICROKERNEL_SRCS src/f16-rminmax/gen/f16-rmax-neonfp16arith-u32-acc4.c src/f16-rminmax/gen/f16-rminmax-neonfp16arith-u32-acc4.c src/f16-spmm/gen/f16-spmm-32x1-minmax-neonfp16arith-pipelined.c - src/f16-vbinary/gen/f16-vadd-minmax-neonfp16arith-u16.c - src/f16-vbinary/gen/f16-vaddc-minmax-neonfp16arith-u16.c + src/f16-vbinary/gen/f16-vadd-neonfp16arith-u16.c + src/f16-vbinary/gen/f16-vaddc-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vmax-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vmaxc-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vmin-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vminc-neonfp16arith-u16.c - src/f16-vbinary/gen/f16-vmul-minmax-neonfp16arith-u16.c - src/f16-vbinary/gen/f16-vmulc-minmax-neonfp16arith-u16.c - src/f16-vbinary/gen/f16-vrsubc-minmax-neonfp16arith-u16.c + src/f16-vbinary/gen/f16-vmul-neonfp16arith-u16.c + src/f16-vbinary/gen/f16-vmulc-neonfp16arith-u16.c + src/f16-vbinary/gen/f16-vrsubc-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u16.c - src/f16-vbinary/gen/f16-vsub-minmax-neonfp16arith-u16.c - src/f16-vbinary/gen/f16-vsubc-minmax-neonfp16arith-u16.c + src/f16-vbinary/gen/f16-vsub-neonfp16arith-u16.c + src/f16-vbinary/gen/f16-vsubc-neonfp16arith-u16.c src/f16-vclamp/gen/f16-vclamp-neonfp16arith-u16.c src/f16-vcmul/gen/f16-vcmul-neonfp16arith-u16.c src/f16-velu/gen/f16-velu-neonfp16arith-rr1-p3-u16.c @@ -262,25 +262,25 @@ SET(NON_PROD_NEONFP16ARITH_MICROKERNEL_SRCS 
src/f16-spmm/gen/f16-spmm-24x1-minmax-neonfp16arith.c src/f16-spmm/gen/f16-spmm-32x1-minmax-neonfp16arith-x2.c src/f16-spmm/gen/f16-spmm-32x1-minmax-neonfp16arith.c - src/f16-vbinary/gen/f16-vadd-minmax-neonfp16arith-u8.c - src/f16-vbinary/gen/f16-vaddc-minmax-neonfp16arith-u8.c + src/f16-vbinary/gen/f16-vadd-neonfp16arith-u8.c + src/f16-vbinary/gen/f16-vaddc-neonfp16arith-u8.c src/f16-vbinary/gen/f16-vmax-neonfp16arith-u8.c src/f16-vbinary/gen/f16-vmaxc-neonfp16arith-u8.c src/f16-vbinary/gen/f16-vmin-neonfp16arith-u8.c src/f16-vbinary/gen/f16-vminc-neonfp16arith-u8.c - src/f16-vbinary/gen/f16-vmul-minmax-neonfp16arith-u8.c - src/f16-vbinary/gen/f16-vmulc-minmax-neonfp16arith-u8.c + src/f16-vbinary/gen/f16-vmul-neonfp16arith-u8.c + src/f16-vbinary/gen/f16-vmulc-neonfp16arith-u8.c src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u8.c src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u8.c src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u8.c src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u16.c - src/f16-vbinary/gen/f16-vrsubc-minmax-neonfp16arith-u8.c + src/f16-vbinary/gen/f16-vrsubc-neonfp16arith-u8.c src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u8.c src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u8.c - src/f16-vbinary/gen/f16-vsub-minmax-neonfp16arith-u8.c - src/f16-vbinary/gen/f16-vsubc-minmax-neonfp16arith-u8.c + src/f16-vbinary/gen/f16-vsub-neonfp16arith-u8.c + src/f16-vbinary/gen/f16-vsubc-neonfp16arith-u8.c src/f16-vclamp/gen/f16-vclamp-neonfp16arith-u8.c src/f16-vcmul/gen/f16-vcmul-neonfp16arith-u8.c src/f16-vcmul/gen/f16-vcmul-neonfp16arith-u32.c diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake index cd41d9717bb..e1dc3f4ad4e 100644 --- a/cmake/gen/rvv_microkernels.cmake +++ b/cmake/gen/rvv_microkernels.cmake @@ -30,22 +30,22 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-rvv-u4v.c 
src/f32-rminmax/gen/f32-rmax-rvv-u8v.c src/f32-rminmax/gen/f32-rminmax-rvv-u8v.c - src/f32-vbinary/gen/f32-vadd-minmax-rvv-u8v.c - src/f32-vbinary/gen/f32-vaddc-minmax-rvv-u8v.c - src/f32-vbinary/gen/f32-vdiv-minmax-rvv-u8v.c - src/f32-vbinary/gen/f32-vdivc-minmax-rvv-u8v.c + src/f32-vbinary/gen/f32-vadd-rvv-u8v.c + src/f32-vbinary/gen/f32-vaddc-rvv-u8v.c + src/f32-vbinary/gen/f32-vdiv-rvv-u8v.c + src/f32-vbinary/gen/f32-vdivc-rvv-u8v.c src/f32-vbinary/gen/f32-vmax-rvv-u8v.c src/f32-vbinary/gen/f32-vmaxc-rvv-u8v.c src/f32-vbinary/gen/f32-vmin-rvv-u8v.c src/f32-vbinary/gen/f32-vminc-rvv-u8v.c - src/f32-vbinary/gen/f32-vmul-minmax-rvv-u8v.c - src/f32-vbinary/gen/f32-vmulc-minmax-rvv-u8v.c - src/f32-vbinary/gen/f32-vrdivc-minmax-rvv-u8v.c - src/f32-vbinary/gen/f32-vrsubc-minmax-rvv-u8v.c + src/f32-vbinary/gen/f32-vmul-rvv-u8v.c + src/f32-vbinary/gen/f32-vmulc-rvv-u8v.c + src/f32-vbinary/gen/f32-vrdivc-rvv-u8v.c + src/f32-vbinary/gen/f32-vrsubc-rvv-u8v.c src/f32-vbinary/gen/f32-vsqrdiff-rvv-u8v.c src/f32-vbinary/gen/f32-vsqrdiffc-rvv-u8v.c - src/f32-vbinary/gen/f32-vsub-minmax-rvv-u8v.c - src/f32-vbinary/gen/f32-vsubc-minmax-rvv-u8v.c + src/f32-vbinary/gen/f32-vsub-rvv-u8v.c + src/f32-vbinary/gen/f32-vsubc-rvv-u8v.c src/f32-vcmul/gen/f32-vcmul-rvv-u2v.c src/f32-vlrelu/gen/f32-vlrelu-rvv-u4v.c src/f32-vrelu/gen/f32-vrelu-rvv-u4v.c @@ -100,24 +100,22 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/f32-rminmax/gen/f32-rminmax-rvv-u2v.c src/f32-rminmax/gen/f32-rminmax-rvv-u4v.c src/f32-rsum/f32-rsum-rvv-u1v.c - src/f32-vbinary/gen/f32-vadd-minmax-rvv-u4v.c - src/f32-vbinary/gen/f32-vaddc-minmax-rvv-u4v.c - src/f32-vbinary/gen/f32-vdiv-minmax-rvv-u4v.c - src/f32-vbinary/gen/f32-vdivc-minmax-rvv-u4v.c + src/f32-vbinary/gen/f32-vadd-rvv-u4v.c + src/f32-vbinary/gen/f32-vaddc-rvv-u4v.c + src/f32-vbinary/gen/f32-vdiv-rvv-u4v.c + src/f32-vbinary/gen/f32-vdivc-rvv-u4v.c src/f32-vbinary/gen/f32-vmax-rvv-u4v.c src/f32-vbinary/gen/f32-vmaxc-rvv-u4v.c src/f32-vbinary/gen/f32-vmin-rvv-u4v.c 
src/f32-vbinary/gen/f32-vminc-rvv-u4v.c - src/f32-vbinary/gen/f32-vmul-minmax-rvv-u4v.c - src/f32-vbinary/gen/f32-vmulc-minmax-rvv-u4v.c + src/f32-vbinary/gen/f32-vmul-rvv-u4v.c src/f32-vbinary/gen/f32-vmulc-rvv-u4v.c - src/f32-vbinary/gen/f32-vmulc-rvv-u8v.c - src/f32-vbinary/gen/f32-vrdivc-minmax-rvv-u4v.c - src/f32-vbinary/gen/f32-vrsubc-minmax-rvv-u4v.c + src/f32-vbinary/gen/f32-vrdivc-rvv-u4v.c + src/f32-vbinary/gen/f32-vrsubc-rvv-u4v.c src/f32-vbinary/gen/f32-vsqrdiff-rvv-u4v.c src/f32-vbinary/gen/f32-vsqrdiffc-rvv-u4v.c - src/f32-vbinary/gen/f32-vsub-minmax-rvv-u4v.c - src/f32-vbinary/gen/f32-vsubc-minmax-rvv-u4v.c + src/f32-vbinary/gen/f32-vsub-rvv-u4v.c + src/f32-vbinary/gen/f32-vsubc-rvv-u4v.c src/f32-vclamp/gen/f32-vclamp-rvv-u1v.c src/f32-vclamp/gen/f32-vclamp-rvv-u2v.c src/f32-vclamp/gen/f32-vclamp-rvv-u4v.c diff --git a/cmake/gen/scalar_microkernels.cmake b/cmake/gen/scalar_microkernels.cmake index 3c3fe6f9765..18f41fa2e99 100644 --- a/cmake/gen/scalar_microkernels.cmake +++ b/cmake/gen/scalar_microkernels.cmake @@ -89,22 +89,22 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS src/f32-spmm/gen/f32-spmm-8x1-minmax-scalar.c src/f32-spmm/gen/f32-spmm-8x2-minmax-scalar.c src/f32-spmm/gen/f32-spmm-8x4-minmax-scalar.c - src/f32-vbinary/gen/f32-vadd-minmax-scalar-u8.c - src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u8.c - src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u2.c - src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u2.c + src/f32-vbinary/gen/f32-vadd-scalar-u8.c + src/f32-vbinary/gen/f32-vaddc-scalar-u8.c + src/f32-vbinary/gen/f32-vdiv-scalar-u2.c + src/f32-vbinary/gen/f32-vdivc-scalar-u2.c src/f32-vbinary/gen/f32-vmax-scalar-u8.c src/f32-vbinary/gen/f32-vmaxc-scalar-u8.c src/f32-vbinary/gen/f32-vmin-scalar-u8.c src/f32-vbinary/gen/f32-vminc-scalar-u8.c - src/f32-vbinary/gen/f32-vmul-minmax-scalar-u8.c - src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u8.c - src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u2.c - src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u8.c + 
src/f32-vbinary/gen/f32-vmul-scalar-u8.c + src/f32-vbinary/gen/f32-vmulc-scalar-u8.c + src/f32-vbinary/gen/f32-vrdivc-scalar-u2.c + src/f32-vbinary/gen/f32-vrsubc-scalar-u8.c src/f32-vbinary/gen/f32-vsqrdiff-scalar-u8.c src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u8.c - src/f32-vbinary/gen/f32-vsub-minmax-scalar-u8.c - src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u8.c + src/f32-vbinary/gen/f32-vsub-scalar-u8.c + src/f32-vbinary/gen/f32-vsubc-scalar-u8.c src/f32-vclamp/gen/f32-vclamp-scalar-u4.c src/f32-vcmul/gen/f32-vcmul-scalar-u4.c src/f32-vcopysign/gen/f32-vcopysign-scalar.c @@ -455,32 +455,16 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/f32-spmm/gen/f32-spmm-4x1-minmax-scalar-pipelined.c src/f32-spmm/gen/f32-spmm-4x1-minmax-scalar.c src/f32-spmm/gen/f32-spmm-8x1-minmax-scalar-pipelined.c - src/f32-vbinary/gen/f32-vadd-minmax-scalar-u1.c - src/f32-vbinary/gen/f32-vadd-minmax-scalar-u2.c - src/f32-vbinary/gen/f32-vadd-minmax-scalar-u4.c src/f32-vbinary/gen/f32-vadd-scalar-u1.c src/f32-vbinary/gen/f32-vadd-scalar-u2.c src/f32-vbinary/gen/f32-vadd-scalar-u4.c - src/f32-vbinary/gen/f32-vadd-scalar-u8.c - src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u1.c - src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u2.c - src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u4.c src/f32-vbinary/gen/f32-vaddc-scalar-u1.c src/f32-vbinary/gen/f32-vaddc-scalar-u2.c src/f32-vbinary/gen/f32-vaddc-scalar-u4.c - src/f32-vbinary/gen/f32-vaddc-scalar-u8.c - src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u1.c - src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u4.c - src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u8.c src/f32-vbinary/gen/f32-vdiv-scalar-u1.c - src/f32-vbinary/gen/f32-vdiv-scalar-u2.c src/f32-vbinary/gen/f32-vdiv-scalar-u4.c src/f32-vbinary/gen/f32-vdiv-scalar-u8.c - src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u1.c - src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u4.c - src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u8.c src/f32-vbinary/gen/f32-vdivc-scalar-u1.c - src/f32-vbinary/gen/f32-vdivc-scalar-u2.c 
src/f32-vbinary/gen/f32-vdivc-scalar-u4.c src/f32-vbinary/gen/f32-vdivc-scalar-u8.c src/f32-vbinary/gen/f32-vmax-scalar-u1.c @@ -495,20 +479,12 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vminc-scalar-u1.c src/f32-vbinary/gen/f32-vminc-scalar-u2.c src/f32-vbinary/gen/f32-vminc-scalar-u4.c - src/f32-vbinary/gen/f32-vmul-minmax-scalar-u1.c - src/f32-vbinary/gen/f32-vmul-minmax-scalar-u2.c - src/f32-vbinary/gen/f32-vmul-minmax-scalar-u4.c src/f32-vbinary/gen/f32-vmul-scalar-u1.c src/f32-vbinary/gen/f32-vmul-scalar-u2.c src/f32-vbinary/gen/f32-vmul-scalar-u4.c - src/f32-vbinary/gen/f32-vmul-scalar-u8.c - src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u1.c - src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u2.c - src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u4.c src/f32-vbinary/gen/f32-vmulc-scalar-u1.c src/f32-vbinary/gen/f32-vmulc-scalar-u2.c src/f32-vbinary/gen/f32-vmulc-scalar-u4.c - src/f32-vbinary/gen/f32-vmulc-scalar-u8.c src/f32-vbinary/gen/f32-vprelu-scalar-u1.c src/f32-vbinary/gen/f32-vprelu-scalar-u2.c src/f32-vbinary/gen/f32-vprelu-scalar-u4.c @@ -517,44 +493,28 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vpreluc-scalar-u2.c src/f32-vbinary/gen/f32-vpreluc-scalar-u4.c src/f32-vbinary/gen/f32-vpreluc-scalar-u8.c - src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u1.c - src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u4.c - src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u8.c src/f32-vbinary/gen/f32-vrdivc-scalar-u1.c - src/f32-vbinary/gen/f32-vrdivc-scalar-u2.c src/f32-vbinary/gen/f32-vrdivc-scalar-u4.c src/f32-vbinary/gen/f32-vrdivc-scalar-u8.c src/f32-vbinary/gen/f32-vrpreluc-scalar-u1.c src/f32-vbinary/gen/f32-vrpreluc-scalar-u2.c src/f32-vbinary/gen/f32-vrpreluc-scalar-u4.c src/f32-vbinary/gen/f32-vrpreluc-scalar-u8.c - src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u1.c - src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u2.c - src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u4.c src/f32-vbinary/gen/f32-vrsubc-scalar-u1.c 
src/f32-vbinary/gen/f32-vrsubc-scalar-u2.c src/f32-vbinary/gen/f32-vrsubc-scalar-u4.c - src/f32-vbinary/gen/f32-vrsubc-scalar-u8.c src/f32-vbinary/gen/f32-vsqrdiff-scalar-u1.c src/f32-vbinary/gen/f32-vsqrdiff-scalar-u2.c src/f32-vbinary/gen/f32-vsqrdiff-scalar-u4.c src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u1.c src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u2.c src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u4.c - src/f32-vbinary/gen/f32-vsub-minmax-scalar-u1.c - src/f32-vbinary/gen/f32-vsub-minmax-scalar-u2.c - src/f32-vbinary/gen/f32-vsub-minmax-scalar-u4.c src/f32-vbinary/gen/f32-vsub-scalar-u1.c src/f32-vbinary/gen/f32-vsub-scalar-u2.c src/f32-vbinary/gen/f32-vsub-scalar-u4.c - src/f32-vbinary/gen/f32-vsub-scalar-u8.c - src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u1.c - src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u2.c - src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u4.c src/f32-vbinary/gen/f32-vsubc-scalar-u1.c src/f32-vbinary/gen/f32-vsubc-scalar-u2.c src/f32-vbinary/gen/f32-vsubc-scalar-u4.c - src/f32-vbinary/gen/f32-vsubc-scalar-u8.c src/f32-vclamp/gen/f32-vclamp-scalar-u1.c src/f32-vclamp/gen/f32-vclamp-scalar-u2.c src/f32-vcmul/gen/f32-vcmul-scalar-u1.c diff --git a/cmake/gen/sse_microkernels.cmake b/cmake/gen/sse_microkernels.cmake index 1943065f7bc..befa0bb3246 100644 --- a/cmake/gen/sse_microkernels.cmake +++ b/cmake/gen/sse_microkernels.cmake @@ -41,22 +41,22 @@ SET(PROD_SSE_MICROKERNEL_SRCS src/f32-rminmax/gen/f32-rminmax-sse-u16-acc4.c src/f32-rsum/gen/f32-rsum-sse-u16-acc4.c src/f32-spmm/gen/f32-spmm-32x1-minmax-sse.c - src/f32-vbinary/gen/f32-vadd-minmax-sse-u8.c - src/f32-vbinary/gen/f32-vaddc-minmax-sse-u8.c - src/f32-vbinary/gen/f32-vdiv-minmax-sse-u8.c - src/f32-vbinary/gen/f32-vdivc-minmax-sse-u8.c + src/f32-vbinary/gen/f32-vadd-sse-u8.c + src/f32-vbinary/gen/f32-vaddc-sse-u8.c + src/f32-vbinary/gen/f32-vdiv-sse-u8.c + src/f32-vbinary/gen/f32-vdivc-sse-u8.c src/f32-vbinary/gen/f32-vmax-sse-u8.c src/f32-vbinary/gen/f32-vmaxc-sse-u8.c 
src/f32-vbinary/gen/f32-vmin-sse-u8.c src/f32-vbinary/gen/f32-vminc-sse-u8.c - src/f32-vbinary/gen/f32-vmul-minmax-sse-u8.c - src/f32-vbinary/gen/f32-vmulc-minmax-sse-u8.c - src/f32-vbinary/gen/f32-vrdivc-minmax-sse-u8.c - src/f32-vbinary/gen/f32-vrsubc-minmax-sse-u8.c + src/f32-vbinary/gen/f32-vmul-sse-u8.c + src/f32-vbinary/gen/f32-vmulc-sse-u8.c + src/f32-vbinary/gen/f32-vrdivc-sse-u8.c + src/f32-vbinary/gen/f32-vrsubc-sse-u8.c src/f32-vbinary/gen/f32-vsqrdiff-sse-u8.c src/f32-vbinary/gen/f32-vsqrdiffc-sse-u8.c - src/f32-vbinary/gen/f32-vsub-minmax-sse-u8.c - src/f32-vbinary/gen/f32-vsubc-minmax-sse-u8.c + src/f32-vbinary/gen/f32-vsub-sse-u8.c + src/f32-vbinary/gen/f32-vsubc-sse-u8.c src/f32-vclamp/gen/f32-vclamp-sse-u8.c src/f32-vcmul/gen/f32-vcmul-sse-u8.c src/f32-vhswish/gen/f32-vhswish-sse-u8.c @@ -204,22 +204,22 @@ SET(NON_PROD_SSE_MICROKERNEL_SRCS src/f32-spmm/gen/f32-spmm-4x1-minmax-sse.c src/f32-spmm/gen/f32-spmm-8x1-minmax-sse.c src/f32-spmm/gen/f32-spmm-16x1-minmax-sse.c - src/f32-vbinary/gen/f32-vadd-minmax-sse-u4.c - src/f32-vbinary/gen/f32-vaddc-minmax-sse-u4.c - src/f32-vbinary/gen/f32-vdiv-minmax-sse-u4.c - src/f32-vbinary/gen/f32-vdivc-minmax-sse-u4.c + src/f32-vbinary/gen/f32-vadd-sse-u4.c + src/f32-vbinary/gen/f32-vaddc-sse-u4.c + src/f32-vbinary/gen/f32-vdiv-sse-u4.c + src/f32-vbinary/gen/f32-vdivc-sse-u4.c src/f32-vbinary/gen/f32-vmax-sse-u4.c src/f32-vbinary/gen/f32-vmaxc-sse-u4.c src/f32-vbinary/gen/f32-vmin-sse-u4.c src/f32-vbinary/gen/f32-vminc-sse-u4.c - src/f32-vbinary/gen/f32-vmul-minmax-sse-u4.c - src/f32-vbinary/gen/f32-vmulc-minmax-sse-u4.c - src/f32-vbinary/gen/f32-vrdivc-minmax-sse-u4.c - src/f32-vbinary/gen/f32-vrsubc-minmax-sse-u4.c + src/f32-vbinary/gen/f32-vmul-sse-u4.c + src/f32-vbinary/gen/f32-vmulc-sse-u4.c + src/f32-vbinary/gen/f32-vrdivc-sse-u4.c + src/f32-vbinary/gen/f32-vrsubc-sse-u4.c src/f32-vbinary/gen/f32-vsqrdiff-sse-u4.c src/f32-vbinary/gen/f32-vsqrdiffc-sse-u4.c - src/f32-vbinary/gen/f32-vsub-minmax-sse-u4.c - 
src/f32-vbinary/gen/f32-vsubc-minmax-sse-u4.c + src/f32-vbinary/gen/f32-vsub-sse-u4.c + src/f32-vbinary/gen/f32-vsubc-sse-u4.c src/f32-vclamp/gen/f32-vclamp-sse-u4.c src/f32-vcmul/gen/f32-vcmul-sse-u4.c src/f32-vcmul/gen/f32-vcmul-sse-u12.c diff --git a/cmake/gen/wasm_microkernels.cmake b/cmake/gen/wasm_microkernels.cmake index 41ebe24d143..de89cd83440 100644 --- a/cmake/gen/wasm_microkernels.cmake +++ b/cmake/gen/wasm_microkernels.cmake @@ -40,20 +40,20 @@ SET(PROD_WASM_MICROKERNEL_SRCS src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u4.c src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u4.c src/f32-rminmax/gen/f32-rminmax-wasm-u4-acc4.c - src/f32-vbinary/gen/f32-vadd-minmax-wasm-u8.c - src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u8.c - src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u8.c - src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u8.c + src/f32-vbinary/gen/f32-vadd-wasm-u8.c + src/f32-vbinary/gen/f32-vaddc-wasm-u8.c + src/f32-vbinary/gen/f32-vdiv-wasm-u8.c + src/f32-vbinary/gen/f32-vdivc-wasm-u8.c src/f32-vbinary/gen/f32-vmax-wasm-u8.c src/f32-vbinary/gen/f32-vmaxc-wasm-u8.c src/f32-vbinary/gen/f32-vmin-wasm-u8.c src/f32-vbinary/gen/f32-vminc-wasm-u8.c - src/f32-vbinary/gen/f32-vmul-minmax-wasm-u8.c - src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u8.c - src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u8.c - src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u8.c - src/f32-vbinary/gen/f32-vsub-minmax-wasm-u8.c - src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u8.c + src/f32-vbinary/gen/f32-vmul-wasm-u8.c + src/f32-vbinary/gen/f32-vmulc-wasm-u8.c + src/f32-vbinary/gen/f32-vrdivc-wasm-u8.c + src/f32-vbinary/gen/f32-vrsubc-wasm-u8.c + src/f32-vbinary/gen/f32-vsub-wasm-u8.c + src/f32-vbinary/gen/f32-vsubc-wasm-u8.c src/f32-vclamp/gen/f32-vclamp-wasm-u4.c src/f32-velu/gen/f32-velu-wasm-rr2-p6-u6.c src/f32-vhswish/gen/f32-vhswish-wasm-u4.c @@ -139,18 +139,18 @@ SET(NON_PROD_WASM_MICROKERNEL_SRCS src/f32-rminmax/gen/f32-rminmax-wasm-u2-acc2.c src/f32-rminmax/gen/f32-rminmax-wasm-u3-acc3.c 
src/f32-rminmax/gen/f32-rminmax-wasm-u4-acc2.c - src/f32-vbinary/gen/f32-vadd-minmax-wasm-u1.c - src/f32-vbinary/gen/f32-vadd-minmax-wasm-u2.c - src/f32-vbinary/gen/f32-vadd-minmax-wasm-u4.c - src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u1.c - src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u2.c - src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u4.c - src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u1.c - src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u2.c - src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u4.c - src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u1.c - src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u2.c - src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u4.c + src/f32-vbinary/gen/f32-vadd-wasm-u1.c + src/f32-vbinary/gen/f32-vadd-wasm-u2.c + src/f32-vbinary/gen/f32-vadd-wasm-u4.c + src/f32-vbinary/gen/f32-vaddc-wasm-u1.c + src/f32-vbinary/gen/f32-vaddc-wasm-u2.c + src/f32-vbinary/gen/f32-vaddc-wasm-u4.c + src/f32-vbinary/gen/f32-vdiv-wasm-u1.c + src/f32-vbinary/gen/f32-vdiv-wasm-u2.c + src/f32-vbinary/gen/f32-vdiv-wasm-u4.c + src/f32-vbinary/gen/f32-vdivc-wasm-u1.c + src/f32-vbinary/gen/f32-vdivc-wasm-u2.c + src/f32-vbinary/gen/f32-vdivc-wasm-u4.c src/f32-vbinary/gen/f32-vmax-wasm-u1.c src/f32-vbinary/gen/f32-vmax-wasm-u2.c src/f32-vbinary/gen/f32-vmax-wasm-u4.c @@ -163,12 +163,12 @@ SET(NON_PROD_WASM_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vminc-wasm-u1.c src/f32-vbinary/gen/f32-vminc-wasm-u2.c src/f32-vbinary/gen/f32-vminc-wasm-u4.c - src/f32-vbinary/gen/f32-vmul-minmax-wasm-u1.c - src/f32-vbinary/gen/f32-vmul-minmax-wasm-u2.c - src/f32-vbinary/gen/f32-vmul-minmax-wasm-u4.c - src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u1.c - src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u2.c - src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u4.c + src/f32-vbinary/gen/f32-vmul-wasm-u1.c + src/f32-vbinary/gen/f32-vmul-wasm-u2.c + src/f32-vbinary/gen/f32-vmul-wasm-u4.c + src/f32-vbinary/gen/f32-vmulc-wasm-u1.c + src/f32-vbinary/gen/f32-vmulc-wasm-u2.c + src/f32-vbinary/gen/f32-vmulc-wasm-u4.c 
src/f32-vbinary/gen/f32-vprelu-wasm-u1.c src/f32-vbinary/gen/f32-vprelu-wasm-u2.c src/f32-vbinary/gen/f32-vprelu-wasm-u4.c @@ -177,22 +177,22 @@ SET(NON_PROD_WASM_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vpreluc-wasm-u2.c src/f32-vbinary/gen/f32-vpreluc-wasm-u4.c src/f32-vbinary/gen/f32-vpreluc-wasm-u8.c - src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u1.c - src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u2.c - src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u4.c + src/f32-vbinary/gen/f32-vrdivc-wasm-u1.c + src/f32-vbinary/gen/f32-vrdivc-wasm-u2.c + src/f32-vbinary/gen/f32-vrdivc-wasm-u4.c src/f32-vbinary/gen/f32-vrpreluc-wasm-u1.c src/f32-vbinary/gen/f32-vrpreluc-wasm-u2.c src/f32-vbinary/gen/f32-vrpreluc-wasm-u4.c src/f32-vbinary/gen/f32-vrpreluc-wasm-u8.c - src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u1.c - src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u2.c - src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u4.c - src/f32-vbinary/gen/f32-vsub-minmax-wasm-u1.c - src/f32-vbinary/gen/f32-vsub-minmax-wasm-u2.c - src/f32-vbinary/gen/f32-vsub-minmax-wasm-u4.c - src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u1.c - src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u2.c - src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u4.c + src/f32-vbinary/gen/f32-vrsubc-wasm-u1.c + src/f32-vbinary/gen/f32-vrsubc-wasm-u2.c + src/f32-vbinary/gen/f32-vrsubc-wasm-u4.c + src/f32-vbinary/gen/f32-vsub-wasm-u1.c + src/f32-vbinary/gen/f32-vsub-wasm-u2.c + src/f32-vbinary/gen/f32-vsub-wasm-u4.c + src/f32-vbinary/gen/f32-vsubc-wasm-u1.c + src/f32-vbinary/gen/f32-vsubc-wasm-u2.c + src/f32-vbinary/gen/f32-vsubc-wasm-u4.c src/f32-vclamp/gen/f32-vclamp-wasm-u1.c src/f32-vclamp/gen/f32-vclamp-wasm-u2.c src/f32-velu/gen/f32-velu-wasm-rr2-lut16-p3-u1.c diff --git a/cmake/gen/wasmsimd_microkernels.cmake b/cmake/gen/wasmsimd_microkernels.cmake index cfb2f06e4f1..d4bff4df747 100644 --- a/cmake/gen/wasmsimd_microkernels.cmake +++ b/cmake/gen/wasmsimd_microkernels.cmake @@ -121,17 +121,9 @@ SET(PROD_WASMSIMD_MICROKERNEL_SRCS 
src/f32-rsum/gen/f32-rsum-wasmsimd-u16-acc4.c src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmsimd-arm.c src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmsimd-x86.c - src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u16.c - src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u16.c src/f32-vbinary/gen/f32-vadd-wasmsimd-u16.c - src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u16.c - src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u16.c src/f32-vbinary/gen/f32-vaddc-wasmsimd-u16.c - src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u16.c - src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u16.c src/f32-vbinary/gen/f32-vdiv-wasmsimd-u16.c - src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u16.c - src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u16.c src/f32-vbinary/gen/f32-vdivc-wasmsimd-u16.c src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u16.c src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-u16.c @@ -141,25 +133,13 @@ SET(PROD_WASMSIMD_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-u16.c src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u16.c src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u16.c - src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u16.c - src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u16.c src/f32-vbinary/gen/f32-vmul-wasmsimd-u16.c - src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u16.c - src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u16.c src/f32-vbinary/gen/f32-vmulc-wasmsimd-u16.c - src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u16.c - src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u16.c src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u16.c - src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u16.c - src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u16.c src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u16.c src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u16.c src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u16.c - src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u16.c - src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u16.c src/f32-vbinary/gen/f32-vsub-wasmsimd-u16.c - 
src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u16.c - src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u16.c src/f32-vbinary/gen/f32-vsubc-wasmsimd-u16.c src/f32-vclamp/gen/f32-vclamp-wasmsimd-arm-u8.c src/f32-vclamp/gen/f32-vclamp-wasmsimd-x86-u8.c @@ -751,28 +731,12 @@ SET(NON_PROD_WASMSIMD_MICROKERNEL_SRCS src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmsimd-x86-pipelined.c src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmsimd-x86-x2.c src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmsimd-x86-x4.c - src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u4.c - src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u8.c - src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u4.c - src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u8.c src/f32-vbinary/gen/f32-vadd-wasmsimd-u4.c src/f32-vbinary/gen/f32-vadd-wasmsimd-u8.c - src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u4.c - src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u8.c - src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u4.c - src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u8.c src/f32-vbinary/gen/f32-vaddc-wasmsimd-u4.c src/f32-vbinary/gen/f32-vaddc-wasmsimd-u8.c - src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u4.c - src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u8.c - src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u4.c - src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u8.c src/f32-vbinary/gen/f32-vdiv-wasmsimd-u4.c src/f32-vbinary/gen/f32-vdiv-wasmsimd-u8.c - src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u4.c - src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u8.c - src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u4.c - src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u8.c src/f32-vbinary/gen/f32-vdivc-wasmsimd-u4.c src/f32-vbinary/gen/f32-vdivc-wasmsimd-u8.c src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u4.c @@ -791,16 +755,8 @@ SET(NON_PROD_WASMSIMD_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u8.c src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u4.c src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u8.c - 
src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u4.c - src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u8.c - src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u4.c - src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u8.c src/f32-vbinary/gen/f32-vmul-wasmsimd-u4.c src/f32-vbinary/gen/f32-vmul-wasmsimd-u8.c - src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u4.c - src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u8.c - src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u4.c - src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u8.c src/f32-vbinary/gen/f32-vmulc-wasmsimd-u4.c src/f32-vbinary/gen/f32-vmulc-wasmsimd-u8.c src/f32-vbinary/gen/f32-vprelu-wasmsimd-u4.c @@ -809,35 +765,19 @@ SET(NON_PROD_WASMSIMD_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u4.c src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u8.c src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u16.c - src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u4.c - src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u8.c - src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u4.c - src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u8.c src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u4.c src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u8.c src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u4.c src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u8.c src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u16.c - src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u4.c - src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u8.c - src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u4.c - src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u8.c src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u4.c src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u8.c src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u4.c src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u8.c src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u4.c src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u8.c - src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u4.c - src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u8.c - 
src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u4.c - src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u8.c src/f32-vbinary/gen/f32-vsub-wasmsimd-u4.c src/f32-vbinary/gen/f32-vsub-wasmsimd-u8.c - src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u4.c - src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u8.c - src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u4.c - src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u8.c src/f32-vbinary/gen/f32-vsubc-wasmsimd-u4.c src/f32-vbinary/gen/f32-vsubc-wasmsimd-u8.c src/f32-vclamp/gen/f32-vclamp-wasmsimd-arm-u4.c diff --git a/gen/avx512f_microkernels.bzl b/gen/avx512f_microkernels.bzl index b9d9a714671..df2192fffb0 100644 --- a/gen/avx512f_microkernels.bzl +++ b/gen/avx512f_microkernels.bzl @@ -20,22 +20,22 @@ PROD_AVX512F_MICROKERNEL_SRCS = [ "src/f32-rminmax/gen/f32-rmax-avx512f-u64-acc4.c", "src/f32-rminmax/gen/f32-rminmax-avx512f-u64-acc4.c", "src/f32-rsum/gen/f32-rsum-avx512f-u64-acc4.c", - "src/f32-vbinary/gen/f32-vadd-minmax-avx512f-u32.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-avx512f-u32.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-avx512f-u32.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-avx512f-u32.c", + "src/f32-vbinary/gen/f32-vadd-avx512f-u32.c", + "src/f32-vbinary/gen/f32-vaddc-avx512f-u32.c", + "src/f32-vbinary/gen/f32-vdiv-avx512f-u32.c", + "src/f32-vbinary/gen/f32-vdivc-avx512f-u32.c", "src/f32-vbinary/gen/f32-vmax-avx512f-u32.c", "src/f32-vbinary/gen/f32-vmaxc-avx512f-u32.c", "src/f32-vbinary/gen/f32-vmin-avx512f-u32.c", "src/f32-vbinary/gen/f32-vminc-avx512f-u32.c", - "src/f32-vbinary/gen/f32-vmul-minmax-avx512f-u32.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-avx512f-u32.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-avx512f-u32.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-avx512f-u32.c", + "src/f32-vbinary/gen/f32-vmul-avx512f-u32.c", + "src/f32-vbinary/gen/f32-vmulc-avx512f-u32.c", + "src/f32-vbinary/gen/f32-vrdivc-avx512f-u32.c", + "src/f32-vbinary/gen/f32-vrsubc-avx512f-u32.c", 
"src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u32.c", "src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u32.c", - "src/f32-vbinary/gen/f32-vsub-minmax-avx512f-u32.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-avx512f-u32.c", + "src/f32-vbinary/gen/f32-vsub-avx512f-u32.c", + "src/f32-vbinary/gen/f32-vsubc-avx512f-u32.c", "src/f32-vclamp/gen/f32-vclamp-avx512f-u16.c", "src/f32-vcmul/gen/f32-vcmul-avx512f-u32.c", "src/f32-vcopysign/gen/f32-vcopysign-avx512f.c", @@ -160,28 +160,28 @@ NON_PROD_AVX512F_MICROKERNEL_SRCS = [ "src/f32-rsum/gen/f32-rsum-avx512f-u32-acc2.c", "src/f32-rsum/gen/f32-rsum-avx512f-u48-acc3.c", "src/f32-rsum/gen/f32-rsum-avx512f-u64-acc2.c", - "src/f32-vbinary/gen/f32-vadd-minmax-avx512f-u16.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-avx512f-u16.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-avx512f-u16.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-avx512f-u16.c", + "src/f32-vbinary/gen/f32-vadd-avx512f-u16.c", + "src/f32-vbinary/gen/f32-vaddc-avx512f-u16.c", + "src/f32-vbinary/gen/f32-vdiv-avx512f-u16.c", + "src/f32-vbinary/gen/f32-vdivc-avx512f-u16.c", "src/f32-vbinary/gen/f32-vmax-avx512f-u16.c", "src/f32-vbinary/gen/f32-vmaxc-avx512f-u16.c", "src/f32-vbinary/gen/f32-vmin-avx512f-u16.c", "src/f32-vbinary/gen/f32-vminc-avx512f-u16.c", - "src/f32-vbinary/gen/f32-vmul-minmax-avx512f-u16.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-avx512f-u16.c", + "src/f32-vbinary/gen/f32-vmul-avx512f-u16.c", + "src/f32-vbinary/gen/f32-vmulc-avx512f-u16.c", "src/f32-vbinary/gen/f32-vprelu-avx512f-u16.c", "src/f32-vbinary/gen/f32-vprelu-avx512f-u32.c", "src/f32-vbinary/gen/f32-vpreluc-avx512f-u16.c", "src/f32-vbinary/gen/f32-vpreluc-avx512f-u32.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-avx512f-u16.c", + "src/f32-vbinary/gen/f32-vrdivc-avx512f-u16.c", "src/f32-vbinary/gen/f32-vrpreluc-avx512f-u16.c", "src/f32-vbinary/gen/f32-vrpreluc-avx512f-u32.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-avx512f-u16.c", + "src/f32-vbinary/gen/f32-vrsubc-avx512f-u16.c", 
"src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u16.c", "src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u16.c", - "src/f32-vbinary/gen/f32-vsub-minmax-avx512f-u16.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-avx512f-u16.c", + "src/f32-vbinary/gen/f32-vsub-avx512f-u16.c", + "src/f32-vbinary/gen/f32-vsubc-avx512f-u16.c", "src/f32-vclamp/gen/f32-vclamp-avx512f-u32.c", "src/f32-vcmul/gen/f32-vcmul-avx512f-u16.c", "src/f32-vcmul/gen/f32-vcmul-avx512f-u64.c", diff --git a/gen/avx512fp16_microkernels.bzl b/gen/avx512fp16_microkernels.bzl index 50e8ea666ba..aeb7b028a3f 100644 --- a/gen/avx512fp16_microkernels.bzl +++ b/gen/avx512fp16_microkernels.bzl @@ -12,22 +12,22 @@ PROD_AVX512FP16_MICROKERNEL_SRCS = [ "src/f16-igemm/gen/f16-igemm-7x64-minmax-avx512fp16-broadcast.c", "src/f16-rminmax/gen/f16-rmax-avx512fp16-u128-acc4.c", "src/f16-rminmax/gen/f16-rminmax-avx512fp16-u128-acc4.c", - "src/f16-vbinary/gen/f16-vadd-minmax-avx512fp16-u64.c", - "src/f16-vbinary/gen/f16-vaddc-minmax-avx512fp16-u64.c", - "src/f16-vbinary/gen/f16-vdiv-minmax-avx512fp16-u64.c", - "src/f16-vbinary/gen/f16-vdivc-minmax-avx512fp16-u64.c", + "src/f16-vbinary/gen/f16-vadd-avx512fp16-u64.c", + "src/f16-vbinary/gen/f16-vaddc-avx512fp16-u64.c", + "src/f16-vbinary/gen/f16-vdiv-avx512fp16-u64.c", + "src/f16-vbinary/gen/f16-vdivc-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vmax-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vmaxc-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vmin-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vminc-avx512fp16-u64.c", - "src/f16-vbinary/gen/f16-vmul-minmax-avx512fp16-u64.c", - "src/f16-vbinary/gen/f16-vmulc-minmax-avx512fp16-u64.c", - "src/f16-vbinary/gen/f16-vrdivc-minmax-avx512fp16-u64.c", - "src/f16-vbinary/gen/f16-vrsubc-minmax-avx512fp16-u64.c", + "src/f16-vbinary/gen/f16-vmul-avx512fp16-u64.c", + "src/f16-vbinary/gen/f16-vmulc-avx512fp16-u64.c", + "src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u64.c", + "src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u64.c", 
"src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u64.c", - "src/f16-vbinary/gen/f16-vsub-minmax-avx512fp16-u64.c", - "src/f16-vbinary/gen/f16-vsubc-minmax-avx512fp16-u64.c", + "src/f16-vbinary/gen/f16-vsub-avx512fp16-u64.c", + "src/f16-vbinary/gen/f16-vsubc-avx512fp16-u64.c", ] NON_PROD_AVX512FP16_MICROKERNEL_SRCS = [ @@ -69,28 +69,28 @@ NON_PROD_AVX512FP16_MICROKERNEL_SRCS = [ "src/f16-rsum/gen/f16-rsum-avx512fp16-u96-acc3.c", "src/f16-rsum/gen/f16-rsum-avx512fp16-u128-acc2.c", "src/f16-rsum/gen/f16-rsum-avx512fp16-u128-acc4.c", - "src/f16-vbinary/gen/f16-vadd-minmax-avx512fp16-u32.c", - "src/f16-vbinary/gen/f16-vaddc-minmax-avx512fp16-u32.c", - "src/f16-vbinary/gen/f16-vdiv-minmax-avx512fp16-u32.c", - "src/f16-vbinary/gen/f16-vdivc-minmax-avx512fp16-u32.c", + "src/f16-vbinary/gen/f16-vadd-avx512fp16-u32.c", + "src/f16-vbinary/gen/f16-vaddc-avx512fp16-u32.c", + "src/f16-vbinary/gen/f16-vdiv-avx512fp16-u32.c", + "src/f16-vbinary/gen/f16-vdivc-avx512fp16-u32.c", "src/f16-vbinary/gen/f16-vmax-avx512fp16-u32.c", "src/f16-vbinary/gen/f16-vmaxc-avx512fp16-u32.c", "src/f16-vbinary/gen/f16-vmin-avx512fp16-u32.c", "src/f16-vbinary/gen/f16-vminc-avx512fp16-u32.c", - "src/f16-vbinary/gen/f16-vmul-minmax-avx512fp16-u32.c", - "src/f16-vbinary/gen/f16-vmulc-minmax-avx512fp16-u32.c", + "src/f16-vbinary/gen/f16-vmul-avx512fp16-u32.c", + "src/f16-vbinary/gen/f16-vmulc-avx512fp16-u32.c", "src/f16-vbinary/gen/f16-vprelu-avx512fp16-u32.c", "src/f16-vbinary/gen/f16-vprelu-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u32.c", "src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u64.c", - "src/f16-vbinary/gen/f16-vrdivc-minmax-avx512fp16-u32.c", + "src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u32.c", "src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u32.c", "src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u64.c", - "src/f16-vbinary/gen/f16-vrsubc-minmax-avx512fp16-u32.c", + "src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u32.c", 
"src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u32.c", "src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u32.c", - "src/f16-vbinary/gen/f16-vsub-minmax-avx512fp16-u32.c", - "src/f16-vbinary/gen/f16-vsubc-minmax-avx512fp16-u32.c", + "src/f16-vbinary/gen/f16-vsub-avx512fp16-u32.c", + "src/f16-vbinary/gen/f16-vsubc-avx512fp16-u32.c", "src/f16-vsqrt/gen/f16-vsqrt-avx512fp16-sqrt-u32.c", "src/f16-vsqrt/gen/f16-vsqrt-avx512fp16-sqrt-u64.c", "src/f16-vsqrt/gen/f16-vsqrt-avx512fp16-sqrt-u128.c", diff --git a/gen/avx_microkernels.bzl b/gen/avx_microkernels.bzl index ee8f9f70cf1..67231286ef2 100644 --- a/gen/avx_microkernels.bzl +++ b/gen/avx_microkernels.bzl @@ -28,22 +28,22 @@ PROD_AVX_MICROKERNEL_SRCS = [ "src/f32-rminmax/gen/f32-rmax-avx-u32-acc4.c", "src/f32-rminmax/gen/f32-rminmax-avx-u32-acc4.c", "src/f32-rsum/gen/f32-rsum-avx-u32-acc4.c", - "src/f32-vbinary/gen/f32-vadd-minmax-avx-u16.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-avx-u16.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-avx-u16.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-avx-u16.c", + "src/f32-vbinary/gen/f32-vadd-avx-u16.c", + "src/f32-vbinary/gen/f32-vaddc-avx-u16.c", + "src/f32-vbinary/gen/f32-vdiv-avx-u16.c", + "src/f32-vbinary/gen/f32-vdivc-avx-u16.c", "src/f32-vbinary/gen/f32-vmax-avx-u16.c", "src/f32-vbinary/gen/f32-vmaxc-avx-u16.c", "src/f32-vbinary/gen/f32-vmin-avx-u16.c", "src/f32-vbinary/gen/f32-vminc-avx-u16.c", - "src/f32-vbinary/gen/f32-vmul-minmax-avx-u16.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-avx-u16.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-avx-u16.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-avx-u16.c", + "src/f32-vbinary/gen/f32-vmul-avx-u16.c", + "src/f32-vbinary/gen/f32-vmulc-avx-u16.c", + "src/f32-vbinary/gen/f32-vrdivc-avx-u16.c", + "src/f32-vbinary/gen/f32-vrsubc-avx-u16.c", "src/f32-vbinary/gen/f32-vsqrdiff-avx-u16.c", "src/f32-vbinary/gen/f32-vsqrdiffc-avx-u16.c", - "src/f32-vbinary/gen/f32-vsub-minmax-avx-u16.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-avx-u16.c", + 
"src/f32-vbinary/gen/f32-vsub-avx-u16.c", + "src/f32-vbinary/gen/f32-vsubc-avx-u16.c", "src/f32-vclamp/gen/f32-vclamp-avx-u16.c", "src/f32-vcopysign/gen/f32-vcopysign-avx.c", "src/f32-vcopysign/gen/f32-vcopysignc-avx.c", @@ -212,28 +212,28 @@ NON_PROD_AVX_MICROKERNEL_SRCS = [ "src/f32-rsum/gen/f32-rsum-avx-u16-acc2.c", "src/f32-rsum/gen/f32-rsum-avx-u24-acc3.c", "src/f32-rsum/gen/f32-rsum-avx-u32-acc2.c", - "src/f32-vbinary/gen/f32-vadd-minmax-avx-u8.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-avx-u8.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-avx-u8.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-avx-u8.c", + "src/f32-vbinary/gen/f32-vadd-avx-u8.c", + "src/f32-vbinary/gen/f32-vaddc-avx-u8.c", + "src/f32-vbinary/gen/f32-vdiv-avx-u8.c", + "src/f32-vbinary/gen/f32-vdivc-avx-u8.c", "src/f32-vbinary/gen/f32-vmax-avx-u8.c", "src/f32-vbinary/gen/f32-vmaxc-avx-u8.c", "src/f32-vbinary/gen/f32-vmin-avx-u8.c", "src/f32-vbinary/gen/f32-vminc-avx-u8.c", - "src/f32-vbinary/gen/f32-vmul-minmax-avx-u8.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-avx-u8.c", + "src/f32-vbinary/gen/f32-vmul-avx-u8.c", + "src/f32-vbinary/gen/f32-vmulc-avx-u8.c", "src/f32-vbinary/gen/f32-vprelu-avx-u8.c", "src/f32-vbinary/gen/f32-vprelu-avx-u16.c", "src/f32-vbinary/gen/f32-vpreluc-avx-u8.c", "src/f32-vbinary/gen/f32-vpreluc-avx-u16.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-avx-u8.c", + "src/f32-vbinary/gen/f32-vrdivc-avx-u8.c", "src/f32-vbinary/gen/f32-vrpreluc-avx-u8.c", "src/f32-vbinary/gen/f32-vrpreluc-avx-u16.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-avx-u8.c", + "src/f32-vbinary/gen/f32-vrsubc-avx-u8.c", "src/f32-vbinary/gen/f32-vsqrdiff-avx-u8.c", "src/f32-vbinary/gen/f32-vsqrdiffc-avx-u8.c", - "src/f32-vbinary/gen/f32-vsub-minmax-avx-u8.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-avx-u8.c", + "src/f32-vbinary/gen/f32-vsub-avx-u8.c", + "src/f32-vbinary/gen/f32-vsubc-avx-u8.c", "src/f32-vclamp/gen/f32-vclamp-avx-u8.c", "src/f32-velu/gen/f32-velu-avx-rr2-lut4-p4-perm-u8.c", 
"src/f32-velu/gen/f32-velu-avx-rr2-lut4-p4-perm-u16.c", diff --git a/gen/f16c_microkernels.bzl b/gen/f16c_microkernels.bzl index 16f43ba38be..383bdde3cd5 100644 --- a/gen/f16c_microkernels.bzl +++ b/gen/f16c_microkernels.bzl @@ -16,22 +16,22 @@ PROD_F16C_MICROKERNEL_SRCS = [ "src/f16-maxpool/f16-maxpool-9p8x-minmax-f16c-c8.c", "src/f16-prelu/gen/f16-prelu-f16c-2x16.c", "src/f16-rminmax/f16-rmax-f16c-u32.c", - "src/f16-vbinary/gen/f16-vadd-minmax-f16c-u16.c", - "src/f16-vbinary/gen/f16-vaddc-minmax-f16c-u16.c", - "src/f16-vbinary/gen/f16-vdiv-minmax-f16c-u8.c", - "src/f16-vbinary/gen/f16-vdivc-minmax-f16c-u8.c", + "src/f16-vbinary/gen/f16-vadd-f16c-u16.c", + "src/f16-vbinary/gen/f16-vaddc-f16c-u16.c", + "src/f16-vbinary/gen/f16-vdiv-f16c-u8.c", + "src/f16-vbinary/gen/f16-vdivc-f16c-u8.c", "src/f16-vbinary/gen/f16-vmax-f16c-u16.c", "src/f16-vbinary/gen/f16-vmaxc-f16c-u16.c", "src/f16-vbinary/gen/f16-vmin-f16c-u16.c", "src/f16-vbinary/gen/f16-vminc-f16c-u16.c", - "src/f16-vbinary/gen/f16-vmul-minmax-f16c-u16.c", - "src/f16-vbinary/gen/f16-vmulc-minmax-f16c-u16.c", - "src/f16-vbinary/gen/f16-vrdivc-minmax-f16c-u8.c", - "src/f16-vbinary/gen/f16-vrsubc-minmax-f16c-u16.c", + "src/f16-vbinary/gen/f16-vmul-f16c-u16.c", + "src/f16-vbinary/gen/f16-vmulc-f16c-u16.c", + "src/f16-vbinary/gen/f16-vrdivc-f16c-u8.c", + "src/f16-vbinary/gen/f16-vrsubc-f16c-u16.c", "src/f16-vbinary/gen/f16-vsqrdiff-f16c-u16.c", "src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u16.c", - "src/f16-vbinary/gen/f16-vsub-minmax-f16c-u16.c", - "src/f16-vbinary/gen/f16-vsubc-minmax-f16c-u16.c", + "src/f16-vbinary/gen/f16-vsub-f16c-u16.c", + "src/f16-vbinary/gen/f16-vsubc-f16c-u16.c", "src/f16-vclamp/gen/f16-vclamp-f16c-u16.c", "src/f16-vhswish/gen/f16-vhswish-f16c-u16.c", "src/f16-vlrelu/gen/f16-vlrelu-f16c-u16.c", @@ -62,28 +62,28 @@ NON_PROD_F16C_MICROKERNEL_SRCS = [ "src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c24.c", "src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c32.c", 
"src/f16-prelu/gen/f16-prelu-f16c-2x8.c", - "src/f16-vbinary/gen/f16-vadd-minmax-f16c-u8.c", - "src/f16-vbinary/gen/f16-vaddc-minmax-f16c-u8.c", - "src/f16-vbinary/gen/f16-vdiv-minmax-f16c-u16.c", - "src/f16-vbinary/gen/f16-vdivc-minmax-f16c-u16.c", + "src/f16-vbinary/gen/f16-vadd-f16c-u8.c", + "src/f16-vbinary/gen/f16-vaddc-f16c-u8.c", + "src/f16-vbinary/gen/f16-vdiv-f16c-u16.c", + "src/f16-vbinary/gen/f16-vdivc-f16c-u16.c", "src/f16-vbinary/gen/f16-vmax-f16c-u8.c", "src/f16-vbinary/gen/f16-vmaxc-f16c-u8.c", "src/f16-vbinary/gen/f16-vmin-f16c-u8.c", "src/f16-vbinary/gen/f16-vminc-f16c-u8.c", - "src/f16-vbinary/gen/f16-vmul-minmax-f16c-u8.c", - "src/f16-vbinary/gen/f16-vmulc-minmax-f16c-u8.c", + "src/f16-vbinary/gen/f16-vmul-f16c-u8.c", + "src/f16-vbinary/gen/f16-vmulc-f16c-u8.c", "src/f16-vbinary/gen/f16-vprelu-f16c-u8.c", "src/f16-vbinary/gen/f16-vprelu-f16c-u16.c", "src/f16-vbinary/gen/f16-vpreluc-f16c-u8.c", "src/f16-vbinary/gen/f16-vpreluc-f16c-u16.c", - "src/f16-vbinary/gen/f16-vrdivc-minmax-f16c-u16.c", + "src/f16-vbinary/gen/f16-vrdivc-f16c-u16.c", "src/f16-vbinary/gen/f16-vrpreluc-f16c-u8.c", "src/f16-vbinary/gen/f16-vrpreluc-f16c-u16.c", - "src/f16-vbinary/gen/f16-vrsubc-minmax-f16c-u8.c", + "src/f16-vbinary/gen/f16-vrsubc-f16c-u8.c", "src/f16-vbinary/gen/f16-vsqrdiff-f16c-u8.c", "src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u8.c", - "src/f16-vbinary/gen/f16-vsub-minmax-f16c-u8.c", - "src/f16-vbinary/gen/f16-vsubc-minmax-f16c-u8.c", + "src/f16-vbinary/gen/f16-vsub-f16c-u8.c", + "src/f16-vbinary/gen/f16-vsubc-f16c-u8.c", "src/f16-vclamp/gen/f16-vclamp-f16c-u8.c", "src/f16-vhswish/gen/f16-vhswish-f16c-u8.c", "src/f16-vlrelu/gen/f16-vlrelu-f16c-u8.c", diff --git a/gen/fp16arith_microkernels.bzl b/gen/fp16arith_microkernels.bzl index e70fc8197c8..71ec6a88c9f 100644 --- a/gen/fp16arith_microkernels.bzl +++ b/gen/fp16arith_microkernels.bzl @@ -6,22 +6,22 @@ Auto-generated file. Do not edit! 
""" PROD_FP16ARITH_MICROKERNEL_SRCS = [ - "src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-u2.c", - "src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-u2.c", - "src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-u2.c", + "src/f16-vbinary/gen/f16-vdiv-fp16arith-u2.c", + "src/f16-vbinary/gen/f16-vdivc-fp16arith-u2.c", + "src/f16-vbinary/gen/f16-vrdivc-fp16arith-u2.c", ] NON_PROD_FP16ARITH_MICROKERNEL_SRCS = [ - "src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-u1.c", - "src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-u2.c", - "src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-u4.c", - "src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-u1.c", - "src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-u2.c", - "src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-u4.c", - "src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-u1.c", - "src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-u4.c", - "src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-u1.c", - "src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-u4.c", + "src/f16-vbinary/gen/f16-vadd-fp16arith-u1.c", + "src/f16-vbinary/gen/f16-vadd-fp16arith-u2.c", + "src/f16-vbinary/gen/f16-vadd-fp16arith-u4.c", + "src/f16-vbinary/gen/f16-vaddc-fp16arith-u1.c", + "src/f16-vbinary/gen/f16-vaddc-fp16arith-u2.c", + "src/f16-vbinary/gen/f16-vaddc-fp16arith-u4.c", + "src/f16-vbinary/gen/f16-vdiv-fp16arith-u1.c", + "src/f16-vbinary/gen/f16-vdiv-fp16arith-u4.c", + "src/f16-vbinary/gen/f16-vdivc-fp16arith-u1.c", + "src/f16-vbinary/gen/f16-vdivc-fp16arith-u4.c", "src/f16-vbinary/gen/f16-vmax-fp16arith-u1.c", "src/f16-vbinary/gen/f16-vmax-fp16arith-u2.c", "src/f16-vbinary/gen/f16-vmax-fp16arith-u4.c", @@ -34,29 +34,29 @@ NON_PROD_FP16ARITH_MICROKERNEL_SRCS = [ "src/f16-vbinary/gen/f16-vminc-fp16arith-u1.c", "src/f16-vbinary/gen/f16-vminc-fp16arith-u2.c", "src/f16-vbinary/gen/f16-vminc-fp16arith-u4.c", - "src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-u1.c", - "src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-u2.c", - "src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-u4.c", - 
"src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-u1.c", - "src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-u2.c", - "src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-u4.c", - "src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-u1.c", - "src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-u4.c", - "src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-u1.c", - "src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-u2.c", - "src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-u4.c", + "src/f16-vbinary/gen/f16-vmul-fp16arith-u1.c", + "src/f16-vbinary/gen/f16-vmul-fp16arith-u2.c", + "src/f16-vbinary/gen/f16-vmul-fp16arith-u4.c", + "src/f16-vbinary/gen/f16-vmulc-fp16arith-u1.c", + "src/f16-vbinary/gen/f16-vmulc-fp16arith-u2.c", + "src/f16-vbinary/gen/f16-vmulc-fp16arith-u4.c", + "src/f16-vbinary/gen/f16-vrdivc-fp16arith-u1.c", + "src/f16-vbinary/gen/f16-vrdivc-fp16arith-u4.c", + "src/f16-vbinary/gen/f16-vrsubc-fp16arith-u1.c", + "src/f16-vbinary/gen/f16-vrsubc-fp16arith-u2.c", + "src/f16-vbinary/gen/f16-vrsubc-fp16arith-u4.c", "src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u1.c", "src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u2.c", "src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u4.c", "src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u1.c", "src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u2.c", "src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u4.c", - "src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-u1.c", - "src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-u2.c", - "src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-u4.c", - "src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-u1.c", - "src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-u2.c", - "src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-u4.c", + "src/f16-vbinary/gen/f16-vsub-fp16arith-u1.c", + "src/f16-vbinary/gen/f16-vsub-fp16arith-u2.c", + "src/f16-vbinary/gen/f16-vsub-fp16arith-u4.c", + "src/f16-vbinary/gen/f16-vsubc-fp16arith-u1.c", + "src/f16-vbinary/gen/f16-vsubc-fp16arith-u2.c", + "src/f16-vbinary/gen/f16-vsubc-fp16arith-u4.c", 
"src/f16-vsqrt/gen/f16-vsqrt-fp16arith-sqrt-u1.c", "src/f16-vsqrt/gen/f16-vsqrt-fp16arith-sqrt-u2.c", "src/f16-vsqrt/gen/f16-vsqrt-fp16arith-sqrt-u4.c", diff --git a/gen/hvx_microkernels.bzl b/gen/hvx_microkernels.bzl index 53b98480e24..c011b417135 100644 --- a/gen/hvx_microkernels.bzl +++ b/gen/hvx_microkernels.bzl @@ -63,12 +63,12 @@ NON_PROD_HVX_MICROKERNEL_SRCS = [ "src/f32-spmm/gen/f32-spmm-128x1-minmax-hvx-x2.c", "src/f32-spmm/gen/f32-spmm-128x1-minmax-hvx-x4.c", "src/f32-spmm/gen/f32-spmm-128x1-minmax-hvx.c", - "src/f32-vbinary/gen/f32-vadd-minmax-hvx-u32.c", - "src/f32-vbinary/gen/f32-vadd-minmax-hvx-u64.c", - "src/f32-vbinary/gen/f32-vadd-minmax-hvx-u128.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-hvx-u32.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-hvx-u64.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-hvx-u128.c", + "src/f32-vbinary/gen/f32-vadd-hvx-u32.c", + "src/f32-vbinary/gen/f32-vadd-hvx-u64.c", + "src/f32-vbinary/gen/f32-vadd-hvx-u128.c", + "src/f32-vbinary/gen/f32-vaddc-hvx-u32.c", + "src/f32-vbinary/gen/f32-vaddc-hvx-u64.c", + "src/f32-vbinary/gen/f32-vaddc-hvx-u128.c", "src/f32-vbinary/gen/f32-vmax-hvx-u32.c", "src/f32-vbinary/gen/f32-vmax-hvx-u64.c", "src/f32-vbinary/gen/f32-vmax-hvx-u128.c", @@ -81,27 +81,27 @@ NON_PROD_HVX_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vminc-hvx-u32.c", "src/f32-vbinary/gen/f32-vminc-hvx-u64.c", "src/f32-vbinary/gen/f32-vminc-hvx-u128.c", - "src/f32-vbinary/gen/f32-vmul-minmax-hvx-u32.c", - "src/f32-vbinary/gen/f32-vmul-minmax-hvx-u64.c", - "src/f32-vbinary/gen/f32-vmul-minmax-hvx-u128.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-hvx-u32.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-hvx-u64.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-hvx-u128.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-hvx-u32.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-hvx-u64.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-hvx-u128.c", + "src/f32-vbinary/gen/f32-vmul-hvx-u32.c", + "src/f32-vbinary/gen/f32-vmul-hvx-u64.c", + 
"src/f32-vbinary/gen/f32-vmul-hvx-u128.c", + "src/f32-vbinary/gen/f32-vmulc-hvx-u32.c", + "src/f32-vbinary/gen/f32-vmulc-hvx-u64.c", + "src/f32-vbinary/gen/f32-vmulc-hvx-u128.c", + "src/f32-vbinary/gen/f32-vrsubc-hvx-u32.c", + "src/f32-vbinary/gen/f32-vrsubc-hvx-u64.c", + "src/f32-vbinary/gen/f32-vrsubc-hvx-u128.c", "src/f32-vbinary/gen/f32-vsqrdiff-hvx-u32.c", "src/f32-vbinary/gen/f32-vsqrdiff-hvx-u64.c", "src/f32-vbinary/gen/f32-vsqrdiff-hvx-u128.c", "src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u32.c", "src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u64.c", "src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u128.c", - "src/f32-vbinary/gen/f32-vsub-minmax-hvx-u32.c", - "src/f32-vbinary/gen/f32-vsub-minmax-hvx-u64.c", - "src/f32-vbinary/gen/f32-vsub-minmax-hvx-u128.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-hvx-u32.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-hvx-u64.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-hvx-u128.c", + "src/f32-vbinary/gen/f32-vsub-hvx-u32.c", + "src/f32-vbinary/gen/f32-vsub-hvx-u64.c", + "src/f32-vbinary/gen/f32-vsub-hvx-u128.c", + "src/f32-vbinary/gen/f32-vsubc-hvx-u32.c", + "src/f32-vbinary/gen/f32-vsubc-hvx-u64.c", + "src/f32-vbinary/gen/f32-vsubc-hvx-u128.c", "src/f32-vgelu/gen/f32-vgelu-hvx-rational-12-10-div.c", "src/f32-vunary/gen/f32-vabs-hvx.c", "src/f32-vunary/gen/f32-vneg-hvx.c", diff --git a/gen/neon_aarch64_microkernels.bzl b/gen/neon_aarch64_microkernels.bzl index 7a480205c08..f52ca5391c0 100644 --- a/gen/neon_aarch64_microkernels.bzl +++ b/gen/neon_aarch64_microkernels.bzl @@ -6,9 +6,9 @@ Auto-generated file. Do not edit! 
""" PROD_NEON_AARCH64_MICROKERNEL_SRCS = [ - "src/f32-vbinary/gen/f32-vdiv-minmax-aarch64-neon-u8.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-aarch64-neon-u8.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-aarch64-neon-u8.c", + "src/f32-vbinary/gen/f32-vdiv-aarch64-neon-u8.c", + "src/f32-vbinary/gen/f32-vdivc-aarch64-neon-u8.c", + "src/f32-vbinary/gen/f32-vrdivc-aarch64-neon-u8.c", "src/f32-vsqrt/gen/f32-vsqrt-aarch64-neon-sqrt-u4.c", "src/x8-lut/gen/x8-lut-aarch64-neon-tbx128x4-u64.c", "src/x8-packq/x8-packq-aarch64-neon-f32qp8-u2.c", @@ -17,9 +17,9 @@ PROD_NEON_AARCH64_MICROKERNEL_SRCS = [ ] NON_PROD_NEON_AARCH64_MICROKERNEL_SRCS = [ - "src/f32-vbinary/gen/f32-vdiv-minmax-aarch64-neon-u4.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-aarch64-neon-u4.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-aarch64-neon-u4.c", + "src/f32-vbinary/gen/f32-vdiv-aarch64-neon-u4.c", + "src/f32-vbinary/gen/f32-vdivc-aarch64-neon-u4.c", + "src/f32-vbinary/gen/f32-vrdivc-aarch64-neon-u4.c", "src/f32-vsqrt/gen/f32-vsqrt-aarch64-neon-sqrt-u8.c", "src/f32-vsqrt/gen/f32-vsqrt-aarch64-neon-sqrt-u16.c", "src/x8-lut/gen/x8-lut-aarch64-neon-tbx128x4-u16.c", diff --git a/gen/neon_microkernels.bzl b/gen/neon_microkernels.bzl index a7d12c81704..ad95a864633 100644 --- a/gen/neon_microkernels.bzl +++ b/gen/neon_microkernels.bzl @@ -50,19 +50,19 @@ PROD_NEON_MICROKERNEL_SRCS = [ "src/f32-rminmax/gen/f32-rminmax-neon-u16-acc4.c", "src/f32-rsum/gen/f32-rsum-neon-u16-acc4.c", "src/f32-spmm/gen/f32-spmm-32x1-minmax-neon.c", - "src/f32-vbinary/gen/f32-vadd-minmax-neon-u8.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-neon-u8.c", + "src/f32-vbinary/gen/f32-vadd-neon-u8.c", + "src/f32-vbinary/gen/f32-vaddc-neon-u8.c", "src/f32-vbinary/gen/f32-vmax-neon-u8.c", "src/f32-vbinary/gen/f32-vmaxc-neon-u8.c", "src/f32-vbinary/gen/f32-vmin-neon-u8.c", "src/f32-vbinary/gen/f32-vminc-neon-u8.c", - "src/f32-vbinary/gen/f32-vmul-minmax-neon-u8.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-neon-u8.c", - 
"src/f32-vbinary/gen/f32-vrsubc-minmax-neon-u8.c", + "src/f32-vbinary/gen/f32-vmul-neon-u8.c", + "src/f32-vbinary/gen/f32-vmulc-neon-u8.c", + "src/f32-vbinary/gen/f32-vrsubc-neon-u8.c", "src/f32-vbinary/gen/f32-vsqrdiff-neon-u8.c", "src/f32-vbinary/gen/f32-vsqrdiffc-neon-u8.c", - "src/f32-vbinary/gen/f32-vsub-minmax-neon-u8.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-neon-u8.c", + "src/f32-vbinary/gen/f32-vsub-neon-u8.c", + "src/f32-vbinary/gen/f32-vsubc-neon-u8.c", "src/f32-vclamp/gen/f32-vclamp-neon-u16.c", "src/f32-vcmul/gen/f32-vcmul-neon-u8.c", "src/f32-vcopysign/gen/f32-vcopysign-neon.c", @@ -413,25 +413,25 @@ NON_PROD_NEON_MICROKERNEL_SRCS = [ "src/f32-spmm/gen/f32-spmm-16x1-minmax-neon.c", "src/f32-spmm/gen/f32-spmm-32x1-minmax-neon-pipelined.c", "src/f32-spmm/gen/f32-spmm-32x1-minmax-neon-x2.c", - "src/f32-vbinary/gen/f32-vadd-minmax-neon-u4.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-neon-u4.c", + "src/f32-vbinary/gen/f32-vadd-neon-u4.c", + "src/f32-vbinary/gen/f32-vaddc-neon-u4.c", "src/f32-vbinary/gen/f32-vmax-neon-u4.c", "src/f32-vbinary/gen/f32-vmaxc-neon-u4.c", "src/f32-vbinary/gen/f32-vmin-neon-u4.c", "src/f32-vbinary/gen/f32-vminc-neon-u4.c", - "src/f32-vbinary/gen/f32-vmul-minmax-neon-u4.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-neon-u4.c", + "src/f32-vbinary/gen/f32-vmul-neon-u4.c", + "src/f32-vbinary/gen/f32-vmulc-neon-u4.c", "src/f32-vbinary/gen/f32-vprelu-neon-u4.c", "src/f32-vbinary/gen/f32-vprelu-neon-u8.c", "src/f32-vbinary/gen/f32-vpreluc-neon-u4.c", "src/f32-vbinary/gen/f32-vpreluc-neon-u8.c", "src/f32-vbinary/gen/f32-vrpreluc-neon-u4.c", "src/f32-vbinary/gen/f32-vrpreluc-neon-u8.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-neon-u4.c", + "src/f32-vbinary/gen/f32-vrsubc-neon-u4.c", "src/f32-vbinary/gen/f32-vsqrdiff-neon-u4.c", "src/f32-vbinary/gen/f32-vsqrdiffc-neon-u4.c", - "src/f32-vbinary/gen/f32-vsub-minmax-neon-u4.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-neon-u4.c", + "src/f32-vbinary/gen/f32-vsub-neon-u4.c", + 
"src/f32-vbinary/gen/f32-vsubc-neon-u4.c", "src/f32-vclamp/gen/f32-vclamp-neon-u4.c", "src/f32-vclamp/gen/f32-vclamp-neon-u8.c", "src/f32-vcmul/gen/f32-vcmul-neon-u4.c", diff --git a/gen/neonfp16arith_aarch64_microkernels.bzl b/gen/neonfp16arith_aarch64_microkernels.bzl index 2cfaf43791b..0cbfdf90606 100644 --- a/gen/neonfp16arith_aarch64_microkernels.bzl +++ b/gen/neonfp16arith_aarch64_microkernels.bzl @@ -6,17 +6,17 @@ Auto-generated file. Do not edit! """ PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS = [ - "src/f16-vbinary/gen/f16-vdiv-minmax-aarch64-neonfp16arith-u8.c", - "src/f16-vbinary/gen/f16-vdivc-minmax-aarch64-neonfp16arith-u8.c", - "src/f16-vbinary/gen/f16-vrdivc-minmax-aarch64-neonfp16arith-u8.c", + "src/f16-vbinary/gen/f16-vdiv-aarch64-neonfp16arith-u8.c", + "src/f16-vbinary/gen/f16-vdivc-aarch64-neonfp16arith-u8.c", + "src/f16-vbinary/gen/f16-vrdivc-aarch64-neonfp16arith-u8.c", "src/f16-vsqrt/gen/f16-vsqrt-aarch64-neonfp16arith-sqrt-u8.c", "src/f16-vtanh/gen/f16-vtanh-aarch64-neonfp16arith-expm1minus-rr1-p3h2ts-div-u32.c", ] NON_PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS = [ - "src/f16-vbinary/gen/f16-vdiv-minmax-aarch64-neonfp16arith-u16.c", - "src/f16-vbinary/gen/f16-vdivc-minmax-aarch64-neonfp16arith-u16.c", - "src/f16-vbinary/gen/f16-vrdivc-minmax-aarch64-neonfp16arith-u16.c", + "src/f16-vbinary/gen/f16-vdiv-aarch64-neonfp16arith-u16.c", + "src/f16-vbinary/gen/f16-vdivc-aarch64-neonfp16arith-u16.c", + "src/f16-vbinary/gen/f16-vrdivc-aarch64-neonfp16arith-u16.c", "src/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-u8.c", "src/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-u16.c", "src/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-u24.c", diff --git a/gen/neonfp16arith_microkernels.bzl b/gen/neonfp16arith_microkernels.bzl index caa3f5d3980..e2d2fbb766b 100644 --- a/gen/neonfp16arith_microkernels.bzl +++ b/gen/neonfp16arith_microkernels.bzl @@ -43,19 +43,19 @@ PROD_NEONFP16ARITH_MICROKERNEL_SRCS = [ 
"src/f16-rminmax/gen/f16-rmax-neonfp16arith-u32-acc4.c", "src/f16-rminmax/gen/f16-rminmax-neonfp16arith-u32-acc4.c", "src/f16-spmm/gen/f16-spmm-32x1-minmax-neonfp16arith-pipelined.c", - "src/f16-vbinary/gen/f16-vadd-minmax-neonfp16arith-u16.c", - "src/f16-vbinary/gen/f16-vaddc-minmax-neonfp16arith-u16.c", + "src/f16-vbinary/gen/f16-vadd-neonfp16arith-u16.c", + "src/f16-vbinary/gen/f16-vaddc-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vmax-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vmaxc-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vmin-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vminc-neonfp16arith-u16.c", - "src/f16-vbinary/gen/f16-vmul-minmax-neonfp16arith-u16.c", - "src/f16-vbinary/gen/f16-vmulc-minmax-neonfp16arith-u16.c", - "src/f16-vbinary/gen/f16-vrsubc-minmax-neonfp16arith-u16.c", + "src/f16-vbinary/gen/f16-vmul-neonfp16arith-u16.c", + "src/f16-vbinary/gen/f16-vmulc-neonfp16arith-u16.c", + "src/f16-vbinary/gen/f16-vrsubc-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u16.c", - "src/f16-vbinary/gen/f16-vsub-minmax-neonfp16arith-u16.c", - "src/f16-vbinary/gen/f16-vsubc-minmax-neonfp16arith-u16.c", + "src/f16-vbinary/gen/f16-vsub-neonfp16arith-u16.c", + "src/f16-vbinary/gen/f16-vsubc-neonfp16arith-u16.c", "src/f16-vclamp/gen/f16-vclamp-neonfp16arith-u16.c", "src/f16-vcmul/gen/f16-vcmul-neonfp16arith-u16.c", "src/f16-velu/gen/f16-velu-neonfp16arith-rr1-p3-u16.c", @@ -259,25 +259,25 @@ NON_PROD_NEONFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-spmm/gen/f16-spmm-24x1-minmax-neonfp16arith.c", "src/f16-spmm/gen/f16-spmm-32x1-minmax-neonfp16arith-x2.c", "src/f16-spmm/gen/f16-spmm-32x1-minmax-neonfp16arith.c", - "src/f16-vbinary/gen/f16-vadd-minmax-neonfp16arith-u8.c", - "src/f16-vbinary/gen/f16-vaddc-minmax-neonfp16arith-u8.c", + "src/f16-vbinary/gen/f16-vadd-neonfp16arith-u8.c", + "src/f16-vbinary/gen/f16-vaddc-neonfp16arith-u8.c", 
"src/f16-vbinary/gen/f16-vmax-neonfp16arith-u8.c", "src/f16-vbinary/gen/f16-vmaxc-neonfp16arith-u8.c", "src/f16-vbinary/gen/f16-vmin-neonfp16arith-u8.c", "src/f16-vbinary/gen/f16-vminc-neonfp16arith-u8.c", - "src/f16-vbinary/gen/f16-vmul-minmax-neonfp16arith-u8.c", - "src/f16-vbinary/gen/f16-vmulc-minmax-neonfp16arith-u8.c", + "src/f16-vbinary/gen/f16-vmul-neonfp16arith-u8.c", + "src/f16-vbinary/gen/f16-vmulc-neonfp16arith-u8.c", "src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u8.c", "src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u8.c", "src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u8.c", "src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u16.c", - "src/f16-vbinary/gen/f16-vrsubc-minmax-neonfp16arith-u8.c", + "src/f16-vbinary/gen/f16-vrsubc-neonfp16arith-u8.c", "src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u8.c", "src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u8.c", - "src/f16-vbinary/gen/f16-vsub-minmax-neonfp16arith-u8.c", - "src/f16-vbinary/gen/f16-vsubc-minmax-neonfp16arith-u8.c", + "src/f16-vbinary/gen/f16-vsub-neonfp16arith-u8.c", + "src/f16-vbinary/gen/f16-vsubc-neonfp16arith-u8.c", "src/f16-vclamp/gen/f16-vclamp-neonfp16arith-u8.c", "src/f16-vcmul/gen/f16-vcmul-neonfp16arith-u8.c", "src/f16-vcmul/gen/f16-vcmul-neonfp16arith-u32.c", diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl index 397b67d9954..ffd732f685c 100644 --- a/gen/rvv_microkernels.bzl +++ b/gen/rvv_microkernels.bzl @@ -26,22 +26,22 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-rvv-u4v.c", "src/f32-rminmax/gen/f32-rmax-rvv-u8v.c", "src/f32-rminmax/gen/f32-rminmax-rvv-u8v.c", - "src/f32-vbinary/gen/f32-vadd-minmax-rvv-u8v.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-rvv-u8v.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-rvv-u8v.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-rvv-u8v.c", + "src/f32-vbinary/gen/f32-vadd-rvv-u8v.c", + 
"src/f32-vbinary/gen/f32-vaddc-rvv-u8v.c", + "src/f32-vbinary/gen/f32-vdiv-rvv-u8v.c", + "src/f32-vbinary/gen/f32-vdivc-rvv-u8v.c", "src/f32-vbinary/gen/f32-vmax-rvv-u8v.c", "src/f32-vbinary/gen/f32-vmaxc-rvv-u8v.c", "src/f32-vbinary/gen/f32-vmin-rvv-u8v.c", "src/f32-vbinary/gen/f32-vminc-rvv-u8v.c", - "src/f32-vbinary/gen/f32-vmul-minmax-rvv-u8v.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-rvv-u8v.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-rvv-u8v.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-rvv-u8v.c", + "src/f32-vbinary/gen/f32-vmul-rvv-u8v.c", + "src/f32-vbinary/gen/f32-vmulc-rvv-u8v.c", + "src/f32-vbinary/gen/f32-vrdivc-rvv-u8v.c", + "src/f32-vbinary/gen/f32-vrsubc-rvv-u8v.c", "src/f32-vbinary/gen/f32-vsqrdiff-rvv-u8v.c", "src/f32-vbinary/gen/f32-vsqrdiffc-rvv-u8v.c", - "src/f32-vbinary/gen/f32-vsub-minmax-rvv-u8v.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-rvv-u8v.c", + "src/f32-vbinary/gen/f32-vsub-rvv-u8v.c", + "src/f32-vbinary/gen/f32-vsubc-rvv-u8v.c", "src/f32-vcmul/gen/f32-vcmul-rvv-u2v.c", "src/f32-vlrelu/gen/f32-vlrelu-rvv-u4v.c", "src/f32-vrelu/gen/f32-vrelu-rvv-u4v.c", @@ -97,24 +97,22 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-rminmax/gen/f32-rminmax-rvv-u2v.c", "src/f32-rminmax/gen/f32-rminmax-rvv-u4v.c", "src/f32-rsum/f32-rsum-rvv-u1v.c", - "src/f32-vbinary/gen/f32-vadd-minmax-rvv-u4v.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-rvv-u4v.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-rvv-u4v.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-rvv-u4v.c", + "src/f32-vbinary/gen/f32-vadd-rvv-u4v.c", + "src/f32-vbinary/gen/f32-vaddc-rvv-u4v.c", + "src/f32-vbinary/gen/f32-vdiv-rvv-u4v.c", + "src/f32-vbinary/gen/f32-vdivc-rvv-u4v.c", "src/f32-vbinary/gen/f32-vmax-rvv-u4v.c", "src/f32-vbinary/gen/f32-vmaxc-rvv-u4v.c", "src/f32-vbinary/gen/f32-vmin-rvv-u4v.c", "src/f32-vbinary/gen/f32-vminc-rvv-u4v.c", - "src/f32-vbinary/gen/f32-vmul-minmax-rvv-u4v.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-rvv-u4v.c", + "src/f32-vbinary/gen/f32-vmul-rvv-u4v.c", 
"src/f32-vbinary/gen/f32-vmulc-rvv-u4v.c", - "src/f32-vbinary/gen/f32-vmulc-rvv-u8v.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-rvv-u4v.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-rvv-u4v.c", + "src/f32-vbinary/gen/f32-vrdivc-rvv-u4v.c", + "src/f32-vbinary/gen/f32-vrsubc-rvv-u4v.c", "src/f32-vbinary/gen/f32-vsqrdiff-rvv-u4v.c", "src/f32-vbinary/gen/f32-vsqrdiffc-rvv-u4v.c", - "src/f32-vbinary/gen/f32-vsub-minmax-rvv-u4v.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-rvv-u4v.c", + "src/f32-vbinary/gen/f32-vsub-rvv-u4v.c", + "src/f32-vbinary/gen/f32-vsubc-rvv-u4v.c", "src/f32-vclamp/gen/f32-vclamp-rvv-u1v.c", "src/f32-vclamp/gen/f32-vclamp-rvv-u2v.c", "src/f32-vclamp/gen/f32-vclamp-rvv-u4v.c", diff --git a/gen/scalar_microkernels.bzl b/gen/scalar_microkernels.bzl index ed08da2bc7a..d0858ca1f9d 100644 --- a/gen/scalar_microkernels.bzl +++ b/gen/scalar_microkernels.bzl @@ -85,22 +85,22 @@ PROD_SCALAR_MICROKERNEL_SRCS = [ "src/f32-spmm/gen/f32-spmm-8x1-minmax-scalar.c", "src/f32-spmm/gen/f32-spmm-8x2-minmax-scalar.c", "src/f32-spmm/gen/f32-spmm-8x4-minmax-scalar.c", - "src/f32-vbinary/gen/f32-vadd-minmax-scalar-u8.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u8.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u2.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u2.c", + "src/f32-vbinary/gen/f32-vadd-scalar-u8.c", + "src/f32-vbinary/gen/f32-vaddc-scalar-u8.c", + "src/f32-vbinary/gen/f32-vdiv-scalar-u2.c", + "src/f32-vbinary/gen/f32-vdivc-scalar-u2.c", "src/f32-vbinary/gen/f32-vmax-scalar-u8.c", "src/f32-vbinary/gen/f32-vmaxc-scalar-u8.c", "src/f32-vbinary/gen/f32-vmin-scalar-u8.c", "src/f32-vbinary/gen/f32-vminc-scalar-u8.c", - "src/f32-vbinary/gen/f32-vmul-minmax-scalar-u8.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u8.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u2.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u8.c", + "src/f32-vbinary/gen/f32-vmul-scalar-u8.c", + "src/f32-vbinary/gen/f32-vmulc-scalar-u8.c", + 
"src/f32-vbinary/gen/f32-vrdivc-scalar-u2.c", + "src/f32-vbinary/gen/f32-vrsubc-scalar-u8.c", "src/f32-vbinary/gen/f32-vsqrdiff-scalar-u8.c", "src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u8.c", - "src/f32-vbinary/gen/f32-vsub-minmax-scalar-u8.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u8.c", + "src/f32-vbinary/gen/f32-vsub-scalar-u8.c", + "src/f32-vbinary/gen/f32-vsubc-scalar-u8.c", "src/f32-vclamp/gen/f32-vclamp-scalar-u4.c", "src/f32-vcmul/gen/f32-vcmul-scalar-u4.c", "src/f32-vcopysign/gen/f32-vcopysign-scalar.c", @@ -452,32 +452,16 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/f32-spmm/gen/f32-spmm-4x1-minmax-scalar-pipelined.c", "src/f32-spmm/gen/f32-spmm-4x1-minmax-scalar.c", "src/f32-spmm/gen/f32-spmm-8x1-minmax-scalar-pipelined.c", - "src/f32-vbinary/gen/f32-vadd-minmax-scalar-u1.c", - "src/f32-vbinary/gen/f32-vadd-minmax-scalar-u2.c", - "src/f32-vbinary/gen/f32-vadd-minmax-scalar-u4.c", "src/f32-vbinary/gen/f32-vadd-scalar-u1.c", "src/f32-vbinary/gen/f32-vadd-scalar-u2.c", "src/f32-vbinary/gen/f32-vadd-scalar-u4.c", - "src/f32-vbinary/gen/f32-vadd-scalar-u8.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u1.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u2.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u4.c", "src/f32-vbinary/gen/f32-vaddc-scalar-u1.c", "src/f32-vbinary/gen/f32-vaddc-scalar-u2.c", "src/f32-vbinary/gen/f32-vaddc-scalar-u4.c", - "src/f32-vbinary/gen/f32-vaddc-scalar-u8.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u1.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u4.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u8.c", "src/f32-vbinary/gen/f32-vdiv-scalar-u1.c", - "src/f32-vbinary/gen/f32-vdiv-scalar-u2.c", "src/f32-vbinary/gen/f32-vdiv-scalar-u4.c", "src/f32-vbinary/gen/f32-vdiv-scalar-u8.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u1.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u4.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u8.c", "src/f32-vbinary/gen/f32-vdivc-scalar-u1.c", - 
"src/f32-vbinary/gen/f32-vdivc-scalar-u2.c", "src/f32-vbinary/gen/f32-vdivc-scalar-u4.c", "src/f32-vbinary/gen/f32-vdivc-scalar-u8.c", "src/f32-vbinary/gen/f32-vmax-scalar-u1.c", @@ -492,20 +476,12 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vminc-scalar-u1.c", "src/f32-vbinary/gen/f32-vminc-scalar-u2.c", "src/f32-vbinary/gen/f32-vminc-scalar-u4.c", - "src/f32-vbinary/gen/f32-vmul-minmax-scalar-u1.c", - "src/f32-vbinary/gen/f32-vmul-minmax-scalar-u2.c", - "src/f32-vbinary/gen/f32-vmul-minmax-scalar-u4.c", "src/f32-vbinary/gen/f32-vmul-scalar-u1.c", "src/f32-vbinary/gen/f32-vmul-scalar-u2.c", "src/f32-vbinary/gen/f32-vmul-scalar-u4.c", - "src/f32-vbinary/gen/f32-vmul-scalar-u8.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u1.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u2.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u4.c", "src/f32-vbinary/gen/f32-vmulc-scalar-u1.c", "src/f32-vbinary/gen/f32-vmulc-scalar-u2.c", "src/f32-vbinary/gen/f32-vmulc-scalar-u4.c", - "src/f32-vbinary/gen/f32-vmulc-scalar-u8.c", "src/f32-vbinary/gen/f32-vprelu-scalar-u1.c", "src/f32-vbinary/gen/f32-vprelu-scalar-u2.c", "src/f32-vbinary/gen/f32-vprelu-scalar-u4.c", @@ -514,44 +490,28 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vpreluc-scalar-u2.c", "src/f32-vbinary/gen/f32-vpreluc-scalar-u4.c", "src/f32-vbinary/gen/f32-vpreluc-scalar-u8.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u1.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u4.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u8.c", "src/f32-vbinary/gen/f32-vrdivc-scalar-u1.c", - "src/f32-vbinary/gen/f32-vrdivc-scalar-u2.c", "src/f32-vbinary/gen/f32-vrdivc-scalar-u4.c", "src/f32-vbinary/gen/f32-vrdivc-scalar-u8.c", "src/f32-vbinary/gen/f32-vrpreluc-scalar-u1.c", "src/f32-vbinary/gen/f32-vrpreluc-scalar-u2.c", "src/f32-vbinary/gen/f32-vrpreluc-scalar-u4.c", "src/f32-vbinary/gen/f32-vrpreluc-scalar-u8.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u1.c", - 
"src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u2.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u4.c", "src/f32-vbinary/gen/f32-vrsubc-scalar-u1.c", "src/f32-vbinary/gen/f32-vrsubc-scalar-u2.c", "src/f32-vbinary/gen/f32-vrsubc-scalar-u4.c", - "src/f32-vbinary/gen/f32-vrsubc-scalar-u8.c", "src/f32-vbinary/gen/f32-vsqrdiff-scalar-u1.c", "src/f32-vbinary/gen/f32-vsqrdiff-scalar-u2.c", "src/f32-vbinary/gen/f32-vsqrdiff-scalar-u4.c", "src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u1.c", "src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u2.c", "src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u4.c", - "src/f32-vbinary/gen/f32-vsub-minmax-scalar-u1.c", - "src/f32-vbinary/gen/f32-vsub-minmax-scalar-u2.c", - "src/f32-vbinary/gen/f32-vsub-minmax-scalar-u4.c", "src/f32-vbinary/gen/f32-vsub-scalar-u1.c", "src/f32-vbinary/gen/f32-vsub-scalar-u2.c", "src/f32-vbinary/gen/f32-vsub-scalar-u4.c", - "src/f32-vbinary/gen/f32-vsub-scalar-u8.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u1.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u2.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u4.c", "src/f32-vbinary/gen/f32-vsubc-scalar-u1.c", "src/f32-vbinary/gen/f32-vsubc-scalar-u2.c", "src/f32-vbinary/gen/f32-vsubc-scalar-u4.c", - "src/f32-vbinary/gen/f32-vsubc-scalar-u8.c", "src/f32-vclamp/gen/f32-vclamp-scalar-u1.c", "src/f32-vclamp/gen/f32-vclamp-scalar-u2.c", "src/f32-vcmul/gen/f32-vcmul-scalar-u1.c", diff --git a/gen/sse_microkernels.bzl b/gen/sse_microkernels.bzl index c66da0b104a..9364c3c4623 100644 --- a/gen/sse_microkernels.bzl +++ b/gen/sse_microkernels.bzl @@ -37,22 +37,22 @@ PROD_SSE_MICROKERNEL_SRCS = [ "src/f32-rminmax/gen/f32-rminmax-sse-u16-acc4.c", "src/f32-rsum/gen/f32-rsum-sse-u16-acc4.c", "src/f32-spmm/gen/f32-spmm-32x1-minmax-sse.c", - "src/f32-vbinary/gen/f32-vadd-minmax-sse-u8.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-sse-u8.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-sse-u8.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-sse-u8.c", + "src/f32-vbinary/gen/f32-vadd-sse-u8.c", 
+ "src/f32-vbinary/gen/f32-vaddc-sse-u8.c", + "src/f32-vbinary/gen/f32-vdiv-sse-u8.c", + "src/f32-vbinary/gen/f32-vdivc-sse-u8.c", "src/f32-vbinary/gen/f32-vmax-sse-u8.c", "src/f32-vbinary/gen/f32-vmaxc-sse-u8.c", "src/f32-vbinary/gen/f32-vmin-sse-u8.c", "src/f32-vbinary/gen/f32-vminc-sse-u8.c", - "src/f32-vbinary/gen/f32-vmul-minmax-sse-u8.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-sse-u8.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-sse-u8.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-sse-u8.c", + "src/f32-vbinary/gen/f32-vmul-sse-u8.c", + "src/f32-vbinary/gen/f32-vmulc-sse-u8.c", + "src/f32-vbinary/gen/f32-vrdivc-sse-u8.c", + "src/f32-vbinary/gen/f32-vrsubc-sse-u8.c", "src/f32-vbinary/gen/f32-vsqrdiff-sse-u8.c", "src/f32-vbinary/gen/f32-vsqrdiffc-sse-u8.c", - "src/f32-vbinary/gen/f32-vsub-minmax-sse-u8.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-sse-u8.c", + "src/f32-vbinary/gen/f32-vsub-sse-u8.c", + "src/f32-vbinary/gen/f32-vsubc-sse-u8.c", "src/f32-vclamp/gen/f32-vclamp-sse-u8.c", "src/f32-vcmul/gen/f32-vcmul-sse-u8.c", "src/f32-vhswish/gen/f32-vhswish-sse-u8.c", @@ -201,22 +201,22 @@ NON_PROD_SSE_MICROKERNEL_SRCS = [ "src/f32-spmm/gen/f32-spmm-4x1-minmax-sse.c", "src/f32-spmm/gen/f32-spmm-8x1-minmax-sse.c", "src/f32-spmm/gen/f32-spmm-16x1-minmax-sse.c", - "src/f32-vbinary/gen/f32-vadd-minmax-sse-u4.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-sse-u4.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-sse-u4.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-sse-u4.c", + "src/f32-vbinary/gen/f32-vadd-sse-u4.c", + "src/f32-vbinary/gen/f32-vaddc-sse-u4.c", + "src/f32-vbinary/gen/f32-vdiv-sse-u4.c", + "src/f32-vbinary/gen/f32-vdivc-sse-u4.c", "src/f32-vbinary/gen/f32-vmax-sse-u4.c", "src/f32-vbinary/gen/f32-vmaxc-sse-u4.c", "src/f32-vbinary/gen/f32-vmin-sse-u4.c", "src/f32-vbinary/gen/f32-vminc-sse-u4.c", - "src/f32-vbinary/gen/f32-vmul-minmax-sse-u4.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-sse-u4.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-sse-u4.c", - 
"src/f32-vbinary/gen/f32-vrsubc-minmax-sse-u4.c", + "src/f32-vbinary/gen/f32-vmul-sse-u4.c", + "src/f32-vbinary/gen/f32-vmulc-sse-u4.c", + "src/f32-vbinary/gen/f32-vrdivc-sse-u4.c", + "src/f32-vbinary/gen/f32-vrsubc-sse-u4.c", "src/f32-vbinary/gen/f32-vsqrdiff-sse-u4.c", "src/f32-vbinary/gen/f32-vsqrdiffc-sse-u4.c", - "src/f32-vbinary/gen/f32-vsub-minmax-sse-u4.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-sse-u4.c", + "src/f32-vbinary/gen/f32-vsub-sse-u4.c", + "src/f32-vbinary/gen/f32-vsubc-sse-u4.c", "src/f32-vclamp/gen/f32-vclamp-sse-u4.c", "src/f32-vcmul/gen/f32-vcmul-sse-u4.c", "src/f32-vcmul/gen/f32-vcmul-sse-u12.c", diff --git a/gen/wasm_microkernels.bzl b/gen/wasm_microkernels.bzl index 707f4f30af3..79f5d620c30 100644 --- a/gen/wasm_microkernels.bzl +++ b/gen/wasm_microkernels.bzl @@ -36,20 +36,20 @@ PROD_WASM_MICROKERNEL_SRCS = [ "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u4.c", "src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u4.c", "src/f32-rminmax/gen/f32-rminmax-wasm-u4-acc4.c", - "src/f32-vbinary/gen/f32-vadd-minmax-wasm-u8.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u8.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u8.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u8.c", + "src/f32-vbinary/gen/f32-vadd-wasm-u8.c", + "src/f32-vbinary/gen/f32-vaddc-wasm-u8.c", + "src/f32-vbinary/gen/f32-vdiv-wasm-u8.c", + "src/f32-vbinary/gen/f32-vdivc-wasm-u8.c", "src/f32-vbinary/gen/f32-vmax-wasm-u8.c", "src/f32-vbinary/gen/f32-vmaxc-wasm-u8.c", "src/f32-vbinary/gen/f32-vmin-wasm-u8.c", "src/f32-vbinary/gen/f32-vminc-wasm-u8.c", - "src/f32-vbinary/gen/f32-vmul-minmax-wasm-u8.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u8.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u8.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u8.c", - "src/f32-vbinary/gen/f32-vsub-minmax-wasm-u8.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u8.c", + "src/f32-vbinary/gen/f32-vmul-wasm-u8.c", + "src/f32-vbinary/gen/f32-vmulc-wasm-u8.c", + 
"src/f32-vbinary/gen/f32-vrdivc-wasm-u8.c", + "src/f32-vbinary/gen/f32-vrsubc-wasm-u8.c", + "src/f32-vbinary/gen/f32-vsub-wasm-u8.c", + "src/f32-vbinary/gen/f32-vsubc-wasm-u8.c", "src/f32-vclamp/gen/f32-vclamp-wasm-u4.c", "src/f32-velu/gen/f32-velu-wasm-rr2-p6-u6.c", "src/f32-vhswish/gen/f32-vhswish-wasm-u4.c", @@ -136,18 +136,18 @@ NON_PROD_WASM_MICROKERNEL_SRCS = [ "src/f32-rminmax/gen/f32-rminmax-wasm-u2-acc2.c", "src/f32-rminmax/gen/f32-rminmax-wasm-u3-acc3.c", "src/f32-rminmax/gen/f32-rminmax-wasm-u4-acc2.c", - "src/f32-vbinary/gen/f32-vadd-minmax-wasm-u1.c", - "src/f32-vbinary/gen/f32-vadd-minmax-wasm-u2.c", - "src/f32-vbinary/gen/f32-vadd-minmax-wasm-u4.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u1.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u2.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u4.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u1.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u2.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u4.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u1.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u2.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u4.c", + "src/f32-vbinary/gen/f32-vadd-wasm-u1.c", + "src/f32-vbinary/gen/f32-vadd-wasm-u2.c", + "src/f32-vbinary/gen/f32-vadd-wasm-u4.c", + "src/f32-vbinary/gen/f32-vaddc-wasm-u1.c", + "src/f32-vbinary/gen/f32-vaddc-wasm-u2.c", + "src/f32-vbinary/gen/f32-vaddc-wasm-u4.c", + "src/f32-vbinary/gen/f32-vdiv-wasm-u1.c", + "src/f32-vbinary/gen/f32-vdiv-wasm-u2.c", + "src/f32-vbinary/gen/f32-vdiv-wasm-u4.c", + "src/f32-vbinary/gen/f32-vdivc-wasm-u1.c", + "src/f32-vbinary/gen/f32-vdivc-wasm-u2.c", + "src/f32-vbinary/gen/f32-vdivc-wasm-u4.c", "src/f32-vbinary/gen/f32-vmax-wasm-u1.c", "src/f32-vbinary/gen/f32-vmax-wasm-u2.c", "src/f32-vbinary/gen/f32-vmax-wasm-u4.c", @@ -160,12 +160,12 @@ NON_PROD_WASM_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vminc-wasm-u1.c", "src/f32-vbinary/gen/f32-vminc-wasm-u2.c", "src/f32-vbinary/gen/f32-vminc-wasm-u4.c", - 
"src/f32-vbinary/gen/f32-vmul-minmax-wasm-u1.c", - "src/f32-vbinary/gen/f32-vmul-minmax-wasm-u2.c", - "src/f32-vbinary/gen/f32-vmul-minmax-wasm-u4.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u1.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u2.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u4.c", + "src/f32-vbinary/gen/f32-vmul-wasm-u1.c", + "src/f32-vbinary/gen/f32-vmul-wasm-u2.c", + "src/f32-vbinary/gen/f32-vmul-wasm-u4.c", + "src/f32-vbinary/gen/f32-vmulc-wasm-u1.c", + "src/f32-vbinary/gen/f32-vmulc-wasm-u2.c", + "src/f32-vbinary/gen/f32-vmulc-wasm-u4.c", "src/f32-vbinary/gen/f32-vprelu-wasm-u1.c", "src/f32-vbinary/gen/f32-vprelu-wasm-u2.c", "src/f32-vbinary/gen/f32-vprelu-wasm-u4.c", @@ -174,22 +174,22 @@ NON_PROD_WASM_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vpreluc-wasm-u2.c", "src/f32-vbinary/gen/f32-vpreluc-wasm-u4.c", "src/f32-vbinary/gen/f32-vpreluc-wasm-u8.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u1.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u2.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u4.c", + "src/f32-vbinary/gen/f32-vrdivc-wasm-u1.c", + "src/f32-vbinary/gen/f32-vrdivc-wasm-u2.c", + "src/f32-vbinary/gen/f32-vrdivc-wasm-u4.c", "src/f32-vbinary/gen/f32-vrpreluc-wasm-u1.c", "src/f32-vbinary/gen/f32-vrpreluc-wasm-u2.c", "src/f32-vbinary/gen/f32-vrpreluc-wasm-u4.c", "src/f32-vbinary/gen/f32-vrpreluc-wasm-u8.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u1.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u2.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u4.c", - "src/f32-vbinary/gen/f32-vsub-minmax-wasm-u1.c", - "src/f32-vbinary/gen/f32-vsub-minmax-wasm-u2.c", - "src/f32-vbinary/gen/f32-vsub-minmax-wasm-u4.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u1.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u2.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u4.c", + "src/f32-vbinary/gen/f32-vrsubc-wasm-u1.c", + "src/f32-vbinary/gen/f32-vrsubc-wasm-u2.c", + "src/f32-vbinary/gen/f32-vrsubc-wasm-u4.c", + 
"src/f32-vbinary/gen/f32-vsub-wasm-u1.c", + "src/f32-vbinary/gen/f32-vsub-wasm-u2.c", + "src/f32-vbinary/gen/f32-vsub-wasm-u4.c", + "src/f32-vbinary/gen/f32-vsubc-wasm-u1.c", + "src/f32-vbinary/gen/f32-vsubc-wasm-u2.c", + "src/f32-vbinary/gen/f32-vsubc-wasm-u4.c", "src/f32-vclamp/gen/f32-vclamp-wasm-u1.c", "src/f32-vclamp/gen/f32-vclamp-wasm-u2.c", "src/f32-velu/gen/f32-velu-wasm-rr2-lut16-p3-u1.c", diff --git a/gen/wasmsimd_microkernels.bzl b/gen/wasmsimd_microkernels.bzl index 6d5e623431f..4efad9dce7f 100644 --- a/gen/wasmsimd_microkernels.bzl +++ b/gen/wasmsimd_microkernels.bzl @@ -117,17 +117,9 @@ PROD_WASMSIMD_MICROKERNEL_SRCS = [ "src/f32-rsum/gen/f32-rsum-wasmsimd-u16-acc4.c", "src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmsimd-arm.c", "src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmsimd-x86.c", - "src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u16.c", - "src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u16.c", "src/f32-vbinary/gen/f32-vadd-wasmsimd-u16.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u16.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u16.c", "src/f32-vbinary/gen/f32-vaddc-wasmsimd-u16.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u16.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u16.c", "src/f32-vbinary/gen/f32-vdiv-wasmsimd-u16.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u16.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u16.c", "src/f32-vbinary/gen/f32-vdivc-wasmsimd-u16.c", "src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u16.c", "src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-u16.c", @@ -137,25 +129,13 @@ PROD_WASMSIMD_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-u16.c", "src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u16.c", "src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u16.c", - "src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u16.c", - "src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u16.c", "src/f32-vbinary/gen/f32-vmul-wasmsimd-u16.c", - 
"src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u16.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u16.c", "src/f32-vbinary/gen/f32-vmulc-wasmsimd-u16.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u16.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u16.c", "src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u16.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u16.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u16.c", "src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u16.c", "src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u16.c", "src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u16.c", - "src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u16.c", - "src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u16.c", "src/f32-vbinary/gen/f32-vsub-wasmsimd-u16.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u16.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u16.c", "src/f32-vbinary/gen/f32-vsubc-wasmsimd-u16.c", "src/f32-vclamp/gen/f32-vclamp-wasmsimd-arm-u8.c", "src/f32-vclamp/gen/f32-vclamp-wasmsimd-x86-u8.c", @@ -748,28 +728,12 @@ NON_PROD_WASMSIMD_MICROKERNEL_SRCS = [ "src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmsimd-x86-pipelined.c", "src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmsimd-x86-x2.c", "src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmsimd-x86-x4.c", - "src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u4.c", - "src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u8.c", - "src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u4.c", - "src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u8.c", "src/f32-vbinary/gen/f32-vadd-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vadd-wasmsimd-u8.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u4.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u8.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u4.c", - "src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u8.c", "src/f32-vbinary/gen/f32-vaddc-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vaddc-wasmsimd-u8.c", - 
"src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u4.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u8.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u4.c", - "src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u8.c", "src/f32-vbinary/gen/f32-vdiv-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vdiv-wasmsimd-u8.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u4.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u8.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u4.c", - "src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u8.c", "src/f32-vbinary/gen/f32-vdivc-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vdivc-wasmsimd-u8.c", "src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u4.c", @@ -788,16 +752,8 @@ NON_PROD_WASMSIMD_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u8.c", "src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u4.c", "src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u8.c", - "src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u4.c", - "src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u8.c", - "src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u4.c", - "src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u8.c", "src/f32-vbinary/gen/f32-vmul-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vmul-wasmsimd-u8.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u4.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u8.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u4.c", - "src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u8.c", "src/f32-vbinary/gen/f32-vmulc-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vmulc-wasmsimd-u8.c", "src/f32-vbinary/gen/f32-vprelu-wasmsimd-u4.c", @@ -806,35 +762,19 @@ NON_PROD_WASMSIMD_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u8.c", "src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u16.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u4.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u8.c", - 
"src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u4.c", - "src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u8.c", "src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u8.c", "src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u8.c", "src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u16.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u4.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u8.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u4.c", - "src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u8.c", "src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u8.c", "src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u8.c", "src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u8.c", - "src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u4.c", - "src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u8.c", - "src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u4.c", - "src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u8.c", "src/f32-vbinary/gen/f32-vsub-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vsub-wasmsimd-u8.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u4.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u8.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u4.c", - "src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u8.c", "src/f32-vbinary/gen/f32-vsubc-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vsubc-wasmsimd-u8.c", "src/f32-vclamp/gen/f32-vclamp-wasmsimd-arm-u4.c", diff --git a/include/xnnpack.h b/include/xnnpack.h index 7af01be1224..efeba4dc533 100644 --- a/include/xnnpack.h +++ b/include/xnnpack.h @@ -5134,8 +5134,6 @@ enum xnn_status xnn_run_minimum_nd_f32( pthreadpool_t threadpool); enum xnn_status xnn_create_multiply_nd_f16( - float output_min, - float output_max, uint32_t flags, xnn_operator_t* multiply_op_out); @@ -5154,8 
+5152,6 @@ enum xnn_status xnn_setup_multiply_nd_f16( void* output); enum xnn_status xnn_create_multiply_nd_f32( - float output_min, - float output_max, uint32_t flags, xnn_operator_t* multiply_op_out); @@ -5181,8 +5177,6 @@ enum xnn_status xnn_run_multiply_nd_f32( const float* input1, const float* input2, float* output, - float output_min, - float output_max, uint32_t flags, pthreadpool_t threadpool); @@ -5193,8 +5187,6 @@ enum xnn_status xnn_create_multiply_nd_qs8( float input2_scale, int8_t output_zero_point, float output_scale, - int8_t output_min, - int8_t output_max, uint32_t flags, xnn_operator_t* multiply_op_out); @@ -5226,8 +5218,6 @@ enum xnn_status xnn_run_multiply_nd_qs8( int8_t* output, int8_t output_zero_point, float output_scale, - int8_t output_min, - int8_t output_max, uint32_t flags, pthreadpool_t threadpool); @@ -5238,8 +5228,6 @@ enum xnn_status xnn_create_multiply_nd_qu8( float input2_scale, uint8_t output_zero_point, float output_scale, - uint8_t output_min, - uint8_t output_max, uint32_t flags, xnn_operator_t* multiply_op_out); @@ -5271,8 +5259,6 @@ enum xnn_status xnn_run_multiply_nd_qu8( uint8_t* output, uint8_t output_zero_point, float output_scale, - uint8_t output_min, - uint8_t output_max, uint32_t flags, pthreadpool_t threadpool); diff --git a/scripts/generate-f16-vbinary.sh b/scripts/generate-f16-vbinary.sh index 90ec227f946..5f29aa15fa7 100755 --- a/scripts/generate-f16-vbinary.sh +++ b/scripts/generate-f16-vbinary.sh @@ -5,177 +5,177 @@ # LICENSE file in the root directory of this source tree. 
################################### ARM NEON ################################## -tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=ADD -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vadd-minmax-neonfp16arith-u8.c & -tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=ADD -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vadd-minmax-neonfp16arith-u16.c & -tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=DIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vdiv-minmax-aarch64-neonfp16arith-u8.c & -tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=DIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vdiv-minmax-aarch64-neonfp16arith-u16.c & -tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=MAX -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmax-neonfp16arith-u8.c & -tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=MAX -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmax-neonfp16arith-u16.c & -tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=MIN -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmin-neonfp16arith-u8.c & -tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=MIN -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmin-neonfp16arith-u16.c & -tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=MUL -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vmul-minmax-neonfp16arith-u8.c & -tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=MUL -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vmul-minmax-neonfp16arith-u16.c & -tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u8.c & -tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u16.c & 
-tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=SUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vsub-minmax-neonfp16arith-u8.c & -tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=SUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vsub-minmax-neonfp16arith-u16.c & -tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=PRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u8.c & -tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=PRELU -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u16.c & +tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=ADD -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vadd-neonfp16arith-u8.c & +tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=ADD -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vadd-neonfp16arith-u16.c & +tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=DIV -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vdiv-aarch64-neonfp16arith-u8.c & +tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=DIV -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vdiv-aarch64-neonfp16arith-u16.c & +tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=MAX -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vmax-neonfp16arith-u8.c & +tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=MAX -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vmax-neonfp16arith-u16.c & +tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=MIN -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vmin-neonfp16arith-u8.c & +tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=MIN -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vmin-neonfp16arith-u16.c & +tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=MUL -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vmul-neonfp16arith-u8.c & +tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=MUL -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vmul-neonfp16arith-u16.c & +tools/xngen 
src/f16-vbinary/vop-neonfp16arith.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u8.c & +tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u16.c & +tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=SUB -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vsub-neonfp16arith-u8.c & +tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=SUB -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vsub-neonfp16arith-u16.c & +tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=PRELU -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u8.c & +tools/xngen src/f16-vbinary/vop-neonfp16arith.c.in -D OP=PRELU -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u16.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=ADD -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vaddc-minmax-neonfp16arith-u8.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=ADD -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vaddc-minmax-neonfp16arith-u16.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=DIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vdivc-minmax-aarch64-neonfp16arith-u8.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=DIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vdivc-minmax-aarch64-neonfp16arith-u16.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=RDIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vrdivc-minmax-aarch64-neonfp16arith-u8.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=RDIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vrdivc-minmax-aarch64-neonfp16arith-u16.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=MAX -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmaxc-neonfp16arith-u8.c & -tools/xngen 
src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=MAX -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmaxc-neonfp16arith-u16.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=MIN -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vminc-neonfp16arith-u8.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=MIN -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vminc-neonfp16arith-u16.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=MUL -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vmulc-minmax-neonfp16arith-u8.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=MUL -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vmulc-minmax-neonfp16arith-u16.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u8.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u16.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=SUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vsubc-minmax-neonfp16arith-u8.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=SUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vsubc-minmax-neonfp16arith-u16.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=RSUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vrsubc-minmax-neonfp16arith-u8.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=RSUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vrsubc-minmax-neonfp16arith-u16.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=PRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u8.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=PRELU -D BATCH_TILE=16 -D 
ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u16.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=RPRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u8.c & -tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=RPRELU -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u16.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=ADD -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vaddc-neonfp16arith-u8.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=ADD -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vaddc-neonfp16arith-u16.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=DIV -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vdivc-aarch64-neonfp16arith-u8.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=DIV -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vdivc-aarch64-neonfp16arith-u16.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=RDIV -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vrdivc-aarch64-neonfp16arith-u8.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=RDIV -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vrdivc-aarch64-neonfp16arith-u16.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=MAX -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vmaxc-neonfp16arith-u8.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=MAX -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vmaxc-neonfp16arith-u16.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=MIN -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vminc-neonfp16arith-u8.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=MIN -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vminc-neonfp16arith-u16.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=MUL -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vmulc-neonfp16arith-u8.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=MUL -D BATCH_TILE=16 -o 
src/f16-vbinary/gen/f16-vmulc-neonfp16arith-u16.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u8.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u16.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=SUB -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vsubc-neonfp16arith-u8.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=SUB -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vsubc-neonfp16arith-u16.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=RSUB -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vrsubc-neonfp16arith-u8.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=RSUB -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vrsubc-neonfp16arith-u16.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=PRELU -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u8.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=PRELU -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u16.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=RPRELU -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u8.c & +tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=RPRELU -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u16.c & ################################### ARM FP16 ################################## -tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=ADD -D BATCH_TILE=1 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-u1.c & -tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=ADD -D BATCH_TILE=2 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-u2.c & -tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=ADD -D BATCH_TILE=4 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-u4.c & -tools/xngen 
src/f16-vbinary/vop-fp16arith.c.in -D OP=DIV -D BATCH_TILE=1 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-u1.c & -tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=DIV -D BATCH_TILE=2 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-u2.c & -tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=DIV -D BATCH_TILE=4 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-u4.c & -tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=MAX -D BATCH_TILE=1 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmax-fp16arith-u1.c & -tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=MAX -D BATCH_TILE=2 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmax-fp16arith-u2.c & -tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=MAX -D BATCH_TILE=4 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmax-fp16arith-u4.c & -tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=MIN -D BATCH_TILE=1 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmin-fp16arith-u1.c & -tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=MIN -D BATCH_TILE=2 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmin-fp16arith-u2.c & -tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=MIN -D BATCH_TILE=4 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmin-fp16arith-u4.c & -tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=MUL -D BATCH_TILE=1 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-u1.c & -tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=MUL -D BATCH_TILE=2 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-u2.c & -tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=MUL -D BATCH_TILE=4 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-u4.c & -tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=SQRDIFF -D BATCH_TILE=1 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u1.c & -tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=SQRDIFF -D BATCH_TILE=2 -D 
ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u2.c & -tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u4.c & -tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=SUB -D BATCH_TILE=1 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-u1.c & -tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=SUB -D BATCH_TILE=2 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-u2.c & -tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=SUB -D BATCH_TILE=4 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-u4.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=ADD -D BATCH_TILE=1 -o src/f16-vbinary/gen/f16-vadd-fp16arith-u1.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=ADD -D BATCH_TILE=2 -o src/f16-vbinary/gen/f16-vadd-fp16arith-u2.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=ADD -D BATCH_TILE=4 -o src/f16-vbinary/gen/f16-vadd-fp16arith-u4.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=DIV -D BATCH_TILE=1 -o src/f16-vbinary/gen/f16-vdiv-fp16arith-u1.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=DIV -D BATCH_TILE=2 -o src/f16-vbinary/gen/f16-vdiv-fp16arith-u2.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=DIV -D BATCH_TILE=4 -o src/f16-vbinary/gen/f16-vdiv-fp16arith-u4.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=MAX -D BATCH_TILE=1 -o src/f16-vbinary/gen/f16-vmax-fp16arith-u1.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=MAX -D BATCH_TILE=2 -o src/f16-vbinary/gen/f16-vmax-fp16arith-u2.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=MAX -D BATCH_TILE=4 -o src/f16-vbinary/gen/f16-vmax-fp16arith-u4.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=MIN -D BATCH_TILE=1 -o src/f16-vbinary/gen/f16-vmin-fp16arith-u1.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=MIN -D BATCH_TILE=2 -o 
src/f16-vbinary/gen/f16-vmin-fp16arith-u2.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=MIN -D BATCH_TILE=4 -o src/f16-vbinary/gen/f16-vmin-fp16arith-u4.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=MUL -D BATCH_TILE=1 -o src/f16-vbinary/gen/f16-vmul-fp16arith-u1.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=MUL -D BATCH_TILE=2 -o src/f16-vbinary/gen/f16-vmul-fp16arith-u2.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=MUL -D BATCH_TILE=4 -o src/f16-vbinary/gen/f16-vmul-fp16arith-u4.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=SQRDIFF -D BATCH_TILE=1 -o src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u1.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=SQRDIFF -D BATCH_TILE=2 -o src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u2.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -o src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u4.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=SUB -D BATCH_TILE=1 -o src/f16-vbinary/gen/f16-vsub-fp16arith-u1.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=SUB -D BATCH_TILE=2 -o src/f16-vbinary/gen/f16-vsub-fp16arith-u2.c & +tools/xngen src/f16-vbinary/vop-fp16arith.c.in -D OP=SUB -D BATCH_TILE=4 -o src/f16-vbinary/gen/f16-vsub-fp16arith-u4.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=ADD -D BATCH_TILE=1 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-u1.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=ADD -D BATCH_TILE=2 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-u2.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=ADD -D BATCH_TILE=4 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-u4.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=DIV -D BATCH_TILE=1 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-u1.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=DIV -D BATCH_TILE=2 -D 
ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-u2.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=DIV -D BATCH_TILE=4 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-u4.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=RDIV -D BATCH_TILE=1 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-u1.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=RDIV -D BATCH_TILE=2 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-u2.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=RDIV -D BATCH_TILE=4 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-u4.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=MAX -D BATCH_TILE=1 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmaxc-fp16arith-u1.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=MAX -D BATCH_TILE=2 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmaxc-fp16arith-u2.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=MAX -D BATCH_TILE=4 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmaxc-fp16arith-u4.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=MIN -D BATCH_TILE=1 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vminc-fp16arith-u1.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=MIN -D BATCH_TILE=2 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vminc-fp16arith-u2.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=MIN -D BATCH_TILE=4 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vminc-fp16arith-u4.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=MUL -D BATCH_TILE=1 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-u1.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=MUL -D BATCH_TILE=2 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-u2.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=MUL -D BATCH_TILE=4 -D ACTIVATION=MINMAX -o 
src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-u4.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=SQRDIFF -D BATCH_TILE=1 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u1.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=SQRDIFF -D BATCH_TILE=2 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u2.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u4.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=SUB -D BATCH_TILE=1 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-u1.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=SUB -D BATCH_TILE=2 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-u2.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=SUB -D BATCH_TILE=4 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-u4.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=RSUB -D BATCH_TILE=1 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-u1.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=RSUB -D BATCH_TILE=2 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-u2.c & -tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=RSUB -D BATCH_TILE=4 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-u4.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=ADD -D BATCH_TILE=1 -o src/f16-vbinary/gen/f16-vaddc-fp16arith-u1.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=ADD -D BATCH_TILE=2 -o src/f16-vbinary/gen/f16-vaddc-fp16arith-u2.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=ADD -D BATCH_TILE=4 -o src/f16-vbinary/gen/f16-vaddc-fp16arith-u4.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=DIV -D BATCH_TILE=1 -o src/f16-vbinary/gen/f16-vdivc-fp16arith-u1.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=DIV 
-D BATCH_TILE=2 -o src/f16-vbinary/gen/f16-vdivc-fp16arith-u2.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=DIV -D BATCH_TILE=4 -o src/f16-vbinary/gen/f16-vdivc-fp16arith-u4.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=RDIV -D BATCH_TILE=1 -o src/f16-vbinary/gen/f16-vrdivc-fp16arith-u1.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=RDIV -D BATCH_TILE=2 -o src/f16-vbinary/gen/f16-vrdivc-fp16arith-u2.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=RDIV -D BATCH_TILE=4 -o src/f16-vbinary/gen/f16-vrdivc-fp16arith-u4.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=MAX -D BATCH_TILE=1 -o src/f16-vbinary/gen/f16-vmaxc-fp16arith-u1.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=MAX -D BATCH_TILE=2 -o src/f16-vbinary/gen/f16-vmaxc-fp16arith-u2.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=MAX -D BATCH_TILE=4 -o src/f16-vbinary/gen/f16-vmaxc-fp16arith-u4.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=MIN -D BATCH_TILE=1 -o src/f16-vbinary/gen/f16-vminc-fp16arith-u1.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=MIN -D BATCH_TILE=2 -o src/f16-vbinary/gen/f16-vminc-fp16arith-u2.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=MIN -D BATCH_TILE=4 -o src/f16-vbinary/gen/f16-vminc-fp16arith-u4.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=MUL -D BATCH_TILE=1 -o src/f16-vbinary/gen/f16-vmulc-fp16arith-u1.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=MUL -D BATCH_TILE=2 -o src/f16-vbinary/gen/f16-vmulc-fp16arith-u2.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=MUL -D BATCH_TILE=4 -o src/f16-vbinary/gen/f16-vmulc-fp16arith-u4.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=SQRDIFF -D BATCH_TILE=1 -o src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u1.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=SQRDIFF -D BATCH_TILE=2 -o src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u2.c & +tools/xngen 
src/f16-vbinary/vopc-fp16arith.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -o src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u4.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=SUB -D BATCH_TILE=1 -o src/f16-vbinary/gen/f16-vsubc-fp16arith-u1.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=SUB -D BATCH_TILE=2 -o src/f16-vbinary/gen/f16-vsubc-fp16arith-u2.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=SUB -D BATCH_TILE=4 -o src/f16-vbinary/gen/f16-vsubc-fp16arith-u4.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=RSUB -D BATCH_TILE=1 -o src/f16-vbinary/gen/f16-vrsubc-fp16arith-u1.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=RSUB -D BATCH_TILE=2 -o src/f16-vbinary/gen/f16-vrsubc-fp16arith-u2.c & +tools/xngen src/f16-vbinary/vopc-fp16arith.c.in -D OP=RSUB -D BATCH_TILE=4 -o src/f16-vbinary/gen/f16-vrsubc-fp16arith-u4.c & ################################### x86 AVX512 FP16 ################################## -tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=ADD -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vadd-minmax-avx512fp16-u32.c & -tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=ADD -D BATCH_TILE=64 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vadd-minmax-avx512fp16-u64.c & -tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=DIV -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vdiv-minmax-avx512fp16-u32.c & -tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=DIV -D BATCH_TILE=64 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vdiv-minmax-avx512fp16-u64.c & -tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=MAX -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmax-avx512fp16-u32.c & -tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=MAX -D BATCH_TILE=64 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmax-avx512fp16-u64.c & -tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=MIN -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o 
src/f16-vbinary/gen/f16-vmin-avx512fp16-u32.c & -tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=MIN -D BATCH_TILE=64 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmin-avx512fp16-u64.c & -tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=MUL -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vmul-minmax-avx512fp16-u32.c & -tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=MUL -D BATCH_TILE=64 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vmul-minmax-avx512fp16-u64.c & -tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=SQRDIFF -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u32.c & -tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=SQRDIFF -D BATCH_TILE=64 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u64.c & -tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=SUB -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vsub-minmax-avx512fp16-u32.c & -tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=SUB -D BATCH_TILE=64 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vsub-minmax-avx512fp16-u64.c & -tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=PRELU -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vprelu-avx512fp16-u32.c & -tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=PRELU -D BATCH_TILE=64 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vprelu-avx512fp16-u64.c & +tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=ADD -D BATCH_TILE=32 -o src/f16-vbinary/gen/f16-vadd-avx512fp16-u32.c & +tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=ADD -D BATCH_TILE=64 -o src/f16-vbinary/gen/f16-vadd-avx512fp16-u64.c & +tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=DIV -D BATCH_TILE=32 -o src/f16-vbinary/gen/f16-vdiv-avx512fp16-u32.c & +tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=DIV -D BATCH_TILE=64 -o src/f16-vbinary/gen/f16-vdiv-avx512fp16-u64.c & +tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=MAX 
-D BATCH_TILE=32 -o src/f16-vbinary/gen/f16-vmax-avx512fp16-u32.c & +tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=MAX -D BATCH_TILE=64 -o src/f16-vbinary/gen/f16-vmax-avx512fp16-u64.c & +tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=MIN -D BATCH_TILE=32 -o src/f16-vbinary/gen/f16-vmin-avx512fp16-u32.c & +tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=MIN -D BATCH_TILE=64 -o src/f16-vbinary/gen/f16-vmin-avx512fp16-u64.c & +tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=MUL -D BATCH_TILE=32 -o src/f16-vbinary/gen/f16-vmul-avx512fp16-u32.c & +tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=MUL -D BATCH_TILE=64 -o src/f16-vbinary/gen/f16-vmul-avx512fp16-u64.c & +tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=SQRDIFF -D BATCH_TILE=32 -o src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u32.c & +tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=SQRDIFF -D BATCH_TILE=64 -o src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u64.c & +tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=SUB -D BATCH_TILE=32 -o src/f16-vbinary/gen/f16-vsub-avx512fp16-u32.c & +tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=SUB -D BATCH_TILE=64 -o src/f16-vbinary/gen/f16-vsub-avx512fp16-u64.c & +tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=PRELU -D BATCH_TILE=32 -o src/f16-vbinary/gen/f16-vprelu-avx512fp16-u32.c & +tools/xngen src/f16-vbinary/vop-avx512fp16.c.in -D OP=PRELU -D BATCH_TILE=64 -o src/f16-vbinary/gen/f16-vprelu-avx512fp16-u64.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=ADD -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vaddc-minmax-avx512fp16-u32.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=ADD -D BATCH_TILE=64 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vaddc-minmax-avx512fp16-u64.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=DIV -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vdivc-minmax-avx512fp16-u32.c & -tools/xngen 
src/f16-vbinary/vopc-avx512fp16.c.in -D OP=DIV -D BATCH_TILE=64 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vdivc-minmax-avx512fp16-u64.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=RDIV -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vrdivc-minmax-avx512fp16-u32.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=RDIV -D BATCH_TILE=64 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vrdivc-minmax-avx512fp16-u64.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=MAX -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmaxc-avx512fp16-u32.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=MAX -D BATCH_TILE=64 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmaxc-avx512fp16-u64.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=MIN -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vminc-avx512fp16-u32.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=MIN -D BATCH_TILE=64 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vminc-avx512fp16-u64.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=MUL -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vmulc-minmax-avx512fp16-u32.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=MUL -D BATCH_TILE=64 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vmulc-minmax-avx512fp16-u64.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=SQRDIFF -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u32.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=SQRDIFF -D BATCH_TILE=64 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u64.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=SUB -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vsubc-minmax-avx512fp16-u32.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=SUB -D BATCH_TILE=64 -D ACTIVATION=MINMAX -o 
src/f16-vbinary/gen/f16-vsubc-minmax-avx512fp16-u64.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=RSUB -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vrsubc-minmax-avx512fp16-u32.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=RSUB -D BATCH_TILE=64 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vrsubc-minmax-avx512fp16-u64.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=PRELU -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u32.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=PRELU -D BATCH_TILE=64 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u64.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=RPRELU -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u32.c & -tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=RPRELU -D BATCH_TILE=64 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u64.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=ADD -D BATCH_TILE=32 -o src/f16-vbinary/gen/f16-vaddc-avx512fp16-u32.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=ADD -D BATCH_TILE=64 -o src/f16-vbinary/gen/f16-vaddc-avx512fp16-u64.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=DIV -D BATCH_TILE=32 -o src/f16-vbinary/gen/f16-vdivc-avx512fp16-u32.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=DIV -D BATCH_TILE=64 -o src/f16-vbinary/gen/f16-vdivc-avx512fp16-u64.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=RDIV -D BATCH_TILE=32 -o src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u32.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=RDIV -D BATCH_TILE=64 -o src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u64.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=MAX -D BATCH_TILE=32 -o src/f16-vbinary/gen/f16-vmaxc-avx512fp16-u32.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=MAX -D BATCH_TILE=64 -o 
src/f16-vbinary/gen/f16-vmaxc-avx512fp16-u64.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=MIN -D BATCH_TILE=32 -o src/f16-vbinary/gen/f16-vminc-avx512fp16-u32.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=MIN -D BATCH_TILE=64 -o src/f16-vbinary/gen/f16-vminc-avx512fp16-u64.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=MUL -D BATCH_TILE=32 -o src/f16-vbinary/gen/f16-vmulc-avx512fp16-u32.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=MUL -D BATCH_TILE=64 -o src/f16-vbinary/gen/f16-vmulc-avx512fp16-u64.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=SQRDIFF -D BATCH_TILE=32 -o src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u32.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=SQRDIFF -D BATCH_TILE=64 -o src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u64.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=SUB -D BATCH_TILE=32 -o src/f16-vbinary/gen/f16-vsubc-avx512fp16-u32.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=SUB -D BATCH_TILE=64 -o src/f16-vbinary/gen/f16-vsubc-avx512fp16-u64.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=RSUB -D BATCH_TILE=32 -o src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u32.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=RSUB -D BATCH_TILE=64 -o src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u64.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=PRELU -D BATCH_TILE=32 -o src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u32.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=PRELU -D BATCH_TILE=64 -o src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u64.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=RPRELU -D BATCH_TILE=32 -o src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u32.c & +tools/xngen src/f16-vbinary/vopc-avx512fp16.c.in -D OP=RPRELU -D BATCH_TILE=64 -o src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u64.c & ################################### x86 F16C ################################## -tools/xngen 
src/f16-vbinary/vop-f16c.c.in -D OP=ADD -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vadd-minmax-f16c-u8.c & -tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=ADD -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vadd-minmax-f16c-u16.c & -tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=DIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vdiv-minmax-f16c-u8.c & -tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=DIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vdiv-minmax-f16c-u16.c & -tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=MAX -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmax-f16c-u8.c & -tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=MAX -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmax-f16c-u16.c & -tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=MIN -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmin-f16c-u8.c & -tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=MIN -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmin-f16c-u16.c & -tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=MUL -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vmul-minmax-f16c-u8.c & -tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=MUL -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vmul-minmax-f16c-u16.c & -tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vsqrdiff-f16c-u8.c & -tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vsqrdiff-f16c-u16.c & -tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=SUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vsub-minmax-f16c-u8.c & -tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=SUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vsub-minmax-f16c-u16.c & -tools/xngen src/f16-vbinary/vop-f16c.c.in -D 
OP=PRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vprelu-f16c-u8.c & -tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=PRELU -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vprelu-f16c-u16.c & +tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=ADD -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vadd-f16c-u8.c & +tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=ADD -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vadd-f16c-u16.c & +tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=DIV -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vdiv-f16c-u8.c & +tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=DIV -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vdiv-f16c-u16.c & +tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=MAX -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vmax-f16c-u8.c & +tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=MAX -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vmax-f16c-u16.c & +tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=MIN -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vmin-f16c-u8.c & +tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=MIN -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vmin-f16c-u16.c & +tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=MUL -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vmul-f16c-u8.c & +tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=MUL -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vmul-f16c-u16.c & +tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vsqrdiff-f16c-u8.c & +tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vsqrdiff-f16c-u16.c & +tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=SUB -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vsub-f16c-u8.c & +tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=SUB -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vsub-f16c-u16.c & +tools/xngen src/f16-vbinary/vop-f16c.c.in -D OP=PRELU -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vprelu-f16c-u8.c & +tools/xngen 
src/f16-vbinary/vop-f16c.c.in -D OP=PRELU -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vprelu-f16c-u16.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=ADD -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vaddc-minmax-f16c-u8.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=ADD -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vaddc-minmax-f16c-u16.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=DIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vdivc-minmax-f16c-u8.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=DIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vdivc-minmax-f16c-u16.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=RDIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vrdivc-minmax-f16c-u8.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=RDIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vrdivc-minmax-f16c-u16.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=MAX -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmaxc-f16c-u8.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=MAX -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vmaxc-f16c-u16.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=MIN -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vminc-f16c-u8.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=MIN -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vminc-f16c-u16.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=MUL -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vmulc-minmax-f16c-u8.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=MUL -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vmulc-minmax-f16c-u16.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u8.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in 
-D OP=SQRDIFF -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u16.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=SUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vsubc-minmax-f16c-u8.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=SUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vsubc-minmax-f16c-u16.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=RSUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vrsubc-minmax-f16c-u8.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=RSUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/f16-vrsubc-minmax-f16c-u16.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=PRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vpreluc-f16c-u8.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=PRELU -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vpreluc-f16c-u16.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=RPRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vrpreluc-f16c-u8.c & -tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=RPRELU -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f16-vbinary/gen/f16-vrpreluc-f16c-u16.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=ADD -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vaddc-f16c-u8.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=ADD -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vaddc-f16c-u16.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=DIV -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vdivc-f16c-u8.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=DIV -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vdivc-f16c-u16.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=RDIV -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vrdivc-f16c-u8.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=RDIV -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vrdivc-f16c-u16.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D 
OP=MAX -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vmaxc-f16c-u8.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=MAX -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vmaxc-f16c-u16.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=MIN -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vminc-f16c-u8.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=MIN -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vminc-f16c-u16.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=MUL -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vmulc-f16c-u8.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=MUL -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vmulc-f16c-u16.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u8.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u16.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=SUB -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vsubc-f16c-u8.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=SUB -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vsubc-f16c-u16.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=RSUB -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vrsubc-f16c-u8.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=RSUB -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vrsubc-f16c-u16.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=PRELU -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vpreluc-f16c-u8.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=PRELU -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vpreluc-f16c-u16.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=RPRELU -D BATCH_TILE=8 -o src/f16-vbinary/gen/f16-vrpreluc-f16c-u8.c & +tools/xngen src/f16-vbinary/vopc-f16c.c.in -D OP=RPRELU -D BATCH_TILE=16 -o src/f16-vbinary/gen/f16-vrpreluc-f16c-u16.c & wait diff --git a/scripts/generate-f32-vbinary.sh b/scripts/generate-f32-vbinary.sh index cfdb87c8652..bb4557a552f 100755 --- 
a/scripts/generate-f32-vbinary.sh +++ b/scripts/generate-f32-vbinary.sh @@ -6,587 +6,481 @@ #################################### Scalar ################################### ### Generic C micro-kernels -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vadd-minmax-scalar-u1.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vadd-minmax-scalar-u2.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vadd-minmax-scalar-u4.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vadd-minmax-scalar-u8.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u1.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u2.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u4.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u8.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmul-minmax-scalar-u1.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmul-minmax-scalar-u2.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmul-minmax-scalar-u4.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=8 -D WASM=0 -D 
ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmul-minmax-scalar-u8.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsub-minmax-scalar-u1.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsub-minmax-scalar-u2.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsub-minmax-scalar-u4.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsub-minmax-scalar-u8.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=1 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vadd-scalar-u1.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=2 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vadd-scalar-u2.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=4 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vadd-scalar-u4.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=8 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vadd-scalar-u8.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=1 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vdiv-scalar-u1.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=2 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vdiv-scalar-u2.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=4 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vdiv-scalar-u4.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=8 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vdiv-scalar-u8.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX -D BATCH_TILE=1 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vmax-scalar-u1.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX -D BATCH_TILE=2 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vmax-scalar-u2.c & 
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX -D BATCH_TILE=4 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vmax-scalar-u4.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX -D BATCH_TILE=8 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vmax-scalar-u8.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN -D BATCH_TILE=1 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vmin-scalar-u1.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN -D BATCH_TILE=2 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vmin-scalar-u2.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN -D BATCH_TILE=4 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vmin-scalar-u4.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN -D BATCH_TILE=8 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vmin-scalar-u8.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=1 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vmul-scalar-u1.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=2 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vmul-scalar-u2.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=4 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vmul-scalar-u4.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=8 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vmul-scalar-u8.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=PRELU -D BATCH_TILE=1 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vprelu-scalar-u1.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=PRELU -D BATCH_TILE=2 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vprelu-scalar-u2.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=PRELU -D BATCH_TILE=4 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vprelu-scalar-u4.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=PRELU -D BATCH_TILE=8 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vprelu-scalar-u8.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SQRDIFF -D BATCH_TILE=1 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vsqrdiff-scalar-u1.c & +tools/xngen 
src/f32-vbinary/vop-scalar.c.in -D OP=SQRDIFF -D BATCH_TILE=2 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vsqrdiff-scalar-u2.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vsqrdiff-scalar-u4.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vsqrdiff-scalar-u8.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=1 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vsub-scalar-u1.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=2 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vsub-scalar-u2.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=4 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vsub-scalar-u4.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=8 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vsub-scalar-u8.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vadd-scalar-u1.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vadd-scalar-u2.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vadd-scalar-u4.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vadd-scalar-u8.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vdiv-scalar-u1.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vdiv-scalar-u2.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vdiv-scalar-u4.c & -tools/xngen 
src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vdiv-scalar-u8.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmax-scalar-u1.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmax-scalar-u2.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmax-scalar-u4.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmax-scalar-u8.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmin-scalar-u1.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmin-scalar-u2.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmin-scalar-u4.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmin-scalar-u8.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmul-scalar-u1.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmul-scalar-u2.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmul-scalar-u4.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmul-scalar-u8.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SQRDIFF -D BATCH_TILE=1 -D WASM=0 -D 
ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiff-scalar-u1.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SQRDIFF -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiff-scalar-u2.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiff-scalar-u4.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiff-scalar-u8.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsub-scalar-u1.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsub-scalar-u2.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsub-scalar-u4.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsub-scalar-u8.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=PRELU -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vprelu-scalar-u1.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=PRELU -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vprelu-scalar-u2.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=PRELU -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vprelu-scalar-u4.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=PRELU -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vprelu-scalar-u8.c & - -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=MINMAX 
-o src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV 
-D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u8.c & - -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vaddc-scalar-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vaddc-scalar-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vaddc-scalar-u4.c & -tools/xngen 
src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vaddc-scalar-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vdivc-scalar-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vdivc-scalar-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vdivc-scalar-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vdivc-scalar-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmaxc-scalar-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmaxc-scalar-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmaxc-scalar-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmaxc-scalar-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vminc-scalar-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vminc-scalar-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vminc-scalar-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vminc-scalar-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D 
BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmulc-scalar-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmulc-scalar-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmulc-scalar-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmulc-scalar-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrdivc-scalar-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrdivc-scalar-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrdivc-scalar-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrdivc-scalar-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrsubc-scalar-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrsubc-scalar-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrsubc-scalar-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrsubc-scalar-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SQRDIFF -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SQRDIFF -D BATCH_TILE=2 -D 
WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsubc-scalar-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsubc-scalar-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsubc-scalar-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsubc-scalar-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=PRELU -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vpreluc-scalar-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=PRELU -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vpreluc-scalar-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=PRELU -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vpreluc-scalar-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=PRELU -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vpreluc-scalar-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RPRELU -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrpreluc-scalar-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RPRELU -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrpreluc-scalar-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RPRELU -D BATCH_TILE=4 
-D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrpreluc-scalar-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RPRELU -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrpreluc-scalar-u8.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=1 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vaddc-scalar-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=2 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vaddc-scalar-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=4 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vaddc-scalar-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=8 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vaddc-scalar-u8.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV -D BATCH_TILE=1 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vdivc-scalar-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV -D BATCH_TILE=2 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vdivc-scalar-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV -D BATCH_TILE=4 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vdivc-scalar-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV -D BATCH_TILE=8 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vdivc-scalar-u8.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX -D BATCH_TILE=1 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vmaxc-scalar-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX -D BATCH_TILE=2 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vmaxc-scalar-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX -D BATCH_TILE=4 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vmaxc-scalar-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX -D BATCH_TILE=8 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vmaxc-scalar-u8.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN -D BATCH_TILE=1 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vminc-scalar-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in 
-D OP=MIN -D BATCH_TILE=2 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vminc-scalar-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN -D BATCH_TILE=4 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vminc-scalar-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN -D BATCH_TILE=8 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vminc-scalar-u8.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=1 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vmulc-scalar-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=2 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vmulc-scalar-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=4 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vmulc-scalar-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=8 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vmulc-scalar-u8.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=PRELU -D BATCH_TILE=1 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vpreluc-scalar-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=PRELU -D BATCH_TILE=2 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vpreluc-scalar-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=PRELU -D BATCH_TILE=4 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vpreluc-scalar-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=PRELU -D BATCH_TILE=8 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vpreluc-scalar-u8.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=1 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vrdivc-scalar-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=2 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vrdivc-scalar-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=4 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vrdivc-scalar-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=8 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vrdivc-scalar-u8.c & +tools/xngen 
src/f32-vbinary/vopc-scalar.c.in -D OP=RPRELU -D BATCH_TILE=1 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vrpreluc-scalar-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RPRELU -D BATCH_TILE=2 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vrpreluc-scalar-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RPRELU -D BATCH_TILE=4 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vrpreluc-scalar-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RPRELU -D BATCH_TILE=8 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vrpreluc-scalar-u8.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=1 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vrsubc-scalar-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=2 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vrsubc-scalar-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=4 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vrsubc-scalar-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=8 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vrsubc-scalar-u8.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SQRDIFF -D BATCH_TILE=1 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SQRDIFF -D BATCH_TILE=2 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u8.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=1 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vsubc-scalar-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=2 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vsubc-scalar-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=4 -D WASM=0 -D -o 
src/f32-vbinary/gen/f32-vsubc-scalar-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=8 -D WASM=0 -D -o src/f32-vbinary/gen/f32-vsubc-scalar-u8.c & ### WAsm-specific micro-kernels -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vadd-minmax-wasm-u1.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vadd-minmax-wasm-u2.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vadd-minmax-wasm-u4.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vadd-minmax-wasm-u8.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u1.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u2.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u4.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u8.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmax-wasm-u1.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmax-wasm-u2.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmax-wasm-u4.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=LINEAR -o 
src/f32-vbinary/gen/f32-vmax-wasm-u8.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmin-wasm-u1.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmin-wasm-u2.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmin-wasm-u4.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmin-wasm-u8.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmul-minmax-wasm-u1.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmul-minmax-wasm-u2.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmul-minmax-wasm-u4.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmul-minmax-wasm-u8.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsub-minmax-wasm-u1.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsub-minmax-wasm-u2.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsub-minmax-wasm-u4.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsub-minmax-wasm-u8.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=PRELU -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vprelu-wasm-u1.c & 
-tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=PRELU -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vprelu-wasm-u2.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=PRELU -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vprelu-wasm-u4.c & -tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=PRELU -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vprelu-wasm-u8.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=1 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vadd-wasm-u1.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=2 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vadd-wasm-u2.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=4 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vadd-wasm-u4.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=8 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vadd-wasm-u8.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=1 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vdiv-wasm-u1.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=2 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vdiv-wasm-u2.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=4 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vdiv-wasm-u4.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=8 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vdiv-wasm-u8.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX -D BATCH_TILE=1 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vmax-wasm-u1.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX -D BATCH_TILE=2 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vmax-wasm-u2.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX -D BATCH_TILE=4 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vmax-wasm-u4.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX -D BATCH_TILE=8 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vmax-wasm-u8.c & +tools/xngen 
src/f32-vbinary/vop-scalar.c.in -D OP=MIN -D BATCH_TILE=1 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vmin-wasm-u1.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN -D BATCH_TILE=2 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vmin-wasm-u2.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN -D BATCH_TILE=4 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vmin-wasm-u4.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN -D BATCH_TILE=8 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vmin-wasm-u8.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=1 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vmul-wasm-u1.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=2 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vmul-wasm-u2.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=4 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vmul-wasm-u4.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=8 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vmul-wasm-u8.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=PRELU -D BATCH_TILE=1 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vprelu-wasm-u1.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=PRELU -D BATCH_TILE=2 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vprelu-wasm-u2.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=PRELU -D BATCH_TILE=4 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vprelu-wasm-u4.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=PRELU -D BATCH_TILE=8 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vprelu-wasm-u8.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=1 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vsub-wasm-u1.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=2 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vsub-wasm-u2.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=4 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vsub-wasm-u4.c & +tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=8 -D 
WASM=1 -D -o src/f32-vbinary/gen/f32-vsub-wasm-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmaxc-wasm-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmaxc-wasm-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmaxc-wasm-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmaxc-wasm-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=LINEAR -o 
src/f32-vbinary/gen/f32-vminc-wasm-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vminc-wasm-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vminc-wasm-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vminc-wasm-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=MINMAX -o 
src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=PRELU -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vpreluc-wasm-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=PRELU -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vpreluc-wasm-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=PRELU -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vpreluc-wasm-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=PRELU -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vpreluc-wasm-u8.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RPRELU -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrpreluc-wasm-u1.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RPRELU -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrpreluc-wasm-u2.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RPRELU -D BATCH_TILE=4 -D WASM=1 -D 
ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrpreluc-wasm-u4.c & -tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RPRELU -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrpreluc-wasm-u8.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=1 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vaddc-wasm-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=2 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vaddc-wasm-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=4 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vaddc-wasm-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=8 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vaddc-wasm-u8.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV -D BATCH_TILE=1 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vdivc-wasm-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV -D BATCH_TILE=2 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vdivc-wasm-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV -D BATCH_TILE=4 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vdivc-wasm-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV -D BATCH_TILE=8 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vdivc-wasm-u8.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX -D BATCH_TILE=1 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vmaxc-wasm-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX -D BATCH_TILE=2 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vmaxc-wasm-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX -D BATCH_TILE=4 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vmaxc-wasm-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX -D BATCH_TILE=8 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vmaxc-wasm-u8.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN -D BATCH_TILE=1 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vminc-wasm-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN -D BATCH_TILE=2 -D WASM=1 -D -o 
src/f32-vbinary/gen/f32-vminc-wasm-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN -D BATCH_TILE=4 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vminc-wasm-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN -D BATCH_TILE=8 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vminc-wasm-u8.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=1 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vmulc-wasm-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=2 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vmulc-wasm-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=4 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vmulc-wasm-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=8 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vmulc-wasm-u8.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=PRELU -D BATCH_TILE=1 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vpreluc-wasm-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=PRELU -D BATCH_TILE=2 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vpreluc-wasm-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=PRELU -D BATCH_TILE=4 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vpreluc-wasm-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=PRELU -D BATCH_TILE=8 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vpreluc-wasm-u8.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=1 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vrdivc-wasm-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=2 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vrdivc-wasm-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=4 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vrdivc-wasm-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=8 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vrdivc-wasm-u8.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RPRELU -D BATCH_TILE=1 -D WASM=1 -D -o 
src/f32-vbinary/gen/f32-vrpreluc-wasm-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RPRELU -D BATCH_TILE=2 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vrpreluc-wasm-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RPRELU -D BATCH_TILE=4 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vrpreluc-wasm-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RPRELU -D BATCH_TILE=8 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vrpreluc-wasm-u8.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=1 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vrsubc-wasm-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=2 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vrsubc-wasm-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=4 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vrsubc-wasm-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=8 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vrsubc-wasm-u8.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=1 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vsubc-wasm-u1.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=2 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vsubc-wasm-u2.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=4 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vsubc-wasm-u4.c & +tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=8 -D WASM=1 -D -o src/f32-vbinary/gen/f32-vsubc-wasm-u8.c & ################################## WAsm SIMD ################################## -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u4.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u4.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D 
BATCH_TILE=8 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u8.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u8.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u16.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u16.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u4.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u4.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u8.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u8.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u16.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u16.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u4.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o 
src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u4.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u8.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u8.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u16.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u16.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u4.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u4.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u8.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u8.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u16.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u16.c & - -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vadd-wasmsimd-u4.c & -tools/xngen 
src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vadd-wasmsimd-u8.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vadd-wasmsimd-u16.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vdiv-wasmsimd-u4.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vdiv-wasmsimd-u8.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vdiv-wasmsimd-u16.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u4.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-u4.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u8.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-u8.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u16.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-u16.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-u4.c & 
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-u4.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-u8.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-u8.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-u16.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-u16.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmul-wasmsimd-u4.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmul-wasmsimd-u8.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmul-wasmsimd-u16.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u4.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u8.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u16.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o 
src/f32-vbinary/gen/f32-vsub-wasmsimd-u4.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsub-wasmsimd-u8.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsub-wasmsimd-u16.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vprelu-wasmsimd-u4.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vprelu-wasmsimd-u8.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vprelu-wasmsimd-u16.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=1 -D X86=0 -o src/f32-vbinary/gen/f32-vprelu-wasmrelaxedsimd-u4.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=1 -D X86=0 -o src/f32-vbinary/gen/f32-vprelu-wasmrelaxedsimd-u8.c & -tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=1 -D X86=0 -o src/f32-vbinary/gen/f32-vprelu-wasmrelaxedsimd-u16.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=16 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vadd-wasmsimd-u16.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=4 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vadd-wasmsimd-u4.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=8 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vadd-wasmsimd-u8.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=16 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vdiv-wasmsimd-u16.c & +tools/xngen 
src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=4 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vdiv-wasmsimd-u4.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=8 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vdiv-wasmsimd-u8.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=16 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u16.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=16 -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-u16.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=4 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u4.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=4 -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-u4.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=8 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u8.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=8 -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-u8.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=16 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-u16.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=16 -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-u16.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=4 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-u4.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=4 -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-u4.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=8 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-u8.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=8 -D RELAXED=0 -D X86=1 -o 
src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-u8.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=16 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmul-wasmsimd-u16.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=4 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmul-wasmsimd-u4.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=8 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmul-wasmsimd-u8.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=16 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vprelu-wasmsimd-u16.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=16 -D RELAXED=1 -D X86=0 -o src/f32-vbinary/gen/f32-vprelu-wasmrelaxedsimd-u16.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=4 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vprelu-wasmsimd-u4.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=4 -D RELAXED=1 -D X86=0 -o src/f32-vbinary/gen/f32-vprelu-wasmrelaxedsimd-u4.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=8 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vprelu-wasmsimd-u8.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=8 -D RELAXED=1 -D X86=0 -o src/f32-vbinary/gen/f32-vprelu-wasmrelaxedsimd-u8.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u16.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u4.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u8.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=16 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsub-wasmsimd-u16.c & +tools/xngen 
src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=4 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsub-wasmsimd-u4.c & +tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=8 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsub-wasmsimd-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o 
src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o 
src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o 
src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u16.c & - -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vaddc-wasmsimd-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vaddc-wasmsimd-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vaddc-wasmsimd-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vdivc-wasmsimd-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vdivc-wasmsimd-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vdivc-wasmsimd-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX 
-D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmulc-wasmsimd-u4.c & -tools/xngen 
src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmulc-wasmsimd-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmulc-wasmsimd-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o 
src/f32-vbinary/gen/f32-vsubc-wasmsimd-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsubc-wasmsimd-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsubc-wasmsimd-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=1 -D X86=0 -o src/f32-vbinary/gen/f32-vpreluc-wasmrelaxedsimd-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=1 -D X86=0 -o src/f32-vbinary/gen/f32-vpreluc-wasmrelaxedsimd-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=1 -D X86=0 -o src/f32-vbinary/gen/f32-vpreluc-wasmrelaxedsimd-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RPRELU -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RPRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RPRELU -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u16.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D 
OP=RPRELU -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D RELAXED=1 -D X86=0 -o src/f32-vbinary/gen/f32-vrpreluc-wasmrelaxedsimd-u4.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RPRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D RELAXED=1 -D X86=0 -o src/f32-vbinary/gen/f32-vrpreluc-wasmrelaxedsimd-u8.c & -tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RPRELU -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D RELAXED=1 -D X86=0 -o src/f32-vbinary/gen/f32-vrpreluc-wasmrelaxedsimd-u16.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=16 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vaddc-wasmsimd-u16.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=4 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vaddc-wasmsimd-u4.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=8 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vaddc-wasmsimd-u8.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=16 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vdivc-wasmsimd-u16.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=4 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vdivc-wasmsimd-u4.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=8 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vdivc-wasmsimd-u8.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=16 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-u16.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=16 -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-u16.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=4 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-u4.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=4 -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-u4.c & +tools/xngen 
src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=8 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-u8.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX -D BATCH_TILE=8 -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-u8.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=16 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u16.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=16 -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u16.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=4 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u4.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=4 -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u4.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=8 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u8.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN -D BATCH_TILE=8 -D RELAXED=0 -D X86=1 -o src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u8.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=16 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmulc-wasmsimd-u16.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=4 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmulc-wasmsimd-u4.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=8 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vmulc-wasmsimd-u8.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=16 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u16.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=16 -D RELAXED=1 -D X86=0 -o src/f32-vbinary/gen/f32-vpreluc-wasmrelaxedsimd-u16.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=PRELU -D 
BATCH_TILE=4 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u4.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=4 -D RELAXED=1 -D X86=0 -o src/f32-vbinary/gen/f32-vpreluc-wasmrelaxedsimd-u4.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=8 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u8.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=PRELU -D BATCH_TILE=8 -D RELAXED=1 -D X86=0 -o src/f32-vbinary/gen/f32-vpreluc-wasmrelaxedsimd-u8.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=16 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u16.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=4 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u4.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=8 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u8.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RPRELU -D BATCH_TILE=16 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u16.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RPRELU -D BATCH_TILE=16 -D RELAXED=1 -D X86=0 -o src/f32-vbinary/gen/f32-vrpreluc-wasmrelaxedsimd-u16.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RPRELU -D BATCH_TILE=4 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u4.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RPRELU -D BATCH_TILE=4 -D RELAXED=1 -D X86=0 -o src/f32-vbinary/gen/f32-vrpreluc-wasmrelaxedsimd-u4.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RPRELU -D BATCH_TILE=8 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u8.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RPRELU -D BATCH_TILE=8 -D RELAXED=1 -D X86=0 -o src/f32-vbinary/gen/f32-vrpreluc-wasmrelaxedsimd-u8.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=16 -D 
RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u16.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=4 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u4.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=8 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u8.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u16.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u4.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u8.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=16 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsubc-wasmsimd-u16.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=4 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsubc-wasmsimd-u4.c & +tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=8 -D RELAXED=0 -D X86=0 -o src/f32-vbinary/gen/f32-vsubc-wasmsimd-u8.c & ################################### ARM NEON ################################## -tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=ADD -D BATCH_TILE=4 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vadd-minmax-neon-u4.c & -tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=ADD -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vadd-minmax-neon-u8.c & -tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=DIV -D BATCH_TILE=4 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdiv-minmax-aarch64-neon-u4.c & -tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=DIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdiv-minmax-aarch64-neon-u8.c & -tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=MAX -D BATCH_TILE=4 -D 
ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmax-neon-u4.c & -tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=MAX -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmax-neon-u8.c & -tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=MIN -D BATCH_TILE=4 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmin-neon-u4.c & -tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=MIN -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmin-neon-u8.c & -tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=MUL -D BATCH_TILE=4 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmul-minmax-neon-u4.c & -tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=MUL -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmul-minmax-neon-u8.c & -tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiff-neon-u4.c & -tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiff-neon-u8.c & -tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=SUB -D BATCH_TILE=4 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsub-minmax-neon-u4.c & -tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=SUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsub-minmax-neon-u8.c & -tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=PRELU -D BATCH_TILE=4 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vprelu-neon-u4.c & -tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=PRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vprelu-neon-u8.c & - -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=ADD -D BATCH_TILE=4 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vaddc-minmax-neon-u4.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=ADD -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vaddc-minmax-neon-u8.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=DIV -D BATCH_TILE=4 -D ACTIVATION=MINMAX -o 
src/f32-vbinary/gen/f32-vdivc-minmax-aarch64-neon-u4.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=DIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdivc-minmax-aarch64-neon-u8.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=MAX -D BATCH_TILE=4 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmaxc-neon-u4.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=MAX -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmaxc-neon-u8.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=MIN -D BATCH_TILE=4 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vminc-neon-u4.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=MIN -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vminc-neon-u8.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=MUL -D BATCH_TILE=4 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmulc-minmax-neon-u4.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=MUL -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmulc-minmax-neon-u8.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=RDIV -D BATCH_TILE=4 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrdivc-minmax-aarch64-neon-u4.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=RDIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrdivc-minmax-aarch64-neon-u8.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=RSUB -D BATCH_TILE=4 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrsubc-minmax-neon-u4.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=RSUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrsubc-minmax-neon-u8.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiffc-neon-u4.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiffc-neon-u8.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=SUB -D BATCH_TILE=4 -D 
ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsubc-minmax-neon-u4.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=SUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsubc-minmax-neon-u8.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=PRELU -D BATCH_TILE=4 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vpreluc-neon-u4.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=PRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vpreluc-neon-u8.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=RPRELU -D BATCH_TILE=4 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrpreluc-neon-u4.c & -tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=RPRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrpreluc-neon-u8.c & +tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=ADD -D BATCH_TILE=4 -D -o src/f32-vbinary/gen/f32-vadd-neon-u4.c & +tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=ADD -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vadd-neon-u8.c & +tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=DIV -D BATCH_TILE=4 -D -o src/f32-vbinary/gen/f32-vdiv-aarch64-neon-u4.c & +tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=DIV -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vdiv-aarch64-neon-u8.c & +tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=MAX -D BATCH_TILE=4 -D -o src/f32-vbinary/gen/f32-vmax-neon-u4.c & +tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=MAX -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vmax-neon-u8.c & +tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=MIN -D BATCH_TILE=4 -D -o src/f32-vbinary/gen/f32-vmin-neon-u4.c & +tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=MIN -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vmin-neon-u8.c & +tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=MUL -D BATCH_TILE=4 -D -o src/f32-vbinary/gen/f32-vmul-neon-u4.c & +tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=MUL -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vmul-neon-u8.c & +tools/xngen src/f32-vbinary/vop-neon.c.in -D 
OP=PRELU -D BATCH_TILE=4 -D -o src/f32-vbinary/gen/f32-vprelu-neon-u4.c & +tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=PRELU -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vprelu-neon-u8.c & +tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D -o src/f32-vbinary/gen/f32-vsqrdiff-neon-u4.c & +tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vsqrdiff-neon-u8.c & +tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=SUB -D BATCH_TILE=4 -D -o src/f32-vbinary/gen/f32-vsub-neon-u4.c & +tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=SUB -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vsub-neon-u8.c & -################################ RISC-V Vector ################################ -tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=ADD -D LMUL=4 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vadd-minmax-rvv-u4v.c & -tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=ADD -D LMUL=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vadd-minmax-rvv-u8v.c & -tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=DIV -D LMUL=4 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdiv-minmax-rvv-u4v.c & -tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=DIV -D LMUL=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdiv-minmax-rvv-u8v.c & -tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=MAX -D LMUL=4 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmax-rvv-u4v.c & -tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=MAX -D LMUL=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmax-rvv-u8v.c & -tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=MIN -D LMUL=4 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmin-rvv-u4v.c & -tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=MIN -D LMUL=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmin-rvv-u8v.c & -tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=MUL -D LMUL=4 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmul-minmax-rvv-u4v.c & -tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=MUL -D LMUL=8 -D 
ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmul-minmax-rvv-u8v.c & -tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=SQRDIFF -D LMUL=4 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiff-rvv-u4v.c & -tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=SQRDIFF -D LMUL=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiff-rvv-u8v.c & -tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=SUB -D LMUL=4 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsub-minmax-rvv-u4v.c & -tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=SUB -D LMUL=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsub-minmax-rvv-u8v.c & - -tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=ADD -D LMUL=4 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vaddc-minmax-rvv-u4v.c & -tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=ADD -D LMUL=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vaddc-minmax-rvv-u8v.c & -tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=DIV -D LMUL=4 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdivc-minmax-rvv-u4v.c & -tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=DIV -D LMUL=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdivc-minmax-rvv-u8v.c & -tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=MAX -D LMUL=4 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmaxc-rvv-u4v.c & -tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=MAX -D LMUL=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmaxc-rvv-u8v.c & -tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=MIN -D LMUL=4 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vminc-rvv-u4v.c & -tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=MIN -D LMUL=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vminc-rvv-u8v.c & -tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=MUL -D LMUL=4 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmulc-minmax-rvv-u4v.c & -tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=MUL -D LMUL=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmulc-minmax-rvv-u8v.c & -tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=MUL -D 
LMUL=4 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmulc-rvv-u4v.c & -tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=MUL -D LMUL=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmulc-rvv-u8v.c & -tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=RDIV -D LMUL=4 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrdivc-minmax-rvv-u4v.c & -tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=RDIV -D LMUL=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrdivc-minmax-rvv-u8v.c & -tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=RSUB -D LMUL=4 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrsubc-minmax-rvv-u4v.c & -tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=RSUB -D LMUL=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrsubc-minmax-rvv-u8v.c & -tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=SQRDIFF -D LMUL=4 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiffc-rvv-u4v.c & -tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=SQRDIFF -D LMUL=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiffc-rvv-u8v.c & -tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=SUB -D LMUL=4 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsubc-minmax-rvv-u4v.c & -tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=SUB -D LMUL=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsubc-minmax-rvv-u8v.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=ADD -D BATCH_TILE=4 -D -o src/f32-vbinary/gen/f32-vaddc-neon-u4.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=ADD -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vaddc-neon-u8.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=DIV -D BATCH_TILE=4 -D -o src/f32-vbinary/gen/f32-vdivc-aarch64-neon-u4.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=DIV -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vdivc-aarch64-neon-u8.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=MAX -D BATCH_TILE=4 -D -o src/f32-vbinary/gen/f32-vmaxc-neon-u4.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=MAX -D BATCH_TILE=8 -D -o 
src/f32-vbinary/gen/f32-vmaxc-neon-u8.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=MIN -D BATCH_TILE=4 -D -o src/f32-vbinary/gen/f32-vminc-neon-u4.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=MIN -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vminc-neon-u8.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=MUL -D BATCH_TILE=4 -D -o src/f32-vbinary/gen/f32-vmulc-neon-u4.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=MUL -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vmulc-neon-u8.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=PRELU -D BATCH_TILE=4 -D -o src/f32-vbinary/gen/f32-vpreluc-neon-u4.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=PRELU -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vpreluc-neon-u8.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=RDIV -D BATCH_TILE=4 -D -o src/f32-vbinary/gen/f32-vrdivc-aarch64-neon-u4.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=RDIV -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vrdivc-aarch64-neon-u8.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=RPRELU -D BATCH_TILE=4 -D -o src/f32-vbinary/gen/f32-vrpreluc-neon-u4.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=RPRELU -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vrpreluc-neon-u8.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=RSUB -D BATCH_TILE=4 -D -o src/f32-vbinary/gen/f32-vrsubc-neon-u4.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=RSUB -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vrsubc-neon-u8.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D -o src/f32-vbinary/gen/f32-vsqrdiffc-neon-u4.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vsqrdiffc-neon-u8.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=SUB -D BATCH_TILE=4 -D -o src/f32-vbinary/gen/f32-vsubc-neon-u4.c & +tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=SUB -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vsubc-neon-u8.c & 
################################# x86 128-bit ################################# -tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=ADD -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D SSE=1 -o src/f32-vbinary/gen/f32-vadd-minmax-sse-u4.c & -tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=ADD -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D SSE=1 -o src/f32-vbinary/gen/f32-vadd-minmax-sse-u8.c & -tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=DIV -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D SSE=1 -o src/f32-vbinary/gen/f32-vdiv-minmax-sse-u4.c & -tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=DIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D SSE=1 -o src/f32-vbinary/gen/f32-vdiv-minmax-sse-u8.c & -tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=MAX -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D SSE=1 -o src/f32-vbinary/gen/f32-vmax-sse-u4.c & -tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=MAX -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D SSE=1 -o src/f32-vbinary/gen/f32-vmax-sse-u8.c & -tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=MIN -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D SSE=1 -o src/f32-vbinary/gen/f32-vmin-sse-u4.c & -tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=MIN -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D SSE=1 -o src/f32-vbinary/gen/f32-vmin-sse-u8.c & -tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=MUL -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D SSE=1 -o src/f32-vbinary/gen/f32-vmul-minmax-sse-u4.c & -tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=MUL -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D SSE=1 -o src/f32-vbinary/gen/f32-vmul-minmax-sse-u8.c & -tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D SSE=1 -o src/f32-vbinary/gen/f32-vsqrdiff-sse-u4.c & -tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D SSE=1 -o src/f32-vbinary/gen/f32-vsqrdiff-sse-u8.c & -tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=SUB -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D SSE=1 -o src/f32-vbinary/gen/f32-vsub-minmax-sse-u4.c & -tools/xngen 
src/f32-vbinary/vop-sse.c.in -D OP=SUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D SSE=1 -o src/f32-vbinary/gen/f32-vsub-minmax-sse-u8.c & -tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=PRELU -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D SSE=2 -o src/f32-vbinary/gen/f32-vprelu-sse2-u4.c & -tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=PRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D SSE=2 -o src/f32-vbinary/gen/f32-vprelu-sse2-u8.c & -tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=PRELU -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D SSE=4 -o src/f32-vbinary/gen/f32-vprelu-sse41-u4.c & -tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=PRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D SSE=4 -o src/f32-vbinary/gen/f32-vprelu-sse41-u8.c & +tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=ADD -D BATCH_TILE=4 -D SSE=1 -o src/f32-vbinary/gen/f32-vadd-sse-u4.c & +tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=ADD -D BATCH_TILE=8 -D SSE=1 -o src/f32-vbinary/gen/f32-vadd-sse-u8.c & +tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=DIV -D BATCH_TILE=4 -D SSE=1 -o src/f32-vbinary/gen/f32-vdiv-sse-u4.c & +tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=DIV -D BATCH_TILE=8 -D SSE=1 -o src/f32-vbinary/gen/f32-vdiv-sse-u8.c & +tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=MAX -D BATCH_TILE=4 -D SSE=1 -o src/f32-vbinary/gen/f32-vmax-sse-u4.c & +tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=MAX -D BATCH_TILE=8 -D SSE=1 -o src/f32-vbinary/gen/f32-vmax-sse-u8.c & +tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=MIN -D BATCH_TILE=4 -D SSE=1 -o src/f32-vbinary/gen/f32-vmin-sse-u4.c & +tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=MIN -D BATCH_TILE=8 -D SSE=1 -o src/f32-vbinary/gen/f32-vmin-sse-u8.c & +tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=MUL -D BATCH_TILE=4 -D SSE=1 -o src/f32-vbinary/gen/f32-vmul-sse-u4.c & +tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=MUL -D BATCH_TILE=8 -D SSE=1 -o src/f32-vbinary/gen/f32-vmul-sse-u8.c & +tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=PRELU -D BATCH_TILE=4 -D SSE=2 
-o src/f32-vbinary/gen/f32-vprelu-sse2-u4.c & +tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=PRELU -D BATCH_TILE=4 -D SSE=4 -o src/f32-vbinary/gen/f32-vprelu-sse41-u4.c & +tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=PRELU -D BATCH_TILE=8 -D SSE=2 -o src/f32-vbinary/gen/f32-vprelu-sse2-u8.c & +tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=PRELU -D BATCH_TILE=8 -D SSE=4 -o src/f32-vbinary/gen/f32-vprelu-sse41-u8.c & +tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D SSE=1 -o src/f32-vbinary/gen/f32-vsqrdiff-sse-u4.c & +tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D SSE=1 -o src/f32-vbinary/gen/f32-vsqrdiff-sse-u8.c & +tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=SUB -D BATCH_TILE=4 -D SSE=1 -o src/f32-vbinary/gen/f32-vsub-sse-u4.c & +tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=SUB -D BATCH_TILE=8 -D SSE=1 -o src/f32-vbinary/gen/f32-vsub-sse-u8.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=ADD -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D SSE=1 -o src/f32-vbinary/gen/f32-vaddc-minmax-sse-u4.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=ADD -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D SSE=1 -o src/f32-vbinary/gen/f32-vaddc-minmax-sse-u8.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=DIV -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D SSE=1 -o src/f32-vbinary/gen/f32-vdivc-minmax-sse-u4.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=DIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D SSE=1 -o src/f32-vbinary/gen/f32-vdivc-minmax-sse-u8.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=MAX -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D SSE=1 -o src/f32-vbinary/gen/f32-vmaxc-sse-u4.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=MAX -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D SSE=1 -o src/f32-vbinary/gen/f32-vmaxc-sse-u8.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=MIN -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D SSE=1 -o src/f32-vbinary/gen/f32-vminc-sse-u4.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D 
OP=MIN -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D SSE=1 -o src/f32-vbinary/gen/f32-vminc-sse-u8.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=MUL -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D SSE=1 -o src/f32-vbinary/gen/f32-vmulc-minmax-sse-u4.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=MUL -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D SSE=1 -o src/f32-vbinary/gen/f32-vmulc-minmax-sse-u8.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=RDIV -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D SSE=1 -o src/f32-vbinary/gen/f32-vrdivc-minmax-sse-u4.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=RDIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D SSE=1 -o src/f32-vbinary/gen/f32-vrdivc-minmax-sse-u8.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=RSUB -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D SSE=1 -o src/f32-vbinary/gen/f32-vrsubc-minmax-sse-u4.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=RSUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D SSE=1 -o src/f32-vbinary/gen/f32-vrsubc-minmax-sse-u8.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D SSE=1 -o src/f32-vbinary/gen/f32-vsqrdiffc-sse-u4.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D SSE=1 -o src/f32-vbinary/gen/f32-vsqrdiffc-sse-u8.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=SUB -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D SSE=1 -o src/f32-vbinary/gen/f32-vsubc-minmax-sse-u4.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=SUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D SSE=1 -o src/f32-vbinary/gen/f32-vsubc-minmax-sse-u8.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=PRELU -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D SSE=2 -o src/f32-vbinary/gen/f32-vpreluc-sse2-u4.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=PRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D SSE=2 -o src/f32-vbinary/gen/f32-vpreluc-sse2-u8.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=PRELU -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D 
SSE=4 -o src/f32-vbinary/gen/f32-vpreluc-sse41-u4.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=PRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D SSE=4 -o src/f32-vbinary/gen/f32-vpreluc-sse41-u8.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=RPRELU -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D SSE=2 -o src/f32-vbinary/gen/f32-vrpreluc-sse2-u4.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=RPRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D SSE=2 -o src/f32-vbinary/gen/f32-vrpreluc-sse2-u8.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=RPRELU -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D SSE=4 -o src/f32-vbinary/gen/f32-vrpreluc-sse41-u4.c & -tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=RPRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D SSE=4 -o src/f32-vbinary/gen/f32-vrpreluc-sse41-u8.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=ADD -D BATCH_TILE=4 -D SSE=1 -o src/f32-vbinary/gen/f32-vaddc-sse-u4.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=ADD -D BATCH_TILE=8 -D SSE=1 -o src/f32-vbinary/gen/f32-vaddc-sse-u8.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=DIV -D BATCH_TILE=4 -D SSE=1 -o src/f32-vbinary/gen/f32-vdivc-sse-u4.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=DIV -D BATCH_TILE=8 -D SSE=1 -o src/f32-vbinary/gen/f32-vdivc-sse-u8.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=MAX -D BATCH_TILE=4 -D SSE=1 -o src/f32-vbinary/gen/f32-vmaxc-sse-u4.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=MAX -D BATCH_TILE=8 -D SSE=1 -o src/f32-vbinary/gen/f32-vmaxc-sse-u8.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=MIN -D BATCH_TILE=4 -D SSE=1 -o src/f32-vbinary/gen/f32-vminc-sse-u4.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=MIN -D BATCH_TILE=8 -D SSE=1 -o src/f32-vbinary/gen/f32-vminc-sse-u8.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=MUL -D BATCH_TILE=4 -D SSE=1 -o src/f32-vbinary/gen/f32-vmulc-sse-u4.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=MUL -D BATCH_TILE=8 -D SSE=1 -o 
src/f32-vbinary/gen/f32-vmulc-sse-u8.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=RDIV -D BATCH_TILE=4 -D SSE=1 -o src/f32-vbinary/gen/f32-vrdivc-sse-u4.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=RDIV -D BATCH_TILE=8 -D SSE=1 -o src/f32-vbinary/gen/f32-vrdivc-sse-u8.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=RSUB -D BATCH_TILE=4 -D SSE=1 -o src/f32-vbinary/gen/f32-vrsubc-sse-u4.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=RSUB -D BATCH_TILE=8 -D SSE=1 -o src/f32-vbinary/gen/f32-vrsubc-sse-u8.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D SSE=1 -o src/f32-vbinary/gen/f32-vsqrdiffc-sse-u4.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D SSE=1 -o src/f32-vbinary/gen/f32-vsqrdiffc-sse-u8.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=SUB -D BATCH_TILE=4 -D SSE=1 -o src/f32-vbinary/gen/f32-vsubc-sse-u4.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=SUB -D BATCH_TILE=8 -D SSE=1 -o src/f32-vbinary/gen/f32-vsubc-sse-u8.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=PRELU -D BATCH_TILE=4 -D SSE=2 -o src/f32-vbinary/gen/f32-vpreluc-sse2-u4.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=PRELU -D BATCH_TILE=8 -D SSE=2 -o src/f32-vbinary/gen/f32-vpreluc-sse2-u8.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=PRELU -D BATCH_TILE=4 -D SSE=4 -o src/f32-vbinary/gen/f32-vpreluc-sse41-u4.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=PRELU -D BATCH_TILE=8 -D SSE=4 -o src/f32-vbinary/gen/f32-vpreluc-sse41-u8.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=RPRELU -D BATCH_TILE=4 -D SSE=2 -o src/f32-vbinary/gen/f32-vrpreluc-sse2-u4.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=RPRELU -D BATCH_TILE=8 -D SSE=2 -o src/f32-vbinary/gen/f32-vrpreluc-sse2-u8.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=RPRELU -D BATCH_TILE=4 -D SSE=4 -o src/f32-vbinary/gen/f32-vrpreluc-sse41-u4.c & +tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=RPRELU -D 
BATCH_TILE=8 -D SSE=4 -o src/f32-vbinary/gen/f32-vrpreluc-sse41-u8.c & ################################# x86 256-bit ################################# -tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=ADD -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vadd-minmax-avx-u16.c & -tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=ADD -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vadd-minmax-avx-u8.c & -tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=DIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdiv-minmax-avx-u16.c & -tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=DIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdiv-minmax-avx-u8.c & -tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=MAX -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmax-avx-u16.c & -tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=MAX -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmax-avx-u8.c & -tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=MIN -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmin-avx-u16.c & -tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=MIN -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmin-avx-u8.c & -tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=MUL -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmul-minmax-avx-u16.c & -tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=MUL -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmul-minmax-avx-u8.c & -tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiff-avx-u16.c & -tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiff-avx-u8.c & -tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=SUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsub-minmax-avx-u16.c & -tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=SUB 
-D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsub-minmax-avx-u8.c & -tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=PRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vprelu-avx-u8.c & -tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=PRELU -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vprelu-avx-u16.c & +tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=ADD -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vadd-avx-u16.c & +tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=ADD -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vadd-avx-u8.c & +tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=DIV -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vdiv-avx-u16.c & +tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=DIV -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vdiv-avx-u8.c & +tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=MAX -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vmax-avx-u16.c & +tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=MAX -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vmax-avx-u8.c & +tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=MIN -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vmin-avx-u16.c & +tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=MIN -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vmin-avx-u8.c & +tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=MUL -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vmul-avx-u16.c & +tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=MUL -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vmul-avx-u8.c & +tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vsqrdiff-avx-u16.c & +tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vsqrdiff-avx-u8.c & +tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=SUB -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vsub-avx-u16.c & +tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=SUB -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vsub-avx-u8.c & +tools/xngen 
src/f32-vbinary/vop-avx.c.in -D OP=PRELU -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vprelu-avx-u8.c & +tools/xngen src/f32-vbinary/vop-avx.c.in -D OP=PRELU -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vprelu-avx-u16.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=ADD -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vaddc-minmax-avx-u16.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=ADD -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vaddc-minmax-avx-u8.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=DIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdivc-minmax-avx-u16.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=DIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdivc-minmax-avx-u8.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=MAX -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmaxc-avx-u16.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=MAX -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmaxc-avx-u8.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=MIN -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vminc-avx-u16.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=MIN -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vminc-avx-u8.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=MUL -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmulc-minmax-avx-u16.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=MUL -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmulc-minmax-avx-u8.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=RDIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrdivc-minmax-avx-u16.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=RDIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrdivc-minmax-avx-u8.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=RSUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o 
src/f32-vbinary/gen/f32-vrsubc-minmax-avx-u16.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=RSUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrsubc-minmax-avx-u8.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiffc-avx-u16.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiffc-avx-u8.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=SUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsubc-minmax-avx-u16.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=SUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsubc-minmax-avx-u8.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=PRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vpreluc-avx-u8.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=PRELU -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vpreluc-avx-u16.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=RPRELU -D BATCH_TILE=8 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrpreluc-avx-u8.c & -tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=RPRELU -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrpreluc-avx-u16.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=ADD -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vaddc-avx-u16.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=ADD -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vaddc-avx-u8.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=DIV -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vdivc-avx-u16.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=DIV -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vdivc-avx-u8.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=MAX -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vmaxc-avx-u16.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=MAX -D BATCH_TILE=8 -D -o 
src/f32-vbinary/gen/f32-vmaxc-avx-u8.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=MIN -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vminc-avx-u16.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=MIN -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vminc-avx-u8.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=MUL -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vmulc-avx-u16.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=MUL -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vmulc-avx-u8.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=RDIV -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vrdivc-avx-u16.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=RDIV -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vrdivc-avx-u8.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=RSUB -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vrsubc-avx-u16.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=RSUB -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vrsubc-avx-u8.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vsqrdiffc-avx-u16.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vsqrdiffc-avx-u8.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=SUB -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vsubc-avx-u16.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=SUB -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vsubc-avx-u8.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=PRELU -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vpreluc-avx-u8.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=PRELU -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vpreluc-avx-u16.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=RPRELU -D BATCH_TILE=8 -D -o src/f32-vbinary/gen/f32-vrpreluc-avx-u8.c & +tools/xngen src/f32-vbinary/vopc-avx.c.in -D OP=RPRELU -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vrpreluc-avx-u16.c & ################################# x86 
512-bit ################################# -tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=ADD -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vadd-minmax-avx512f-u16.c & -tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=ADD -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vadd-minmax-avx512f-u32.c & -tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=DIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdiv-minmax-avx512f-u16.c & -tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=DIV -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdiv-minmax-avx512f-u32.c & -tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=MAX -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmax-avx512f-u16.c & -tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=MAX -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmax-avx512f-u32.c & -tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=MIN -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmin-avx512f-u16.c & -tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=MIN -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmin-avx512f-u32.c & -tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=MUL -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmul-minmax-avx512f-u16.c & -tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=MUL -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmul-minmax-avx512f-u32.c & -tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u16.c & -tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=SQRDIFF -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u32.c & -tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=SUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsub-minmax-avx512f-u16.c & -tools/xngen src/f32-vbinary/vop-avx512f.c.in 
-D OP=SUB -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsub-minmax-avx512f-u32.c & -tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=PRELU -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vprelu-avx512f-u16.c & -tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=PRELU -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vprelu-avx512f-u32.c & +tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=ADD -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vadd-avx512f-u16.c & +tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=ADD -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vadd-avx512f-u32.c & +tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=DIV -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vdiv-avx512f-u16.c & +tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=DIV -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vdiv-avx512f-u32.c & +tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=MAX -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vmax-avx512f-u16.c & +tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=MAX -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vmax-avx512f-u32.c & +tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=MIN -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vmin-avx512f-u16.c & +tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=MIN -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vmin-avx512f-u32.c & +tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=MUL -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vmul-avx512f-u16.c & +tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=MUL -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vmul-avx512f-u32.c & +tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u16.c & +tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=SQRDIFF -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u32.c & +tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=SUB -D BATCH_TILE=16 -D -o 
src/f32-vbinary/gen/f32-vsub-avx512f-u16.c & +tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=SUB -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vsub-avx512f-u32.c & +tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=PRELU -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vprelu-avx512f-u16.c & +tools/xngen src/f32-vbinary/vop-avx512f.c.in -D OP=PRELU -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vprelu-avx512f-u32.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=ADD -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vaddc-minmax-avx512f-u16.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=ADD -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vaddc-minmax-avx512f-u32.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=DIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdivc-minmax-avx512f-u16.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=DIV -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vdivc-minmax-avx512f-u32.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=MAX -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmaxc-avx512f-u16.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=MAX -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmaxc-avx512f-u32.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=MIN -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vminc-avx512f-u16.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=MIN -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vminc-avx512f-u32.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=MUL -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmulc-minmax-avx512f-u16.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=MUL -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmulc-minmax-avx512f-u32.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=RDIV -D BATCH_TILE=16 -D 
ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrdivc-minmax-avx512f-u16.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=RDIV -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrdivc-minmax-avx512f-u32.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=RSUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrsubc-minmax-avx512f-u16.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=RSUB -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrsubc-minmax-avx512f-u32.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u16.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=SQRDIFF -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u32.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=SUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsubc-minmax-avx512f-u16.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=SUB -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsubc-minmax-avx512f-u32.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=PRELU -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vpreluc-avx512f-u16.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=PRELU -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vpreluc-avx512f-u32.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=RPRELU -D BATCH_TILE=16 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrpreluc-avx512f-u16.c & -tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=RPRELU -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vrpreluc-avx512f-u32.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=ADD -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vaddc-avx512f-u16.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=ADD -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vaddc-avx512f-u32.c & 
+tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=DIV -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vdivc-avx512f-u16.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=DIV -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vdivc-avx512f-u32.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=MAX -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vmaxc-avx512f-u16.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=MAX -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vmaxc-avx512f-u32.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=MIN -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vminc-avx512f-u16.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=MIN -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vminc-avx512f-u32.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=MUL -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vmulc-avx512f-u16.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=MUL -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vmulc-avx512f-u32.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=RDIV -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vrdivc-avx512f-u16.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=RDIV -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vrdivc-avx512f-u32.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=RSUB -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vrsubc-avx512f-u16.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=RSUB -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vrsubc-avx512f-u32.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u16.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=SQRDIFF -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u32.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=SUB -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vsubc-avx512f-u16.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=SUB -D BATCH_TILE=32 -D -o 
src/f32-vbinary/gen/f32-vsubc-avx512f-u32.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=PRELU -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vpreluc-avx512f-u16.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=PRELU -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vpreluc-avx512f-u32.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=RPRELU -D BATCH_TILE=16 -D -o src/f32-vbinary/gen/f32-vrpreluc-avx512f-u16.c & +tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=RPRELU -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vrpreluc-avx512f-u32.c & ################################### HEXAGON HVX ################################## -tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=ADD -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vadd-minmax-hvx-u32.c & -tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=ADD -D BATCH_TILE=64 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vadd-minmax-hvx-u64.c & -tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=ADD -D BATCH_TILE=128 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vadd-minmax-hvx-u128.c & -tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=MAX -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmax-hvx-u32.c & -tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=MAX -D BATCH_TILE=64 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmax-hvx-u64.c & -tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=MAX -D BATCH_TILE=128 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmax-hvx-u128.c & -tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=MIN -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmin-hvx-u32.c & -tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=MIN -D BATCH_TILE=64 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmin-hvx-u64.c & -tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=MIN -D BATCH_TILE=128 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmin-hvx-u128.c & -tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=MUL -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o 
src/f32-vbinary/gen/f32-vmul-minmax-hvx-u32.c & -tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=MUL -D BATCH_TILE=64 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmul-minmax-hvx-u64.c & -tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=MUL -D BATCH_TILE=128 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmul-minmax-hvx-u128.c & -tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=SUB -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsub-minmax-hvx-u32.c & -tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=SUB -D BATCH_TILE=64 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsub-minmax-hvx-u64.c & -tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=SUB -D BATCH_TILE=128 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsub-minmax-hvx-u128.c & -tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=SQRDIFF -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiff-hvx-u32.c & -tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=SQRDIFF -D BATCH_TILE=64 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiff-hvx-u64.c & -tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=SQRDIFF -D BATCH_TILE=128 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiff-hvx-u128.c & +tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=ADD -D BATCH_TILE=128 -D -o src/f32-vbinary/gen/f32-vadd-hvx-u128.c & +tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=ADD -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vadd-hvx-u32.c & +tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=ADD -D BATCH_TILE=64 -D -o src/f32-vbinary/gen/f32-vadd-hvx-u64.c & +tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=MAX -D BATCH_TILE=128 -D -o src/f32-vbinary/gen/f32-vmax-hvx-u128.c & +tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=MAX -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vmax-hvx-u32.c & +tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=MAX -D BATCH_TILE=64 -D -o src/f32-vbinary/gen/f32-vmax-hvx-u64.c & +tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=MIN -D BATCH_TILE=128 -D -o 
src/f32-vbinary/gen/f32-vmin-hvx-u128.c & +tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=MIN -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vmin-hvx-u32.c & +tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=MIN -D BATCH_TILE=64 -D -o src/f32-vbinary/gen/f32-vmin-hvx-u64.c & +tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=MUL -D BATCH_TILE=128 -D -o src/f32-vbinary/gen/f32-vmul-hvx-u128.c & +tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=MUL -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vmul-hvx-u32.c & +tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=MUL -D BATCH_TILE=64 -D -o src/f32-vbinary/gen/f32-vmul-hvx-u64.c & +tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=SQRDIFF -D BATCH_TILE=128 -D -o src/f32-vbinary/gen/f32-vsqrdiff-hvx-u128.c & +tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=SQRDIFF -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vsqrdiff-hvx-u32.c & +tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=SQRDIFF -D BATCH_TILE=64 -D -o src/f32-vbinary/gen/f32-vsqrdiff-hvx-u64.c & +tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=SUB -D BATCH_TILE=128 -D -o src/f32-vbinary/gen/f32-vsub-hvx-u128.c & +tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=SUB -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vsub-hvx-u32.c & +tools/xngen src/f32-vbinary/vop-hvx.c.in -D OP=SUB -D BATCH_TILE=64 -D -o src/f32-vbinary/gen/f32-vsub-hvx-u64.c & + +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=ADD -D BATCH_TILE=128 -D -o src/f32-vbinary/gen/f32-vaddc-hvx-u128.c & +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=ADD -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vaddc-hvx-u32.c & +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=ADD -D BATCH_TILE=64 -D -o src/f32-vbinary/gen/f32-vaddc-hvx-u64.c & +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=MAX -D BATCH_TILE=128 -D -o src/f32-vbinary/gen/f32-vmaxc-hvx-u128.c & +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=MAX -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vmaxc-hvx-u32.c & +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=MAX -D 
BATCH_TILE=64 -D -o src/f32-vbinary/gen/f32-vmaxc-hvx-u64.c & +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=MIN -D BATCH_TILE=128 -D -o src/f32-vbinary/gen/f32-vminc-hvx-u128.c & +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=MIN -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vminc-hvx-u32.c & +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=MIN -D BATCH_TILE=64 -D -o src/f32-vbinary/gen/f32-vminc-hvx-u64.c & +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=MUL -D BATCH_TILE=128 -D -o src/f32-vbinary/gen/f32-vmulc-hvx-u128.c & +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=MUL -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vmulc-hvx-u32.c & +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=MUL -D BATCH_TILE=64 -D -o src/f32-vbinary/gen/f32-vmulc-hvx-u64.c & +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=RSUB -D BATCH_TILE=128 -D -o src/f32-vbinary/gen/f32-vrsubc-hvx-u128.c & +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=RSUB -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vrsubc-hvx-u32.c & +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=RSUB -D BATCH_TILE=64 -D -o src/f32-vbinary/gen/f32-vrsubc-hvx-u64.c & +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=SQRDIFF -D BATCH_TILE=128 -D -o src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u128.c & +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=SQRDIFF -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u32.c & +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=SQRDIFF -D BATCH_TILE=64 -D -o src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u64.c & +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=SUB -D BATCH_TILE=128 -D -o src/f32-vbinary/gen/f32-vsubc-hvx-u128.c & +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=SUB -D BATCH_TILE=32 -D -o src/f32-vbinary/gen/f32-vsubc-hvx-u32.c & +tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=SUB -D BATCH_TILE=64 -D -o src/f32-vbinary/gen/f32-vsubc-hvx-u64.c & + +################################ RISC-V Vector ################################ +tools/xngen src/f32-vbinary/vop-rvv.c.in -D 
OP=ADD -D LMUL=4 -D -o src/f32-vbinary/gen/f32-vadd-rvv-u4v.c & +tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=ADD -D LMUL=8 -D -o src/f32-vbinary/gen/f32-vadd-rvv-u8v.c & +tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=DIV -D LMUL=4 -D -o src/f32-vbinary/gen/f32-vdiv-rvv-u4v.c & +tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=DIV -D LMUL=8 -D -o src/f32-vbinary/gen/f32-vdiv-rvv-u8v.c & +tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=MAX -D LMUL=4 -D -o src/f32-vbinary/gen/f32-vmax-rvv-u4v.c & +tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=MAX -D LMUL=8 -D -o src/f32-vbinary/gen/f32-vmax-rvv-u8v.c & +tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=MIN -D LMUL=4 -D -o src/f32-vbinary/gen/f32-vmin-rvv-u4v.c & +tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=MIN -D LMUL=8 -D -o src/f32-vbinary/gen/f32-vmin-rvv-u8v.c & +tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=MUL -D LMUL=4 -D -o src/f32-vbinary/gen/f32-vmul-rvv-u4v.c & +tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=MUL -D LMUL=8 -D -o src/f32-vbinary/gen/f32-vmul-rvv-u8v.c & +tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=SQRDIFF -D LMUL=4 -D -o src/f32-vbinary/gen/f32-vsqrdiff-rvv-u4v.c & +tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=SQRDIFF -D LMUL=8 -D -o src/f32-vbinary/gen/f32-vsqrdiff-rvv-u8v.c & +tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=SUB -D LMUL=4 -D -o src/f32-vbinary/gen/f32-vsub-rvv-u4v.c & +tools/xngen src/f32-vbinary/vop-rvv.c.in -D OP=SUB -D LMUL=8 -D -o src/f32-vbinary/gen/f32-vsub-rvv-u8v.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=ADD -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vaddc-minmax-hvx-u32.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=ADD -D BATCH_TILE=64 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vaddc-minmax-hvx-u64.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=ADD -D BATCH_TILE=128 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vaddc-minmax-hvx-u128.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=MAX -D BATCH_TILE=32 
-D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmaxc-hvx-u32.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=MAX -D BATCH_TILE=64 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmaxc-hvx-u64.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=MAX -D BATCH_TILE=128 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vmaxc-hvx-u128.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=MIN -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vminc-hvx-u32.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=MIN -D BATCH_TILE=64 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vminc-hvx-u64.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=MIN -D BATCH_TILE=128 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vminc-hvx-u128.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=MUL -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmulc-minmax-hvx-u32.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=MUL -D BATCH_TILE=64 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmulc-minmax-hvx-u64.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=MUL -D BATCH_TILE=128 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vmulc-minmax-hvx-u128.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=SUB -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsubc-minmax-hvx-u32.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=SUB -D BATCH_TILE=64 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsubc-minmax-hvx-u64.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=SUB -D BATCH_TILE=128 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vsubc-minmax-hvx-u128.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=RSUB -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrsubc-minmax-hvx-u32.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=RSUB -D BATCH_TILE=64 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/f32-vrsubc-minmax-hvx-u64.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=RSUB -D BATCH_TILE=128 -D ACTIVATION=MINMAX -o 
src/f32-vbinary/gen/f32-vrsubc-minmax-hvx-u128.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=SQRDIFF -D BATCH_TILE=32 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u32.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=SQRDIFF -D BATCH_TILE=64 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u64.c & -tools/xngen src/f32-vbinary/vopc-hvx.c.in -D OP=SQRDIFF -D BATCH_TILE=128 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u128.c & +tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=ADD -D LMUL=4 -D -o src/f32-vbinary/gen/f32-vaddc-rvv-u4v.c & +tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=ADD -D LMUL=8 -D -o src/f32-vbinary/gen/f32-vaddc-rvv-u8v.c & +tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=DIV -D LMUL=4 -D -o src/f32-vbinary/gen/f32-vdivc-rvv-u4v.c & +tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=DIV -D LMUL=8 -D -o src/f32-vbinary/gen/f32-vdivc-rvv-u8v.c & +tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=MAX -D LMUL=4 -D -o src/f32-vbinary/gen/f32-vmaxc-rvv-u4v.c & +tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=MAX -D LMUL=8 -D -o src/f32-vbinary/gen/f32-vmaxc-rvv-u8v.c & +tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=MIN -D LMUL=4 -D -o src/f32-vbinary/gen/f32-vminc-rvv-u4v.c & +tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=MIN -D LMUL=8 -D -o src/f32-vbinary/gen/f32-vminc-rvv-u8v.c & +tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=MUL -D LMUL=4 -D -o src/f32-vbinary/gen/f32-vmulc-rvv-u4v.c & +tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=MUL -D LMUL=8 -D -o src/f32-vbinary/gen/f32-vmulc-rvv-u8v.c & +tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=RDIV -D LMUL=4 -D -o src/f32-vbinary/gen/f32-vrdivc-rvv-u4v.c & +tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=RDIV -D LMUL=8 -D -o src/f32-vbinary/gen/f32-vrdivc-rvv-u8v.c & +tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=RSUB -D LMUL=4 -D -o src/f32-vbinary/gen/f32-vrsubc-rvv-u4v.c & +tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=RSUB -D LMUL=8 
-D -o src/f32-vbinary/gen/f32-vrsubc-rvv-u8v.c & +tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=SQRDIFF -D LMUL=4 -D -o src/f32-vbinary/gen/f32-vsqrdiffc-rvv-u4v.c & +tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=SQRDIFF -D LMUL=8 -D -o src/f32-vbinary/gen/f32-vsqrdiffc-rvv-u8v.c & +tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=SUB -D LMUL=4 -D -o src/f32-vbinary/gen/f32-vsubc-rvv-u4v.c & +tools/xngen src/f32-vbinary/vopc-rvv.c.in -D OP=SUB -D LMUL=8 -D -o src/f32-vbinary/gen/f32-vsubc-rvv-u8v.c & wait diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh index 5d295936fc3..a29c9b9917b 100755 --- a/scripts/generate-tests.sh +++ b/scripts/generate-tests.sh @@ -75,59 +75,49 @@ tools/generate-spmm-test.py --spec test/f16-spmm-minmax.yaml --output-test test/ tools/generate-spmm-test.py --spec test/f32-spmm-minmax.yaml --output-test test/f32-spmm-minmax.cc --output-test test/f32-spmm-minmax-2.cc --output-test test/f32-spmm-minmax-3.cc --output-test test/f32-spmm-minmax-4.cc --output-bench bench/f32-spmm.cc & ### Tests for VBinary micro-kernels -tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f16-vadd-minmax --output test/f16-vadd-minmax.cc & -tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f16-vdiv-minmax --output test/f16-vdiv-minmax.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f16-vadd --output test/f16-vadd.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f16-vdiv --output test/f16-vdiv.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f16-vmax --output test/f16-vmax.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f16-vmin --output test/f16-vmin.cc & -tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f16-vmul-minmax --output test/f16-vmul-minmax.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f16-vmul --output 
test/f16-vmul.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f16-vprelu --output test/f16-vprelu.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f16-vsqrdiff --output test/f16-vsqrdiff.cc & -tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f16-vsub-minmax --output test/f16-vsub-minmax.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f16-vsub --output test/f16-vsub.cc & -tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f16-vaddc-minmax --output test/f16-vaddc-minmax.cc & -tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f16-vdivc-minmax --output test/f16-vdivc-minmax.cc & -tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f16-vrdivc-minmax --output test/f16-vrdivc-minmax.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f16-vaddc --output test/f16-vaddc.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f16-vdivc --output test/f16-vdivc.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f16-vrdivc --output test/f16-vrdivc.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f16-vmaxc --output test/f16-vmaxc.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f16-vminc --output test/f16-vminc.cc & -tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f16-vmulc-minmax --output test/f16-vmulc-minmax.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f16-vmulc --output test/f16-vmulc.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f16-vpreluc --output test/f16-vpreluc.cc & 
tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f16-vrpreluc --output test/f16-vrpreluc.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f16-vsqrdiffc --output test/f16-vsqrdiffc.cc & -tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f16-vsubc-minmax --output test/f16-vsubc-minmax.cc & -tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f16-vrsubc-minmax --output test/f16-vrsubc-minmax.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f16-vsubc --output test/f16-vsubc.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f16-vrsubc --output test/f16-vrsubc.cc & -tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f32-vadd-minmax --output test/f32-vadd-minmax.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f32-vadd --output test/f32-vadd.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f32-vcopysign --output test/f32-vcopysign.cc & -tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f32-vdiv-minmax --output test/f32-vdiv-minmax.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f32-vdiv --output test/f32-vdiv.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f32-vmax --output test/f32-vmax.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f32-vmin --output test/f32-vmin.cc & -tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f32-vmul-minmax --output test/f32-vmul-minmax.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f32-vmul --output test/f32-vmul.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f32-vprelu --output test/f32-vprelu.cc & 
tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f32-vsqrdiff --output test/f32-vsqrdiff.cc & -tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f32-vsub-minmax --output test/f32-vsub-minmax.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel f32-vsub --output test/f32-vsub.cc & -tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f32-vaddc-minmax --output test/f32-vaddc-minmax.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f32-vaddc --output test/f32-vaddc.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f32-vcopysignc --output test/f32-vcopysignc.cc & -tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f32-vdivc-minmax --output test/f32-vdivc-minmax.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f32-vdivc --output test/f32-vdivc.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f32-vmaxc --output test/f32-vmaxc.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f32-vminc --output test/f32-vminc.cc & -tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f32-vmulc-minmax --output test/f32-vmulc-minmax.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f32-vmulc --output test/f32-vmulc.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f32-vpreluc --output test/f32-vpreluc.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f32-vrpreluc --output test/f32-vrpreluc.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f32-vrcopysignc --output test/f32-vrcopysignc.cc & 
-tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f32-vrdivc-minmax --output test/f32-vrdivc-minmax.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f32-vrdivc --output test/f32-vrdivc.cc & -tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f32-vrsubc-minmax --output test/f32-vrsubc-minmax.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f32-vrsubc --output test/f32-vrsubc.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f32-vsqrdiffc --output test/f32-vsqrdiffc.cc & -tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f32-vsubc-minmax --output test/f32-vsubc-minmax.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel f32-vsubc --output test/f32-vsubc.cc & tools/generate-vbinary-test.py --tester VCMulMicrokernelTester --ukernel f16-vcmul --output test/f16-vcmul.cc & diff --git a/src/configs/binary-elementwise-config.c b/src/configs/binary-elementwise-config.c index 453db818652..87b5bf3efb8 100644 --- a/src/configs/binary-elementwise-config.c +++ b/src/configs/binary-elementwise-config.c @@ -66,40 +66,36 @@ static void init_f16_vadd_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vadd_minmax_ukernel__neonfp16arith_u16; - f16_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_u16; - f16_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_u16; - f16_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vadd_config.minmax.element_tile = 16; + 
f16_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vadd_ukernel__neonfp16arith_u16; + f16_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_ukernel__neonfp16arith_u16; + f16_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_ukernel__neonfp16arith_u16; + f16_vadd_config.element_tile = 16; } #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vadd_minmax_ukernel__neonfp16arith_u16; - f16_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_u16; - f16_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_u16; - f16_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vadd_config.minmax.element_tile = 16; + f16_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vadd_ukernel__neonfp16arith_u16; + f16_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_ukernel__neonfp16arith_u16; + f16_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_ukernel__neonfp16arith_u16; + f16_vadd_config.element_tile = 16; } #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); #if XNN_ENABLE_AVX512FP16 if (hardware_config->use_x86_avx512fp16) { - f16_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vadd_minmax_ukernel__avx512fp16_u64; - f16_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_minmax_ukernel__avx512fp16_u64; - f16_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_minmax_ukernel__avx512fp16_u64; - f16_vadd_config.init = (xnn_init_binary_params_fn) 
xnn_init_f16_minmax_binary_params; - f16_vadd_config.minmax.element_tile = 64; + f16_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vadd_ukernel__avx512fp16_u64; + f16_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_ukernel__avx512fp16_u64; + f16_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_ukernel__avx512fp16_u64; + f16_vadd_config.element_tile = 64; } else #endif if (hardware_config->use_x86_f16c) { - f16_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vadd_minmax_ukernel__f16c_u16; - f16_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_minmax_ukernel__f16c_u16; - f16_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_minmax_ukernel__f16c_u16; - f16_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vadd_config.minmax.element_tile = 16; + f16_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vadd_ukernel__f16c_u16; + f16_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_ukernel__f16c_u16; + f16_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_ukernel__f16c_u16; + f16_vadd_config.element_tile = 16; } #endif } @@ -109,40 +105,36 @@ static void init_f16_vdiv_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdiv_minmax_ukernel__fp16arith_u2; - f16_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdivc_minmax_ukernel__fp16arith_u2; - f16_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrdivc_minmax_ukernel__fp16arith_u2; - f16_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vdiv_config.minmax.element_tile = 2; + f16_vdiv_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdiv_ukernel__fp16arith_u2; + 
f16_vdiv_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdivc_ukernel__fp16arith_u2; + f16_vdiv_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrdivc_ukernel__fp16arith_u2; + f16_vdiv_config.element_tile = 2; } #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdiv_minmax_ukernel__aarch64_neonfp16arith_u8; - f16_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdivc_minmax_ukernel__aarch64_neonfp16arith_u8; - f16_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrdivc_minmax_ukernel__aarch64_neonfp16arith_u8; - f16_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vdiv_config.minmax.element_tile = 8; + f16_vdiv_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdiv_ukernel__aarch64_neonfp16arith_u8; + f16_vdiv_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdivc_ukernel__aarch64_neonfp16arith_u8; + f16_vdiv_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrdivc_ukernel__aarch64_neonfp16arith_u8; + f16_vdiv_config.element_tile = 8; } #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); #if XNN_ENABLE_AVX512FP16 if (hardware_config->use_x86_avx512fp16) { - f16_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdiv_minmax_ukernel__avx512fp16_u64; - f16_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdivc_minmax_ukernel__avx512fp16_u64; - f16_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrdivc_minmax_ukernel__avx512fp16_u64; - f16_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vdiv_config.minmax.element_tile = 64; + 
f16_vdiv_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdiv_ukernel__avx512fp16_u64; + f16_vdiv_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdivc_ukernel__avx512fp16_u64; + f16_vdiv_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrdivc_ukernel__avx512fp16_u64; + f16_vdiv_config.element_tile = 64; } else #endif if (hardware_config->use_x86_f16c) { - f16_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdiv_minmax_ukernel__f16c_u8; - f16_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdivc_minmax_ukernel__f16c_u8; - f16_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrdivc_minmax_ukernel__f16c_u8; - f16_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vdiv_config.minmax.element_tile = 8; + f16_vdiv_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdiv_ukernel__f16c_u8; + f16_vdiv_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdivc_ukernel__f16c_u8; + f16_vdiv_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrdivc_ukernel__f16c_u8; + f16_vdiv_config.element_tile = 8; } #endif } @@ -152,37 +144,36 @@ static void init_f16_vmax_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmax_ukernel__neonfp16arith_u16; - f16_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__neonfp16arith_u16; - f16_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__neonfp16arith_u16; - f16_vmax_config.linear.element_tile = 16; + f16_vmax_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmax_ukernel__neonfp16arith_u16; + f16_vmax_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__neonfp16arith_u16; + f16_vmax_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f16_vmaxc_ukernel__neonfp16arith_u16; + f16_vmax_config.element_tile = 16; } #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmax_ukernel__neonfp16arith_u16; - f16_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__neonfp16arith_u16; - f16_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__neonfp16arith_u16; - f16_vmax_config.linear.element_tile = 16; + f16_vmax_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmax_ukernel__neonfp16arith_u16; + f16_vmax_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__neonfp16arith_u16; + f16_vmax_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__neonfp16arith_u16; + f16_vmax_config.element_tile = 16; } #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); #if XNN_ENABLE_AVX512FP16 if (hardware_config->use_x86_avx512fp16) { - f16_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmax_ukernel__avx512fp16_u64; - f16_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__avx512fp16_u64; - f16_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__avx512fp16_u64; - f16_vmax_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vmax_config.linear.element_tile = 64; + f16_vmax_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmax_ukernel__avx512fp16_u64; + f16_vmax_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__avx512fp16_u64; + f16_vmax_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__avx512fp16_u64; + f16_vmax_config.element_tile = 64; 
} else #endif if (hardware_config->use_x86_f16c) { - f16_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmax_ukernel__f16c_u16; - f16_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__f16c_u16; - f16_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__f16c_u16; - f16_vmax_config.linear.element_tile = 16; + f16_vmax_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmax_ukernel__f16c_u16; + f16_vmax_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__f16c_u16; + f16_vmax_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__f16c_u16; + f16_vmax_config.element_tile = 16; } #endif } @@ -192,37 +183,36 @@ static void init_f16_vmin_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmin_ukernel__neonfp16arith_u16; - f16_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__neonfp16arith_u16; - f16_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__neonfp16arith_u16; - f16_vmin_config.linear.element_tile = 16; + f16_vmin_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmin_ukernel__neonfp16arith_u16; + f16_vmin_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__neonfp16arith_u16; + f16_vmin_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__neonfp16arith_u16; + f16_vmin_config.element_tile = 16; } #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmin_ukernel__neonfp16arith_u16; - f16_vmin_config.linear.opc_ukernel = 
(xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__neonfp16arith_u16; - f16_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__neonfp16arith_u16; - f16_vmin_config.linear.element_tile = 16; + f16_vmin_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmin_ukernel__neonfp16arith_u16; + f16_vmin_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__neonfp16arith_u16; + f16_vmin_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__neonfp16arith_u16; + f16_vmin_config.element_tile = 16; } #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); #if XNN_ENABLE_AVX512FP16 if (hardware_config->use_x86_avx512fp16) { - f16_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmin_ukernel__avx512fp16_u64; - f16_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__avx512fp16_u64; - f16_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__avx512fp16_u64; - f16_vmin_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vmin_config.linear.element_tile = 64; + f16_vmin_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmin_ukernel__avx512fp16_u64; + f16_vmin_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__avx512fp16_u64; + f16_vmin_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__avx512fp16_u64; + f16_vmin_config.element_tile = 64; } else #endif if (hardware_config->use_x86_f16c) { - f16_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmin_ukernel__f16c_u16; - f16_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__f16c_u16; - f16_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__f16c_u16; - f16_vmin_config.linear.element_tile = 16; + f16_vmin_config.op_ukernel = 
(xnn_vbinary_ukernel_fn) xnn_f16_vmin_ukernel__f16c_u16; + f16_vmin_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__f16c_u16; + f16_vmin_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__f16c_u16; + f16_vmin_config.element_tile = 16; } #endif } @@ -232,40 +222,36 @@ static void init_f16_vmul_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmul_minmax_ukernel__neonfp16arith_u16; - f16_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_u16; - f16_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_u16; - f16_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vmul_config.minmax.element_tile = 16; + f16_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmul_ukernel__neonfp16arith_u16; + f16_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_ukernel__neonfp16arith_u16; + f16_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_ukernel__neonfp16arith_u16; + f16_vmul_config.element_tile = 16; } #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmul_minmax_ukernel__neonfp16arith_u16; - f16_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_u16; - f16_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_u16; - f16_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vmul_config.minmax.element_tile = 16; + 
f16_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmul_ukernel__neonfp16arith_u16; + f16_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_ukernel__neonfp16arith_u16; + f16_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_ukernel__neonfp16arith_u16; + f16_vmul_config.element_tile = 16; } #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); #if XNN_ENABLE_AVX512FP16 if (hardware_config->use_x86_avx512fp16) { - f16_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmul_minmax_ukernel__avx512fp16_u64; - f16_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_minmax_ukernel__avx512fp16_u64; - f16_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_minmax_ukernel__avx512fp16_u64; - f16_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vmul_config.minmax.element_tile = 64; + f16_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmul_ukernel__avx512fp16_u64; + f16_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_ukernel__avx512fp16_u64; + f16_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_ukernel__avx512fp16_u64; + f16_vmul_config.element_tile = 64; } else #endif if (hardware_config->use_x86_f16c) { - f16_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmul_minmax_ukernel__f16c_u16; - f16_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_minmax_ukernel__f16c_u16; - f16_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_minmax_ukernel__f16c_u16; - f16_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vmul_config.minmax.element_tile = 16; + f16_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmul_ukernel__f16c_u16; + f16_vmul_config.opc_ukernel = 
(xnn_vbinary_ukernel_fn) xnn_f16_vmulc_ukernel__f16c_u16; + f16_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_ukernel__f16c_u16; + f16_vmul_config.element_tile = 16; } #endif } @@ -275,40 +261,36 @@ static void init_f16_vsub_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsub_minmax_ukernel__neonfp16arith_u16; - f16_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsubc_minmax_ukernel__neonfp16arith_u16; - f16_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrsubc_minmax_ukernel__neonfp16arith_u16; - f16_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vsub_config.minmax.element_tile = 16; + f16_vsub_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsub_ukernel__neonfp16arith_u16; + f16_vsub_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsubc_ukernel__neonfp16arith_u16; + f16_vsub_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrsubc_ukernel__neonfp16arith_u16; + f16_vsub_config.element_tile = 16; } #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsub_minmax_ukernel__neonfp16arith_u16; - f16_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsubc_minmax_ukernel__neonfp16arith_u16; - f16_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrsubc_minmax_ukernel__neonfp16arith_u16; - f16_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vsub_config.minmax.element_tile = 16; + f16_vsub_config.op_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f16_vsub_ukernel__neonfp16arith_u16; + f16_vsub_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsubc_ukernel__neonfp16arith_u16; + f16_vsub_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrsubc_ukernel__neonfp16arith_u16; + f16_vsub_config.element_tile = 16; } #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); #if XNN_ENABLE_AVX512FP16 if (hardware_config->use_x86_avx512fp16) { - f16_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsub_minmax_ukernel__avx512fp16_u64; - f16_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsubc_minmax_ukernel__avx512fp16_u64; - f16_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrsubc_minmax_ukernel__avx512fp16_u64; - f16_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vsub_config.minmax.element_tile = 64; + f16_vsub_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsub_ukernel__avx512fp16_u64; + f16_vsub_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsubc_ukernel__avx512fp16_u64; + f16_vsub_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrsubc_ukernel__avx512fp16_u64; + f16_vsub_config.element_tile = 64; } else #endif if (hardware_config->use_x86_f16c) { - f16_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsub_minmax_ukernel__f16c_u16; - f16_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsubc_minmax_ukernel__f16c_u16; - f16_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrsubc_minmax_ukernel__f16c_u16; - f16_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vsub_config.minmax.element_tile = 16; + f16_vsub_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsub_ukernel__f16c_u16; + f16_vsub_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsubc_ukernel__f16c_u16; + 
f16_vsub_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrsubc_ukernel__f16c_u16; + f16_vsub_config.element_tile = 16; } #endif } @@ -318,40 +300,36 @@ static void init_f16_vsqrdiff_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiff_ukernel__neonfp16arith_u16; - f16_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_u16; - f16_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_u16; - f16_vsqrdiff_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vsqrdiff_config.linear.element_tile = 16; + f16_vsqrdiff_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiff_ukernel__neonfp16arith_u16; + f16_vsqrdiff_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_u16; + f16_vsqrdiff_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_u16; + f16_vsqrdiff_config.element_tile = 16; } #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiff_ukernel__neonfp16arith_u16; - f16_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_u16; - f16_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_u16; - f16_vsqrdiff_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vsqrdiff_config.linear.element_tile = 16; + f16_vsqrdiff_config.op_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f16_vsqrdiff_ukernel__neonfp16arith_u16; + f16_vsqrdiff_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_u16; + f16_vsqrdiff_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_u16; + f16_vsqrdiff_config.element_tile = 16; } #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); #if XNN_ENABLE_AVX512FP16 if (hardware_config->use_x86_avx512fp16) { - f16_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiff_ukernel__avx512fp16_u64; - f16_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__avx512fp16_u64; - f16_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__avx512fp16_u64; - f16_vsqrdiff_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vsqrdiff_config.linear.element_tile = 64; + f16_vsqrdiff_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiff_ukernel__avx512fp16_u64; + f16_vsqrdiff_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__avx512fp16_u64; + f16_vsqrdiff_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__avx512fp16_u64; + f16_vsqrdiff_config.element_tile = 64; } else #endif if (hardware_config->use_x86_f16c) { - f16_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiff_ukernel__f16c_u16; - f16_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__f16c_u16; - f16_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__f16c_u16; - f16_vsqrdiff_config.init = (xnn_init_binary_params_fn) xnn_init_f16_minmax_binary_params; - f16_vsqrdiff_config.linear.element_tile = 16; + f16_vsqrdiff_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiff_ukernel__f16c_u16; + 
f16_vsqrdiff_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__f16c_u16; + f16_vsqrdiff_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__f16c_u16; + f16_vsqrdiff_config.element_tile = 16; } #endif } @@ -361,89 +339,63 @@ static void init_f32_vadd_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon){ - f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__neon_u8; - f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__neon_u8; - f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__neon_u8; - f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vadd_config.minmax.element_tile = 8; + f32_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_ukernel__neon_u8; + f32_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__neon_u8; + f32_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__neon_u8; + f32_vadd_config.element_tile = 8; } else if (!XNN_PLATFORM_MOBILE) { - f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__scalar_u8; - f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__scalar_u8; - f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__scalar_u8; - f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vadd_config.minmax.element_tile = 8; + f32_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_ukernel__scalar_u8; + f32_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__scalar_u8; + f32_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__scalar_u8; + f32_vadd_config.element_tile = 8; } #elif 
XNN_ARCH_ARM64 - f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__neon_u8; - f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__neon_u8; - f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__neon_u8; - f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vadd_config.minmax.element_tile = 8; + f32_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_ukernel__neon_u8; + f32_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__neon_u8; + f32_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__neon_u8; + f32_vadd_config.element_tile = 8; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { - f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__avx512f_u32; - f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx512f_u32; - f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx512f_u32; - f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vadd_config.minmax.element_tile = 32; + f32_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_ukernel__avx512f_u32; + f32_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__avx512f_u32; + f32_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__avx512f_u32; + f32_vadd_config.element_tile = 32; } else if (hardware_config->use_x86_avx) { - f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__avx_u16; - f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx_u16; - 
f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx_u16; - f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vadd_config.minmax.element_tile = 16; + f32_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_ukernel__avx_u16; + f32_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__avx_u16; + f32_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__avx_u16; + f32_vadd_config.element_tile = 16; } else { - f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__sse_u8; - f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__sse_u8; - f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__sse_u8; - f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vadd_config.minmax.element_tile = 8; + f32_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_ukernel__sse_u8; + f32_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__sse_u8; + f32_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__sse_u8; + f32_vadd_config.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); - if (hardware_config->is_x86) { - f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_u16; - f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_u16; - f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_u16; - f32_vadd_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_ukernel__wasmsimd_u16; - f32_vadd_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vaddc_ukernel__wasmsimd_u16; - f32_vadd_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__wasmsimd_u16; - f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vadd_config.minmax.element_tile = 16; - f32_vadd_config.linear.element_tile = 16; - } else { - f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_u16; - f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_u16; - f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_u16; - f32_vadd_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_ukernel__wasmsimd_u16; - f32_vadd_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__wasmsimd_u16; - f32_vadd_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__wasmsimd_u16; - f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vadd_config.minmax.element_tile = 16; - f32_vadd_config.linear.element_tile = 16; - } + f32_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_ukernel__wasmsimd_u16; + f32_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__wasmsimd_u16; + f32_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__wasmsimd_u16; + f32_vadd_config.element_tile = 16; #elif XNN_ARCH_WASM - f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__wasm_u8; - f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__wasm_u8; - f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__wasm_u8; - f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vadd_config.minmax.element_tile = 8; + f32_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vadd_ukernel__wasm_u8; + f32_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__wasm_u8; + f32_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__wasm_u8; + f32_vadd_config.element_tile = 8; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__rvv_u8v; - f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__rvv_u8v; - f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__rvv_u8v; - f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vadd_config.minmax.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) + f32_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_ukernel__rvv_u8v; + f32_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__rvv_u8v; + f32_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__rvv_u8v; + f32_vadd_config.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) #else - f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__scalar_u8; - f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__scalar_u8; - f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__scalar_u8; - f32_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vadd_config.minmax.element_tile = 8; + f32_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_ukernel__scalar_u8; + f32_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__scalar_u8; + f32_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__scalar_u8; + f32_vadd_config.element_tile = 8; #endif } @@ 
-452,51 +404,51 @@ static void init_f32_vcopysign_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon){ - f32_vcopysign_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysign_ukernel__neon_u8; - f32_vcopysign_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysignc_ukernel__neon_u8; - f32_vcopysign_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrcopysignc_ukernel__neon_u8; - f32_vcopysign_config.linear.element_tile = 2; + f32_vcopysign_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysign_ukernel__neon_u8; + f32_vcopysign_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysignc_ukernel__neon_u8; + f32_vcopysign_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrcopysignc_ukernel__neon_u8; + f32_vcopysign_config.element_tile = 2; } else if (!XNN_PLATFORM_MOBILE) { - f32_vcopysign_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysign_ukernel__scalar_u2; - f32_vcopysign_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysignc_ukernel__scalar_u2; - f32_vcopysign_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrcopysignc_ukernel__scalar_u2; - f32_vcopysign_config.linear.element_tile = 2; + f32_vcopysign_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysign_ukernel__scalar_u2; + f32_vcopysign_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysignc_ukernel__scalar_u2; + f32_vcopysign_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrcopysignc_ukernel__scalar_u2; + f32_vcopysign_config.element_tile = 2; } #elif XNN_ARCH_ARM64 - f32_vcopysign_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysign_ukernel__neon_u8; - f32_vcopysign_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysignc_ukernel__neon_u8; - f32_vcopysign_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vrcopysignc_ukernel__neon_u8; - f32_vcopysign_config.linear.element_tile = 8; + f32_vcopysign_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysign_ukernel__neon_u8; + f32_vcopysign_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysignc_ukernel__neon_u8; + f32_vcopysign_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrcopysignc_ukernel__neon_u8; + f32_vcopysign_config.element_tile = 8; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { - f32_vcopysign_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysign_ukernel__avx512f_u32; - f32_vcopysign_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysignc_ukernel__avx512f_u32; - f32_vcopysign_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrcopysignc_ukernel__avx512f_u32; - f32_vcopysign_config.linear.element_tile = 32; + f32_vcopysign_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysign_ukernel__avx512f_u32; + f32_vcopysign_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysignc_ukernel__avx512f_u32; + f32_vcopysign_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrcopysignc_ukernel__avx512f_u32; + f32_vcopysign_config.element_tile = 32; } else if (hardware_config->use_x86_avx) { - f32_vcopysign_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysign_ukernel__avx_u16; - f32_vcopysign_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysignc_ukernel__avx_u16; - f32_vcopysign_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrcopysignc_ukernel__avx_u16; - f32_vcopysign_config.linear.element_tile = 16; + f32_vcopysign_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysign_ukernel__avx_u16; + f32_vcopysign_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysignc_ukernel__avx_u16; + 
f32_vcopysign_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrcopysignc_ukernel__avx_u16; + f32_vcopysign_config.element_tile = 16; } else { - f32_vcopysign_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysign_ukernel__sse2_u8; - f32_vcopysign_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysignc_ukernel__sse2_u8; - f32_vcopysign_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrcopysignc_ukernel__sse2_u8; - f32_vcopysign_config.linear.element_tile = 8; + f32_vcopysign_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysign_ukernel__sse2_u8; + f32_vcopysign_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysignc_ukernel__sse2_u8; + f32_vcopysign_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrcopysignc_ukernel__sse2_u8; + f32_vcopysign_config.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - f32_vcopysign_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysign_ukernel__wasmsimd_u16; - f32_vcopysign_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysignc_ukernel__wasmsimd_u16; - f32_vcopysign_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrcopysignc_ukernel__wasmsimd_u16; - f32_vcopysign_config.linear.element_tile = 16; - f32_vcopysign_config.linear.element_tile = 16; + f32_vcopysign_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysign_ukernel__wasmsimd_u16; + f32_vcopysign_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysignc_ukernel__wasmsimd_u16; + f32_vcopysign_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrcopysignc_ukernel__wasmsimd_u16; + f32_vcopysign_config.element_tile = 16; + f32_vcopysign_config.element_tile = 16; #else - f32_vcopysign_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysign_ukernel__scalar_u2; - f32_vcopysign_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysignc_ukernel__scalar_u2; - f32_vcopysign_config.linear.ropc_ukernel = 
(xnn_vbinary_ukernel_fn) xnn_f32_vrcopysignc_ukernel__scalar_u2; - f32_vcopysign_config.linear.element_tile = 2; + f32_vcopysign_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysign_ukernel__scalar_u2; + f32_vcopysign_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vcopysignc_ukernel__scalar_u2; + f32_vcopysign_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrcopysignc_ukernel__scalar_u2; + f32_vcopysign_config.element_tile = 2; #endif } @@ -506,53 +458,53 @@ static void init_s32_vmul_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon) { - s32_vmul_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmul_ukernel__neon_u8; - s32_vmul_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__neon_u8; - s32_vmul_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__neon_u8; - s32_vmul_config.linear.element_tile = 8; + s32_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmul_ukernel__neon_u8; + s32_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__neon_u8; + s32_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__neon_u8; + s32_vmul_config.element_tile = 8; } else if (!XNN_PLATFORM_MOBILE) { - s32_vmul_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmul_ukernel__scalar_u2; - s32_vmul_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__scalar_u2; - s32_vmul_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__scalar_u2; - s32_vmul_config.linear.element_tile = 2; + s32_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmul_ukernel__scalar_u2; + s32_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__scalar_u2; + s32_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__scalar_u2; + s32_vmul_config.element_tile = 2; } #elif 
XNN_ARCH_ARM64 - s32_vmul_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmul_ukernel__neon_u8; - s32_vmul_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__neon_u8; - s32_vmul_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__neon_u8; - s32_vmul_config.linear.element_tile = 8; + s32_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmul_ukernel__neon_u8; + s32_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__neon_u8; + s32_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__neon_u8; + s32_vmul_config.element_tile = 8; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { - s32_vmul_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmul_ukernel__avx512f_u32; - s32_vmul_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__avx512f_u32; - s32_vmul_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__avx512f_u32; - s32_vmul_config.linear.element_tile = 32; + s32_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmul_ukernel__avx512f_u32; + s32_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__avx512f_u32; + s32_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__avx512f_u32; + s32_vmul_config.element_tile = 32; } else if (hardware_config->use_x86_avx2) { - s32_vmul_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmul_ukernel__avx2_u16; - s32_vmul_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__avx2_u16; - s32_vmul_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__avx2_u16; - s32_vmul_config.linear.element_tile = 16; + s32_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmul_ukernel__avx2_u16; + 
s32_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__avx2_u16; + s32_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__avx2_u16; + s32_vmul_config.element_tile = 16; } else { - s32_vmul_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmul_ukernel__sse41_u8; - s32_vmul_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__sse41_u8; - s32_vmul_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__sse41_u8; - s32_vmul_config.linear.element_tile = 8; + s32_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmul_ukernel__sse41_u8; + s32_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__sse41_u8; + s32_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__sse41_u8; + s32_vmul_config.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - s32_vmul_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmul_ukernel__wasmsimd_u16; - s32_vmul_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__wasmsimd_u16; - s32_vmul_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__wasmsimd_u16; - s32_vmul_config.linear.element_tile = 16; + s32_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmul_ukernel__wasmsimd_u16; + s32_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__wasmsimd_u16; + s32_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__wasmsimd_u16; + s32_vmul_config.element_tile = 16; #else - s32_vmul_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmul_ukernel__scalar_u2; - s32_vmul_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__scalar_u2; - s32_vmul_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__scalar_u2; - s32_vmul_config.linear.element_tile = 2; + s32_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_s32_vmul_ukernel__scalar_u2; + s32_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__scalar_u2; + s32_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vmulc_ukernel__scalar_u2; + s32_vmul_config.element_tile = 2; #endif } @@ -561,89 +513,63 @@ static void init_f32_vdiv_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon){ - f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__scalar_u2; - f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__scalar_u2; - f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__scalar_u2; - f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vdiv_config.minmax.element_tile = 2; + f32_vdiv_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_ukernel__scalar_u2; + f32_vdiv_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_ukernel__scalar_u2; + f32_vdiv_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_ukernel__scalar_u2; + f32_vdiv_config.element_tile = 2; } else if (!XNN_PLATFORM_MOBILE) { - f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__scalar_u2; - f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__scalar_u2; - f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__scalar_u2; - f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vdiv_config.minmax.element_tile = 2; + f32_vdiv_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_ukernel__scalar_u2; + f32_vdiv_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_ukernel__scalar_u2; + f32_vdiv_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_ukernel__scalar_u2; + 
f32_vdiv_config.element_tile = 2; } #elif XNN_ARCH_ARM64 - f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__aarch64_neon_u8; - f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__aarch64_neon_u8; - f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__aarch64_neon_u8; - f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vdiv_config.minmax.element_tile = 8; + f32_vdiv_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_ukernel__aarch64_neon_u8; + f32_vdiv_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_ukernel__aarch64_neon_u8; + f32_vdiv_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_ukernel__aarch64_neon_u8; + f32_vdiv_config.element_tile = 8; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { - f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__avx512f_u32; - f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__avx512f_u32; - f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__avx512f_u32; - f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vdiv_config.minmax.element_tile = 32; + f32_vdiv_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_ukernel__avx512f_u32; + f32_vdiv_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_ukernel__avx512f_u32; + f32_vdiv_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_ukernel__avx512f_u32; + f32_vdiv_config.element_tile = 32; } else if (hardware_config->use_x86_avx) { - f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__avx_u16; - 
f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__avx_u16; - f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__avx_u16; - f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vdiv_config.minmax.element_tile = 16; + f32_vdiv_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_ukernel__avx_u16; + f32_vdiv_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_ukernel__avx_u16; + f32_vdiv_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_ukernel__avx_u16; + f32_vdiv_config.element_tile = 16; } else { - f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__sse_u8; - f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__sse_u8; - f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__sse_u8; - f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vdiv_config.minmax.element_tile = 8; + f32_vdiv_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_ukernel__sse_u8; + f32_vdiv_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_ukernel__sse_u8; + f32_vdiv_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_ukernel__sse_u8; + f32_vdiv_config.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); - if (hardware_config->is_x86) { - f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_u16; - f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_u16; - f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_u16; - f32_vdiv_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vdiv_ukernel__wasmsimd_u16; - f32_vdiv_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_ukernel__wasmsimd_u16; - f32_vdiv_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_ukernel__wasmsimd_u16; - f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vdiv_config.minmax.element_tile = 16; - f32_vdiv_config.linear.element_tile = 16; - } else { - f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_u16; - f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_u16; - f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_u16; - f32_vdiv_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_ukernel__wasmsimd_u16; - f32_vdiv_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_ukernel__wasmsimd_u16; - f32_vdiv_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_ukernel__wasmsimd_u16; - f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vdiv_config.minmax.element_tile = 16; - f32_vdiv_config.linear.element_tile = 16; - } + f32_vdiv_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_ukernel__wasmsimd_u16; + f32_vdiv_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_ukernel__wasmsimd_u16; + f32_vdiv_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_ukernel__wasmsimd_u16; + f32_vdiv_config.element_tile = 16; #elif XNN_ARCH_WASM - f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__wasm_u8; - f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__wasm_u8; - f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__wasm_u8; - f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - 
f32_vdiv_config.minmax.element_tile = 8; + f32_vdiv_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_ukernel__wasm_u8; + f32_vdiv_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_ukernel__wasm_u8; + f32_vdiv_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_ukernel__wasm_u8; + f32_vdiv_config.element_tile = 8; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__rvv_u8v; - f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__rvv_u8v; - f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__rvv_u8v; - f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vdiv_config.minmax.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) + f32_vdiv_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_ukernel__rvv_u8v; + f32_vdiv_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_ukernel__rvv_u8v; + f32_vdiv_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_ukernel__rvv_u8v; + f32_vdiv_config.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) #else - f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__scalar_u2; - f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__scalar_u2; - f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__scalar_u2; - f32_vdiv_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vdiv_config.minmax.element_tile = 2; + f32_vdiv_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_ukernel__scalar_u2; + f32_vdiv_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_ukernel__scalar_u2; + f32_vdiv_config.ropc_ukernel = 
(xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_ukernel__scalar_u2; + f32_vdiv_config.element_tile = 2; #endif } @@ -652,70 +578,70 @@ static void init_f32_vmax_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon){ - f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__neon_u8; - f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__neon_u8; - f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__neon_u8; - f32_vmax_config.linear.element_tile = 8; + f32_vmax_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__neon_u8; + f32_vmax_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__neon_u8; + f32_vmax_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__neon_u8; + f32_vmax_config.element_tile = 8; } else if (!XNN_PLATFORM_MOBILE) { - f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__scalar_u8; - f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__scalar_u8; - f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__scalar_u8; - f32_vmax_config.linear.element_tile = 8; + f32_vmax_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__scalar_u8; + f32_vmax_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__scalar_u8; + f32_vmax_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__scalar_u8; + f32_vmax_config.element_tile = 8; } #elif XNN_ARCH_ARM64 - f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__neon_u8; - f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__neon_u8; - f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__neon_u8; - f32_vmax_config.linear.element_tile = 8; + 
f32_vmax_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__neon_u8; + f32_vmax_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__neon_u8; + f32_vmax_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__neon_u8; + f32_vmax_config.element_tile = 8; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { - f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__avx512f_u32; - f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx512f_u32; - f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx512f_u32; - f32_vmax_config.linear.element_tile = 32; + f32_vmax_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__avx512f_u32; + f32_vmax_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx512f_u32; + f32_vmax_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx512f_u32; + f32_vmax_config.element_tile = 32; } else if (hardware_config->use_x86_avx) { - f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__avx_u16; - f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx_u16; - f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx_u16; - f32_vmax_config.linear.element_tile = 16; + f32_vmax_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__avx_u16; + f32_vmax_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx_u16; + f32_vmax_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx_u16; + f32_vmax_config.element_tile = 16; } else { - f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__sse_u8; - f32_vmax_config.linear.opc_ukernel = 
(xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__sse_u8; - f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__sse_u8; - f32_vmax_config.linear.element_tile = 8; + f32_vmax_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__sse_u8; + f32_vmax_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__sse_u8; + f32_vmax_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__sse_u8; + f32_vmax_config.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->is_x86) { - f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__wasmsimd_x86_u16; - f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasmsimd_x86_u16; - f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasmsimd_x86_u16; - f32_vmax_config.linear.element_tile = 16; + f32_vmax_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__wasmsimd_x86_u16; + f32_vmax_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasmsimd_x86_u16; + f32_vmax_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasmsimd_x86_u16; + f32_vmax_config.element_tile = 16; } else { - f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__wasmsimd_arm_u16; - f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasmsimd_arm_u16; - f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasmsimd_arm_u16; - f32_vmax_config.linear.element_tile = 16; + f32_vmax_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__wasmsimd_arm_u16; + f32_vmax_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasmsimd_arm_u16; + f32_vmax_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vmaxc_ukernel__wasmsimd_arm_u16; + f32_vmax_config.element_tile = 16; } #elif XNN_ARCH_WASM - f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__wasm_u8; - f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasm_u8; - f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasm_u8; - f32_vmax_config.linear.element_tile = 8; + f32_vmax_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__wasm_u8; + f32_vmax_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasm_u8; + f32_vmax_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasm_u8; + f32_vmax_config.element_tile = 8; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__rvv_u8v; - f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__rvv_u8v; - f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__rvv_u8v; - f32_vmax_config.linear.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) + f32_vmax_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__rvv_u8v; + f32_vmax_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__rvv_u8v; + f32_vmax_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__rvv_u8v; + f32_vmax_config.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) #else - f32_vmax_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__scalar_u8; - f32_vmax_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__scalar_u8; - f32_vmax_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__scalar_u8; - f32_vmax_config.linear.element_tile = 8; + f32_vmax_config.op_ukernel = 
(xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__scalar_u8; + f32_vmax_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__scalar_u8; + f32_vmax_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__scalar_u8; + f32_vmax_config.element_tile = 8; #endif } @@ -724,70 +650,70 @@ static void init_f32_vmin_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon){ - f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__neon_u8; - f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__neon_u8; - f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__neon_u8; - f32_vmin_config.linear.element_tile = 8; + f32_vmin_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__neon_u8; + f32_vmin_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__neon_u8; + f32_vmin_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__neon_u8; + f32_vmin_config.element_tile = 8; } else if (!XNN_PLATFORM_MOBILE) { - f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__scalar_u8; - f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__scalar_u8; - f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__scalar_u8; - f32_vmin_config.linear.element_tile = 8; + f32_vmin_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__scalar_u8; + f32_vmin_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__scalar_u8; + f32_vmin_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__scalar_u8; + f32_vmin_config.element_tile = 8; } #elif XNN_ARCH_ARM64 - f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__neon_u8; - f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vminc_ukernel__neon_u8; - f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__neon_u8; - f32_vmin_config.linear.element_tile = 8; + f32_vmin_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__neon_u8; + f32_vmin_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__neon_u8; + f32_vmin_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__neon_u8; + f32_vmin_config.element_tile = 8; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { - f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__avx512f_u32; - f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx512f_u32; - f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx512f_u32; - f32_vmin_config.linear.element_tile = 32; + f32_vmin_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__avx512f_u32; + f32_vmin_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx512f_u32; + f32_vmin_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx512f_u32; + f32_vmin_config.element_tile = 32; } else if (hardware_config->use_x86_avx) { - f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__avx_u16; - f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx_u16; - f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx_u16; - f32_vmin_config.linear.element_tile = 16; + f32_vmin_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__avx_u16; + f32_vmin_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx_u16; + f32_vmin_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx_u16; + 
f32_vmin_config.element_tile = 16; } else { - f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__sse_u8; - f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__sse_u8; - f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__sse_u8; - f32_vmin_config.linear.element_tile = 8; + f32_vmin_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__sse_u8; + f32_vmin_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__sse_u8; + f32_vmin_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__sse_u8; + f32_vmin_config.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->is_x86) { - f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__wasmsimd_x86_u16; - f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasmsimd_x86_u16; - f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasmsimd_x86_u16; - f32_vmin_config.linear.element_tile = 16; + f32_vmin_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__wasmsimd_x86_u16; + f32_vmin_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasmsimd_x86_u16; + f32_vmin_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasmsimd_x86_u16; + f32_vmin_config.element_tile = 16; } else { - f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__wasmsimd_arm_u16; - f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasmsimd_arm_u16; - f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasmsimd_arm_u16; - f32_vmin_config.linear.element_tile = 16; + f32_vmin_config.op_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vmin_ukernel__wasmsimd_arm_u16; + f32_vmin_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasmsimd_arm_u16; + f32_vmin_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasmsimd_arm_u16; + f32_vmin_config.element_tile = 16; } #elif XNN_ARCH_WASM - f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__wasm_u8; - f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasm_u8; - f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasm_u8; - f32_vmin_config.linear.element_tile = 8; + f32_vmin_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__wasm_u8; + f32_vmin_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasm_u8; + f32_vmin_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasm_u8; + f32_vmin_config.element_tile = 8; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__rvv_u8v; - f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__rvv_u8v; - f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__rvv_u8v; - f32_vmin_config.linear.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) + f32_vmin_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__rvv_u8v; + f32_vmin_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__rvv_u8v; + f32_vmin_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__rvv_u8v; + f32_vmin_config.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) #else - f32_vmin_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__scalar_u8; - f32_vmin_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__scalar_u8; 
- f32_vmin_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__scalar_u8; - f32_vmin_config.linear.element_tile = 8; + f32_vmin_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__scalar_u8; + f32_vmin_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__scalar_u8; + f32_vmin_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__scalar_u8; + f32_vmin_config.element_tile = 8; #endif } @@ -796,89 +722,63 @@ static void init_f32_vmul_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon){ - f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__neon_u8; - f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__neon_u8; - f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__neon_u8; - f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vmul_config.minmax.element_tile = 8; + f32_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_ukernel__neon_u8; + f32_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__neon_u8; + f32_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__neon_u8; + f32_vmul_config.element_tile = 8; } else if (!XNN_PLATFORM_MOBILE) { - f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__scalar_u8; - f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__scalar_u8; - f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__scalar_u8; - f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vmul_config.minmax.element_tile = 8; + f32_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_ukernel__scalar_u8; + 
f32_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__scalar_u8; + f32_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__scalar_u8; + f32_vmul_config.element_tile = 8; } #elif XNN_ARCH_ARM64 - f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__neon_u8; - f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__neon_u8; - f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__neon_u8; - f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vmul_config.minmax.element_tile = 8; + f32_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_ukernel__neon_u8; + f32_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__neon_u8; + f32_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__neon_u8; + f32_vmul_config.element_tile = 8; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { - f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__avx512f_u32; - f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx512f_u32; - f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx512f_u32; - f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vmul_config.minmax.element_tile = 32; + f32_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_ukernel__avx512f_u32; + f32_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__avx512f_u32; + f32_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__avx512f_u32; + f32_vmul_config.element_tile = 32; } else if 
(hardware_config->use_x86_avx) { - f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__avx_u16; - f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx_u16; - f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx_u16; - f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vmul_config.minmax.element_tile = 16; + f32_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_ukernel__avx_u16; + f32_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__avx_u16; + f32_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__avx_u16; + f32_vmul_config.element_tile = 16; } else { - f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__sse_u8; - f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__sse_u8; - f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__sse_u8; - f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vmul_config.minmax.element_tile = 8; + f32_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_ukernel__sse_u8; + f32_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__sse_u8; + f32_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__sse_u8; + f32_vmul_config.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); - if (hardware_config->is_x86) { - f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_u16; - f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_u16; - f32_vmul_config.minmax.ropc_ukernel = 
(xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_u16; - f32_vmul_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_ukernel__wasmsimd_u16; - f32_vmul_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__wasmsimd_u16; - f32_vmul_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__wasmsimd_u16; - f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vmul_config.minmax.element_tile = 16; - f32_vmul_config.linear.element_tile = 16; - } else { - f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_u16; - f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_u16; - f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_u16; - f32_vmul_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_ukernel__wasmsimd_u16; - f32_vmul_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__wasmsimd_u16; - f32_vmul_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__wasmsimd_u16; - f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vmul_config.minmax.element_tile = 16; - f32_vmul_config.linear.element_tile = 16; - } + f32_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_ukernel__wasmsimd_u16; + f32_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__wasmsimd_u16; + f32_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__wasmsimd_u16; + f32_vmul_config.element_tile = 16; #elif XNN_ARCH_WASM - f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__wasm_u8; - f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__wasm_u8; - f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vmulc_minmax_ukernel__wasm_u8; - f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vmul_config.minmax.element_tile = 8; + f32_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_ukernel__wasm_u8; + f32_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__wasm_u8; + f32_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__wasm_u8; + f32_vmul_config.element_tile = 8; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__rvv_u8v; - f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__rvv_u8v; - f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__rvv_u8v; - f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vmul_config.minmax.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) + f32_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_ukernel__rvv_u8v; + f32_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__rvv_u8v; + f32_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__rvv_u8v; + f32_vmul_config.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) #else - f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__scalar_u8; - f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__scalar_u8; - f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__scalar_u8; - f32_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vmul_config.minmax.element_tile = 8; + f32_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_ukernel__scalar_u8; + 
f32_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__scalar_u8; + f32_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__scalar_u8; + f32_vmul_config.element_tile = 8; #endif } @@ -887,89 +787,63 @@ static void init_f32_vsub_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon){ - f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__neon_u8; - f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__neon_u8; - f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__neon_u8; - f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vsub_config.minmax.element_tile = 8; + f32_vsub_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_ukernel__neon_u8; + f32_vsub_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_ukernel__neon_u8; + f32_vsub_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_ukernel__neon_u8; + f32_vsub_config.element_tile = 8; } else if (!XNN_PLATFORM_MOBILE) { - f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__scalar_u8; - f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__scalar_u8; - f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__scalar_u8; - f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vsub_config.minmax.element_tile = 8; + f32_vsub_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_ukernel__scalar_u8; + f32_vsub_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_ukernel__scalar_u8; + f32_vsub_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_ukernel__scalar_u8; + f32_vsub_config.element_tile = 8; } #elif XNN_ARCH_ARM64 - 
f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__neon_u8; - f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__neon_u8; - f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__neon_u8; - f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vsub_config.minmax.element_tile = 8; + f32_vsub_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_ukernel__neon_u8; + f32_vsub_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_ukernel__neon_u8; + f32_vsub_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_ukernel__neon_u8; + f32_vsub_config.element_tile = 8; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { - f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__avx512f_u32; - f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__avx512f_u32; - f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__avx512f_u32; - f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vsub_config.minmax.element_tile = 32; + f32_vsub_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_ukernel__avx512f_u32; + f32_vsub_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_ukernel__avx512f_u32; + f32_vsub_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_ukernel__avx512f_u32; + f32_vsub_config.element_tile = 32; } else if (hardware_config->use_x86_avx) { - f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__avx_u16; - f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__avx_u16; - 
f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__avx_u16; - f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vsub_config.minmax.element_tile = 16; + f32_vsub_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_ukernel__avx_u16; + f32_vsub_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_ukernel__avx_u16; + f32_vsub_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_ukernel__avx_u16; + f32_vsub_config.element_tile = 16; } else { - f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__sse_u8; - f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__sse_u8; - f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__sse_u8; - f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vsub_config.minmax.element_tile = 8; + f32_vsub_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_ukernel__sse_u8; + f32_vsub_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_ukernel__sse_u8; + f32_vsub_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_ukernel__sse_u8; + f32_vsub_config.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); - if (hardware_config->is_x86) { - f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_u16; - f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_u16; - f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_u16; - f32_vsub_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_ukernel__wasmsimd_u16; - f32_vsub_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vsubc_ukernel__wasmsimd_u16; - f32_vsub_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_ukernel__wasmsimd_u16; - f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vsub_config.minmax.element_tile = 16; - f32_vsub_config.linear.element_tile = 16; - } else { - f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_u16; - f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_u16; - f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_u16; - f32_vsub_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_ukernel__wasmsimd_u16; - f32_vsub_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_ukernel__wasmsimd_u16; - f32_vsub_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_ukernel__wasmsimd_u16; - f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vsub_config.minmax.element_tile = 16; - f32_vsub_config.linear.element_tile = 16; - } + f32_vsub_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_ukernel__wasmsimd_u16; + f32_vsub_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_ukernel__wasmsimd_u16; + f32_vsub_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_ukernel__wasmsimd_u16; + f32_vsub_config.element_tile = 16; #elif XNN_ARCH_WASM - f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__wasm_u8; - f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__wasm_u8; - f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__wasm_u8; - f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vsub_config.minmax.element_tile = 8; + f32_vsub_config.op_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_f32_vsub_ukernel__wasm_u8; + f32_vsub_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_ukernel__wasm_u8; + f32_vsub_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_ukernel__wasm_u8; + f32_vsub_config.element_tile = 8; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__rvv_u8v; - f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__rvv_u8v; - f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__rvv_u8v; - f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vsub_config.minmax.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) + f32_vsub_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_ukernel__rvv_u8v; + f32_vsub_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_ukernel__rvv_u8v; + f32_vsub_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_ukernel__rvv_u8v; + f32_vsub_config.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) #else - f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__scalar_u8; - f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__scalar_u8; - f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__scalar_u8; - f32_vsub_config.init = (xnn_init_binary_params_fn) xnn_init_f32_minmax_binary_params; - f32_vsub_config.minmax.element_tile = 8; + f32_vsub_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_ukernel__scalar_u8; + f32_vsub_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_ukernel__scalar_u8; + f32_vsub_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_ukernel__scalar_u8; + f32_vsub_config.element_tile = 8; #endif } 
@@ -978,56 +852,56 @@ static void init_f32_vsqrdiff_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon){ - f32_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__neon_u8; - f32_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__neon_u8; - f32_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__neon_u8; - f32_vsqrdiff_config.linear.element_tile = 8; + f32_vsqrdiff_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__neon_u8; + f32_vsqrdiff_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__neon_u8; + f32_vsqrdiff_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__neon_u8; + f32_vsqrdiff_config.element_tile = 8; } else if (!XNN_PLATFORM_MOBILE) { - f32_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__scalar_u8; - f32_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_u8; - f32_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_u8; - f32_vsqrdiff_config.linear.element_tile = 8; + f32_vsqrdiff_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__scalar_u8; + f32_vsqrdiff_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_u8; + f32_vsqrdiff_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_u8; + f32_vsqrdiff_config.element_tile = 8; } #elif XNN_ARCH_ARM64 - f32_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__neon_u8; - f32_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__neon_u8; - f32_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__neon_u8; - 
f32_vsqrdiff_config.linear.element_tile = 8; + f32_vsqrdiff_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__neon_u8; + f32_vsqrdiff_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__neon_u8; + f32_vsqrdiff_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__neon_u8; + f32_vsqrdiff_config.element_tile = 8; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { - f32_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__avx512f_u32; - f32_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx512f_u32; - f32_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx512f_u32; - f32_vsqrdiff_config.linear.element_tile = 32; + f32_vsqrdiff_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__avx512f_u32; + f32_vsqrdiff_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx512f_u32; + f32_vsqrdiff_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx512f_u32; + f32_vsqrdiff_config.element_tile = 32; } else if (hardware_config->use_x86_avx) { - f32_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__avx_u16; - f32_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx_u16; - f32_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx_u16; - f32_vsqrdiff_config.linear.element_tile = 16; + f32_vsqrdiff_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__avx_u16; + f32_vsqrdiff_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx_u16; + f32_vsqrdiff_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx_u16; + 
f32_vsqrdiff_config.element_tile = 16; } else { - f32_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__sse_u8; - f32_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__sse_u8; - f32_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__sse_u8; - f32_vsqrdiff_config.linear.element_tile = 8; + f32_vsqrdiff_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__sse_u8; + f32_vsqrdiff_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__sse_u8; + f32_vsqrdiff_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__sse_u8; + f32_vsqrdiff_config.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - f32_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__wasmsimd_u16; - f32_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__wasmsimd_u16; - f32_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__wasmsimd_u16; - f32_vsqrdiff_config.linear.element_tile = 16; + f32_vsqrdiff_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__wasmsimd_u16; + f32_vsqrdiff_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__wasmsimd_u16; + f32_vsqrdiff_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__wasmsimd_u16; + f32_vsqrdiff_config.element_tile = 16; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - f32_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__rvv_u8v; - f32_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__rvv_u8v; - f32_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__rvv_u8v; - f32_vsqrdiff_config.linear.element_tile = 
hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) + f32_vsqrdiff_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__rvv_u8v; + f32_vsqrdiff_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__rvv_u8v; + f32_vsqrdiff_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__rvv_u8v; + f32_vsqrdiff_config.element_tile = hardware_config->vlenb * 2; // VLENB * (8 / sizeof(float)) #else - f32_vsqrdiff_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__scalar_u8; - f32_vsqrdiff_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_u8; - f32_vsqrdiff_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_u8; - f32_vsqrdiff_config.linear.element_tile = 8; + f32_vsqrdiff_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__scalar_u8; + f32_vsqrdiff_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_u8; + f32_vsqrdiff_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_u8; + f32_vsqrdiff_config.element_tile = 8; #endif } @@ -1036,70 +910,70 @@ static void init_qs8_vadd_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon){ - qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__neon_ld64_u16; - qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_u16; - qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_u16; + qs8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__neon_ld64_u16; + qs8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_u16; + qs8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_qs8_vaddc_minmax_ukernel__neon_ld64_u16; qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; - qs8_vadd_config.minmax.element_tile = 16; + qs8_vadd_config.element_tile = 16; } else if (!XNN_PLATFORM_MOBILE) { - qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__scalar_u1; - qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__scalar_u1; - qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__scalar_u1; + qs8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__scalar_u1; + qs8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__scalar_u1; + qs8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__scalar_u1; qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; - qs8_vadd_config.minmax.element_tile = 1; + qs8_vadd_config.element_tile = 1; } #elif XNN_ARCH_ARM64 - qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__neon_ld64_u32; - qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_u32; - qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_u32; + qs8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__neon_ld64_u32; + qs8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_u32; + qs8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_u32; qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; - qs8_vadd_config.minmax.element_tile = 32; + qs8_vadd_config.element_tile = 32; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != 
NULL); if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { - qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_u16; - qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16; - qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16; + qs8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_u16; + qs8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16; + qs8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16; qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; - qs8_vadd_config.minmax.element_tile = 16; + qs8_vadd_config.element_tile = 16; } else if (hardware_config->use_x86_avx2) { - qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_u16; - qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16; - qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16; + qs8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_u16; + qs8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16; + qs8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16; qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; - qs8_vadd_config.minmax.element_tile = 16; + qs8_vadd_config.element_tile = 16; } else if (hardware_config->use_x86_avx) { - qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_u8; - 
qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_u8; - qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_u8; + qs8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_u8; + qs8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_u8; + qs8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_u8; qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; - qs8_vadd_config.minmax.element_tile = 8; + qs8_vadd_config.element_tile = 8; } else if (hardware_config->use_x86_sse4_1) { - qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_u8; - qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_u8; - qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_u8; + qs8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_u8; + qs8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_u8; + qs8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_u8; qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; - qs8_vadd_config.minmax.element_tile = 8; + qs8_vadd_config.element_tile = 8; } else { - qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_u8; - qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_u8; - qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_u8; + qs8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_u8; + qs8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_u8; + qs8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_u8; qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; - qs8_vadd_config.minmax.element_tile = 8; + qs8_vadd_config.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__wasmsimd_u32; - qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_u32; - qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_u32; + qs8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__wasmsimd_u32; + qs8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_u32; + qs8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_u32; qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; - qs8_vadd_config.minmax.element_tile = 32; + qs8_vadd_config.element_tile = 32; #else - qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__scalar_u4; - qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__scalar_u4; - qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__scalar_u4; + qs8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__scalar_u4; + qs8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__scalar_u4; + qs8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__scalar_u4; qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; - 
qs8_vadd_config.minmax.element_tile = 4; + qs8_vadd_config.element_tile = 4; #endif } @@ -1108,64 +982,64 @@ static void init_qs8_vmul_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon){ - qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_u16; - qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; - qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; + qs8_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_u16; + qs8_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; + qs8_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; qs8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_mul_minmax_rndnu_neon_params; - qs8_vmul_config.minmax.element_tile = 16; + qs8_vmul_config.element_tile = 16; } else if (!XNN_PLATFORM_MOBILE) { - qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_u4; - qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_u4; - qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_u4; + qs8_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_u4; + qs8_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_u4; + qs8_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_u4; qs8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_mul_minmax_scalar_params; - qs8_vmul_config.minmax.element_tile = 4; + qs8_vmul_config.element_tile = 4; } #elif 
XNN_ARCH_ARM64 - qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_u16; - qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; - qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; + qs8_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_u16; + qs8_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; + qs8_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; qs8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_mul_minmax_rndnu_neon_params; - qs8_vmul_config.minmax.element_tile = 16; + qs8_vmul_config.element_tile = 16; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_x86_avx) { - qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_u16; - qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_u16; - qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_u16; + qs8_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_u16; + qs8_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_u16; + qs8_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_u16; qs8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_mul_minmax_scalar_params; - qs8_vmul_config.minmax.element_tile = 16; + qs8_vmul_config.element_tile = 16; } else if (hardware_config->use_x86_sse4_1) { - qs8_vmul_config.minmax.op_ukernel = 
(xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_u16; - qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_u16; - qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_u16; + qs8_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_u16; + qs8_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_u16; + qs8_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_u16; qs8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_mul_minmax_scalar_params; - qs8_vmul_config.minmax.element_tile = 16; + qs8_vmul_config.element_tile = 16; } else { - qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_u8; - qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_u8; - qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_u8; + qs8_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_u8; + qs8_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_u8; + qs8_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_u8; qs8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_mul_minmax_scalar_params; - qs8_vmul_config.minmax.element_tile = 8; + qs8_vmul_config.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_u8; - qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_u8; - qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_u8; + qs8_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_u8; + qs8_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_u8; + qs8_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_u8; qs8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_mul_minmax_scalar_params; - qs8_vmul_config.minmax.element_tile = 8; + qs8_vmul_config.element_tile = 8; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR - qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__rvv_u2v; - qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__rvv_u2v; - qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__rvv_u2v; + qs8_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__rvv_u2v; + qs8_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__rvv_u2v; + qs8_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__rvv_u2v; qs8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_mul_minmax_scalar_params; - qs8_vmul_config.minmax.element_tile = 2; + qs8_vmul_config.element_tile = 2; #else - qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_u4; - qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_u4; - qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_u4; + qs8_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_u4; + 
qs8_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_u4; + qs8_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_u4; qs8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_mul_minmax_scalar_params; - qs8_vmul_config.minmax.element_tile = 4; + qs8_vmul_config.element_tile = 4; #endif } @@ -1174,70 +1048,70 @@ static void init_qu8_vadd_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon){ - qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__neon_ld64_u16; - qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_u16; - qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_u16; + qu8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__neon_ld64_u16; + qu8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_u16; + qu8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_u16; qu8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; - qu8_vadd_config.minmax.element_tile = 8; + qu8_vadd_config.element_tile = 8; } else if (!XNN_PLATFORM_MOBILE) { - qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__scalar_u1; - qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__scalar_u1; - qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__scalar_u1; + qu8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__scalar_u1; + qu8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__scalar_u1; + qu8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_qu8_vaddc_minmax_ukernel__scalar_u1; qu8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; - qu8_vadd_config.minmax.element_tile = 1; + qu8_vadd_config.element_tile = 1; } #elif XNN_ARCH_ARM64 - qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__neon_ld64_u32; - qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_u32; - qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_u32; + qu8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__neon_ld64_u32; + qu8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_u32; + qu8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_u32; qu8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; - qu8_vadd_config.minmax.element_tile = 8; + qu8_vadd_config.element_tile = 8; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { - qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_u16; - qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16; - qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16; + qu8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_u16; + qu8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16; + qu8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16; qu8_vadd_config.init = 
(xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; - qu8_vadd_config.minmax.element_tile = 16; + qu8_vadd_config.element_tile = 16; } else if (hardware_config->use_x86_avx2) { - qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_u16; - qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16; - qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16; + qu8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_u16; + qu8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16; + qu8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16; qu8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; - qu8_vadd_config.minmax.element_tile = 16; + qu8_vadd_config.element_tile = 16; } else if (hardware_config->use_x86_avx) { - qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__avx_mul32_ld32_u8; - qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_u8; - qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_u8; + qu8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__avx_mul32_ld32_u8; + qu8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_u8; + qu8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_u8; qu8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; - qu8_vadd_config.minmax.element_tile = 8; + qu8_vadd_config.element_tile = 8; } else if (hardware_config->use_x86_sse4_1) { - qu8_vadd_config.minmax.op_ukernel = 
(xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__sse41_mul16_ld64_u8; - qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_u8; - qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_u8; + qu8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__sse41_mul16_ld64_u8; + qu8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_u8; + qu8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_u8; qu8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; - qu8_vadd_config.minmax.element_tile = 8; + qu8_vadd_config.element_tile = 8; } else { - qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_u8; - qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_u8; - qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_u8; + qu8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_u8; + qu8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_u8; + qu8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_u8; qu8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; - qu8_vadd_config.minmax.element_tile = 8; + qu8_vadd_config.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__wasmsimd_u32; - qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_u32; - qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) 
xnn_qu8_vaddc_minmax_ukernel__wasmsimd_u32; + qu8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__wasmsimd_u32; + qu8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_u32; + qu8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_u32; qu8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; - qu8_vadd_config.minmax.element_tile = 32; + qu8_vadd_config.element_tile = 32; #else - qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__scalar_u4; - qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__scalar_u4; - qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__scalar_u4; + qu8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__scalar_u4; + qu8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__scalar_u4; + qu8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__scalar_u4; qu8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; - qu8_vadd_config.minmax.element_tile = 4; + qu8_vadd_config.element_tile = 4; #endif } @@ -1246,64 +1120,64 @@ static void init_qu8_vmul_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon){ - qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_u16; - qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; - qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; + qu8_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_u16; + 
qu8_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; + qu8_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; qu8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_mul_minmax_rndnu_neon_params; - qu8_vmul_config.minmax.element_tile = 16; + qu8_vmul_config.element_tile = 16; } else if (!XNN_PLATFORM_MOBILE) { - qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_u4; - qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_u4; - qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_u4; + qu8_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_u4; + qu8_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_u4; + qu8_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_u4; qu8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_mul_minmax_scalar_params; - qu8_vmul_config.minmax.element_tile = 4; + qu8_vmul_config.element_tile = 4; } #elif XNN_ARCH_ARM64 - qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_u16; - qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; - qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; + qu8_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_u16; + qu8_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; + qu8_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_u16; qu8_vmul_config.init = (xnn_init_binary_params_fn) 
xnn_init_qu8_mul_minmax_rndnu_neon_params; - qu8_vmul_config.minmax.element_tile = 16; + qu8_vmul_config.element_tile = 16; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_x86_avx) { - qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_u16; - qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_u16; - qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_u16; + qu8_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_u16; + qu8_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_u16; + qu8_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_u16; qu8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_mul_minmax_scalar_params; - qu8_vmul_config.minmax.element_tile = 16; + qu8_vmul_config.element_tile = 16; } else if (hardware_config->use_x86_sse4_1) { - qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_u16; - qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_u16; - qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_u16; + qu8_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_u16; + qu8_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_u16; + qu8_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_u16; qu8_vmul_config.init = (xnn_init_binary_params_fn) 
xnn_init_qu8_mul_minmax_scalar_params; - qu8_vmul_config.minmax.element_tile = 16; + qu8_vmul_config.element_tile = 16; } else { - qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_u8; - qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_u8; - qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_u8; + qu8_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_u8; + qu8_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_u8; + qu8_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_u8; qu8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_mul_minmax_scalar_params; - qu8_vmul_config.minmax.element_tile = 8; + qu8_vmul_config.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_u8; - qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_u8; - qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_u8; + qu8_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_u8; + qu8_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_u8; + qu8_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_u8; qu8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_mul_minmax_scalar_params; - qu8_vmul_config.minmax.element_tile = 8; + qu8_vmul_config.element_tile = 8; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR - 
qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__rvv_u2v; - qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__rvv_u2v; - qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__rvv_u2v; + qu8_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__rvv_u2v; + qu8_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__rvv_u2v; + qu8_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__rvv_u2v; qu8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_mul_minmax_scalar_params; - qu8_vmul_config.minmax.element_tile = 2; + qu8_vmul_config.element_tile = 2; #else - qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_u4; - qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_u4; - qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_u4; + qu8_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_u4; + qu8_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_u4; + qu8_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_u4; qu8_vmul_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_mul_minmax_scalar_params; - qu8_vmul_config.minmax.element_tile = 4; + qu8_vmul_config.element_tile = 4; #endif } diff --git a/src/f16-vbinary/f16-vadd-minmax.h b/src/f16-vbinary/f16-vadd.h similarity index 50% rename from src/f16-vbinary/f16-vadd-minmax.h rename to src/f16-vbinary/f16-vadd.h index c4e48d52b3b..a31278e9ce7 100644 --- a/src/f16-vbinary/f16-vadd-minmax.h +++ b/src/f16-vbinary/f16-vadd.h @@ -17,24 +17,24 @@ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vadd_minmax_ukernel__neonfp16arith_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vadd_minmax_ukernel__neonfp16arith_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vadd_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vadd_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vadd_minmax_ukernel__fp16arith_u1, 1, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vadd_minmax_ukernel__fp16arith_u2, 2, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vadd_minmax_ukernel__fp16arith_u4, 4, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vadd_ukernel__fp16arith_u1, 1, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vadd_ukernel__fp16arith_u2, 2, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vadd_ukernel__fp16arith_u4, 4, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // 
XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vadd_minmax_ukernel__avx512fp16_u32, 32, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vadd_minmax_ukernel__avx512fp16_u64, 64, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vadd_ukernel__avx512fp16_u32, 32, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vadd_ukernel__avx512fp16_u64, 64, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vadd_minmax_ukernel__f16c_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vadd_minmax_ukernel__f16c_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vadd_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vadd_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f16-vbinary/f16-vaddc-minmax.h b/src/f16-vbinary/f16-vaddc.h similarity index 50% rename from src/f16-vbinary/f16-vaddc-minmax.h rename to src/f16-vbinary/f16-vaddc.h index c4e2d1e5940..06481041b0c 100644 --- a/src/f16-vbinary/f16-vaddc-minmax.h +++ b/src/f16-vbinary/f16-vaddc.h @@ -17,24 
+17,24 @@ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vaddc_minmax_ukernel__neonfp16arith_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vaddc_minmax_ukernel__neonfp16arith_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vaddc_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vaddc_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vaddc_minmax_ukernel__fp16arith_u1, 1, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vaddc_minmax_ukernel__fp16arith_u2, 2, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vaddc_minmax_ukernel__fp16arith_u4, 4, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vaddc_ukernel__fp16arith_u1, 1, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vaddc_ukernel__fp16arith_u2, 2, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vaddc_ukernel__fp16arith_u4, 4, false, xnn_float16, 
struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vaddc_minmax_ukernel__avx512fp16_u32, 32, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vaddc_minmax_ukernel__avx512fp16_u64, 64, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vaddc_ukernel__avx512fp16_u32, 32, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vaddc_ukernel__avx512fp16_u64, 64, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vaddc_minmax_ukernel__f16c_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vaddc_minmax_ukernel__f16c_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vaddc_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vaddc_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f16-vbinary/f16-vdiv-minmax.h b/src/f16-vbinary/f16-vdiv-minmax.h deleted file mode 100644 index 1f1e165717e..00000000000 --- a/src/f16-vbinary/f16-vdiv-minmax.h +++ /dev/null @@ -1,49 +0,0 @@ -// 
Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#ifndef XNN_UKERNEL_WITH_PARAMS -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) -#define XNN_DEFINED_UKERNEL_WITH_PARAMS -#endif - -#ifndef XNN_UKERNEL -#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ - XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) -#define XNN_DEFINED_UKERNEL -#endif - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vdiv_minmax_ukernel__aarch64_neonfp16arith_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vdiv_minmax_ukernel__aarch64_neonfp16arith_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM64) - -#if XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vdiv_minmax_ukernel__fp16arith_u1, 1, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vdiv_minmax_ukernel__fp16arith_u2, 2, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vdiv_minmax_ukernel__fp16arith_u4, 4, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -#endif // XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - -#if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, 
xnn_f16_vdiv_minmax_ukernel__avx512fp16_u32, 32, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vdiv_minmax_ukernel__avx512fp16_u64, 64, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -#endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vdiv_minmax_ukernel__f16c_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vdiv_minmax_ukernel__f16c_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_UKERNEL_WITH_PARAMS -#endif - -#ifdef XNN_DEFINED_UKERNEL -#undef XNN_DEFINED_UKERNEL -#undef XNN_UKERNEL -#endif diff --git a/src/f16-vbinary/f16-vdiv.h b/src/f16-vbinary/f16-vdiv.h new file mode 100644 index 00000000000..1f0597bff88 --- /dev/null +++ b/src/f16-vbinary/f16-vdiv.h @@ -0,0 +1,49 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#ifndef XNN_UKERNEL_WITH_PARAMS +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) +#define XNN_DEFINED_UKERNEL_WITH_PARAMS +#endif + +#ifndef XNN_UKERNEL +#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ + XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) +#define XNN_DEFINED_UKERNEL +#endif + + +#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM64) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vdiv_ukernel__aarch64_neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vdiv_ukernel__aarch64_neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vdiv_ukernel__fp16arith_u1, 1, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vdiv_ukernel__fp16arith_u2, 2, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vdiv_ukernel__fp16arith_u4, 4, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +#endif // XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vdiv_ukernel__avx512fp16_u32, 32, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) 
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vdiv_ukernel__avx512fp16_u64, 64, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +#endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vdiv_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vdiv_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_UKERNEL_WITH_PARAMS +#endif + +#ifdef XNN_DEFINED_UKERNEL +#undef XNN_DEFINED_UKERNEL +#undef XNN_UKERNEL +#endif diff --git a/src/f16-vbinary/f16-vdivc-minmax.h b/src/f16-vbinary/f16-vdivc-minmax.h deleted file mode 100644 index a2961baedac..00000000000 --- a/src/f16-vbinary/f16-vdivc-minmax.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#ifndef XNN_UKERNEL_WITH_PARAMS -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) -#define XNN_DEFINED_UKERNEL_WITH_PARAMS -#endif - -#ifndef XNN_UKERNEL -#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ - XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) -#define XNN_DEFINED_UKERNEL -#endif - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vdivc_minmax_ukernel__aarch64_neonfp16arith_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vdivc_minmax_ukernel__aarch64_neonfp16arith_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM64) - -#if XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vdivc_minmax_ukernel__fp16arith_u1, 1, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vdivc_minmax_ukernel__fp16arith_u2, 2, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vdivc_minmax_ukernel__fp16arith_u4, 4, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -#endif // XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - -#if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vdivc_minmax_ukernel__avx512fp16_u32, 32, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) 
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vdivc_minmax_ukernel__avx512fp16_u64, 64, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -#endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vdivc_minmax_ukernel__f16c_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vdivc_minmax_ukernel__f16c_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_UKERNEL_WITH_PARAMS -#endif - -#ifdef XNN_DEFINED_UKERNEL -#undef XNN_DEFINED_UKERNEL -#undef XNN_UKERNEL -#endif diff --git a/src/f16-vbinary/f16-vdivc.h b/src/f16-vbinary/f16-vdivc.h new file mode 100644 index 00000000000..ffa5f4d27e4 --- /dev/null +++ b/src/f16-vbinary/f16-vdivc.h @@ -0,0 +1,49 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#ifndef XNN_UKERNEL_WITH_PARAMS +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) +#define XNN_DEFINED_UKERNEL_WITH_PARAMS +#endif + +#ifndef XNN_UKERNEL +#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ + XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) +#define XNN_DEFINED_UKERNEL +#endif + + +#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM64) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vdivc_ukernel__aarch64_neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vdivc_ukernel__aarch64_neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vdivc_ukernel__fp16arith_u1, 1, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vdivc_ukernel__fp16arith_u2, 2, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vdivc_ukernel__fp16arith_u4, 4, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +#endif // XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vdivc_ukernel__avx512fp16_u32, 32, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) 
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vdivc_ukernel__avx512fp16_u64, 64, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +#endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vdivc_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vdivc_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_UKERNEL_WITH_PARAMS +#endif + +#ifdef XNN_DEFINED_UKERNEL +#undef XNN_DEFINED_UKERNEL +#undef XNN_UKERNEL +#endif diff --git a/src/f16-vbinary/f16-vmul-minmax.h b/src/f16-vbinary/f16-vmul.h similarity index 50% rename from src/f16-vbinary/f16-vmul-minmax.h rename to src/f16-vbinary/f16-vmul.h index de18691a723..463380e6d70 100644 --- a/src/f16-vbinary/f16-vmul-minmax.h +++ b/src/f16-vbinary/f16-vmul.h @@ -17,24 +17,24 @@ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vmul_minmax_ukernel__neonfp16arith_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vmul_minmax_ukernel__neonfp16arith_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vmul_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vmul_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_default_params, 
((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vmul_minmax_ukernel__fp16arith_u1, 1, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vmul_minmax_ukernel__fp16arith_u2, 2, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vmul_minmax_ukernel__fp16arith_u4, 4, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vmul_ukernel__fp16arith_u1, 1, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vmul_ukernel__fp16arith_u2, 2, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vmul_ukernel__fp16arith_u4, 4, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vmul_minmax_ukernel__avx512fp16_u32, 32, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vmul_minmax_ukernel__avx512fp16_u64, 64, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vmul_ukernel__avx512fp16_u32, 32, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, 
xnn_f16_vmul_ukernel__avx512fp16_u64, 64, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vmul_minmax_ukernel__f16c_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vmul_minmax_ukernel__f16c_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vmul_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vmul_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f16-vbinary/f16-vmulc-minmax.h b/src/f16-vbinary/f16-vmulc.h similarity index 50% rename from src/f16-vbinary/f16-vmulc-minmax.h rename to src/f16-vbinary/f16-vmulc.h index c05e958ec94..83b14d8c525 100644 --- a/src/f16-vbinary/f16-vmulc-minmax.h +++ b/src/f16-vbinary/f16-vmulc.h @@ -17,24 +17,24 @@ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vmulc_minmax_ukernel__neonfp16arith_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vmulc_minmax_ukernel__neonfp16arith_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vmulc_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, 
xnn_f16_vmulc_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vmulc_minmax_ukernel__fp16arith_u1, 1, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vmulc_minmax_ukernel__fp16arith_u2, 2, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vmulc_minmax_ukernel__fp16arith_u4, 4, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vmulc_ukernel__fp16arith_u1, 1, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vmulc_ukernel__fp16arith_u2, 2, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vmulc_ukernel__fp16arith_u4, 4, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vmulc_minmax_ukernel__avx512fp16_u32, 32, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vmulc_minmax_ukernel__avx512fp16_u64, 64, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vmulc_ukernel__avx512fp16_u32, 32, false, xnn_float16, struct 
xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vmulc_ukernel__avx512fp16_u64, 64, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vmulc_minmax_ukernel__f16c_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vmulc_minmax_ukernel__f16c_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vmulc_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vmulc_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f16-vbinary/f16-vrdivc-minmax.h b/src/f16-vbinary/f16-vrdivc.h similarity index 59% rename from src/f16-vbinary/f16-vrdivc-minmax.h rename to src/f16-vbinary/f16-vrdivc.h index cbeeac95728..fdd28d2bc74 100644 --- a/src/f16-vbinary/f16-vrdivc-minmax.h +++ b/src/f16-vbinary/f16-vrdivc.h @@ -17,24 +17,24 @@ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrdivc_minmax_ukernel__aarch64_neonfp16arith_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrdivc_minmax_ukernel__aarch64_neonfp16arith_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrdivc_ukernel__aarch64_neonfp16arith_u8, 8, false, xnn_float16, struct 
xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrdivc_ukernel__aarch64_neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM64) #if XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vrdivc_minmax_ukernel__fp16arith_u1, 1, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vrdivc_minmax_ukernel__fp16arith_u2, 2, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vrdivc_minmax_ukernel__fp16arith_u4, 4, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vrdivc_ukernel__fp16arith_u1, 1, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vrdivc_ukernel__fp16arith_u2, 2, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vrdivc_ukernel__fp16arith_u4, 4, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vrdivc_minmax_ukernel__avx512fp16_u32, 32, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vrdivc_minmax_ukernel__avx512fp16_u64, 64, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) 
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vrdivc_ukernel__avx512fp16_u32, 32, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vrdivc_ukernel__avx512fp16_u64, 64, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrdivc_minmax_ukernel__f16c_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrdivc_minmax_ukernel__f16c_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrdivc_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrdivc_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f16-vbinary/f16-vrsubc-minmax.h b/src/f16-vbinary/f16-vrsubc.h similarity index 60% rename from src/f16-vbinary/f16-vrsubc-minmax.h rename to src/f16-vbinary/f16-vrsubc.h index 681d2a4d8dd..f13094d8117 100644 --- a/src/f16-vbinary/f16-vrsubc-minmax.h +++ b/src/f16-vbinary/f16-vrsubc.h @@ -17,24 +17,24 @@ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrsubc_minmax_ukernel__neonfp16arith_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrsubc_minmax_ukernel__neonfp16arith_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) 
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrsubc_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrsubc_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vrsubc_minmax_ukernel__fp16arith_u1, 1, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vrsubc_minmax_ukernel__fp16arith_u2, 2, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vrsubc_minmax_ukernel__fp16arith_u4, 4, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vrsubc_ukernel__fp16arith_u1, 1, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vrsubc_ukernel__fp16arith_u2, 2, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vrsubc_ukernel__fp16arith_u4, 4, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vrsubc_minmax_ukernel__avx512fp16_u32, 32, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, 
xnn_f16_vrsubc_minmax_ukernel__avx512fp16_u64, 64, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vrsubc_ukernel__avx512fp16_u32, 32, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vrsubc_ukernel__avx512fp16_u64, 64, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrsubc_minmax_ukernel__f16c_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrsubc_minmax_ukernel__f16c_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrsubc_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrsubc_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f16-vbinary/f16-vsub-minmax.h b/src/f16-vbinary/f16-vsub.h similarity index 50% rename from src/f16-vbinary/f16-vsub-minmax.h rename to src/f16-vbinary/f16-vsub.h index 72279e651a3..d2b1983204a 100644 --- a/src/f16-vbinary/f16-vsub-minmax.h +++ b/src/f16-vbinary/f16-vsub.h @@ -17,24 +17,24 @@ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsub_minmax_ukernel__neonfp16arith_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, 
xnn_f16_vsub_minmax_ukernel__neonfp16arith_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsub_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsub_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vsub_minmax_ukernel__fp16arith_u1, 1, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vsub_minmax_ukernel__fp16arith_u2, 2, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vsub_minmax_ukernel__fp16arith_u4, 4, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vsub_ukernel__fp16arith_u1, 1, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vsub_ukernel__fp16arith_u2, 2, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vsub_ukernel__fp16arith_u4, 4, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vsub_minmax_ukernel__avx512fp16_u32, 32, false, xnn_float16, 
union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vsub_minmax_ukernel__avx512fp16_u64, 64, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vsub_ukernel__avx512fp16_u32, 32, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vsub_ukernel__avx512fp16_u64, 64, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsub_minmax_ukernel__f16c_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsub_minmax_ukernel__f16c_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsub_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsub_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f16-vbinary/f16-vsubc-minmax.h b/src/f16-vbinary/f16-vsubc.h similarity index 50% rename from src/f16-vbinary/f16-vsubc-minmax.h rename to src/f16-vbinary/f16-vsubc.h index b3a082a944f..424fa6aceee 100644 --- a/src/f16-vbinary/f16-vsubc-minmax.h +++ b/src/f16-vbinary/f16-vsubc.h @@ -17,24 +17,24 @@ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsubc_minmax_ukernel__neonfp16arith_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, 
xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsubc_minmax_ukernel__neonfp16arith_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsubc_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsubc_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vsubc_minmax_ukernel__fp16arith_u1, 1, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vsubc_minmax_ukernel__fp16arith_u2, 2, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vsubc_minmax_ukernel__fp16arith_u4, 4, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vsubc_ukernel__fp16arith_u1, 1, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vsubc_ukernel__fp16arith_u2, 2, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vsubc_ukernel__fp16arith_u4, 4, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) 
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vsubc_minmax_ukernel__avx512fp16_u32, 32, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vsubc_minmax_ukernel__avx512fp16_u64, 64, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vsubc_ukernel__avx512fp16_u32, 32, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vsubc_ukernel__avx512fp16_u64, 64, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsubc_minmax_ukernel__f16c_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsubc_minmax_ukernel__f16c_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsubc_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsubc_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f16-vbinary/gen/f16-vadd-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vadd-avx512fp16-u32.c new file mode 100644 index 00000000000..d2bfac67cfa --- /dev/null +++ b/src/f16-vbinary/gen/f16-vadd-avx512fp16-u32.c @@ -0,0 +1,63 @@ +// Auto-generated file. Do not edit! 
+// Template: src/f16-vbinary/vop-avx512fp16.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/vbinary.h" + + +void xnn_f16_vadd_ukernel__avx512fp16_u32( + size_t batch, + const xnn_float16* restrict input_a, + const xnn_float16* restrict input_b, + xnn_float16* restrict output, + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint16_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + +#if defined(__AVX512FP16__) + const uint16_t* a = (const uint16_t*) input_a; + const uint16_t* b = (const uint16_t*) input_b; + uint16_t* o = (uint16_t*) output; + + + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + const __m512h va = _mm512_loadu_ph(a); + a += 32; + + __m512h vacc = _mm512_add_ph(va, _mm512_loadu_ph(b)); + b += 32; + + + _mm512_storeu_ph(o, vacc); + o += 32; + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint16_t)); + assert(batch <= 31 * sizeof(uint16_t)); + // Prepare mask for valid 16-bit elements (depends on batch). 
+ batch >>= XNN_LOG2_SIZEOF_HALF; + const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + const __m512h va = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, a)); + + __m512h vacc = _mm512_maskz_add_ph(vmask, va, _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, b))); + + + _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); + } +#endif // defined(__AVX512FP16__) +} diff --git a/src/f16-vbinary/gen/f16-vadd-minmax-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vadd-avx512fp16-u64.c similarity index 75% rename from src/f16-vbinary/gen/f16-vadd-minmax-avx512fp16-u32.c rename to src/f16-vbinary/gen/f16-vadd-avx512fp16-u64.c index 5caeed2f728..d3666bebc85 100644 --- a/src/f16-vbinary/gen/f16-vadd-minmax-avx512fp16-u32.c +++ b/src/f16-vbinary/gen/f16-vadd-avx512fp16-u64.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vadd_minmax_ukernel__avx512fp16_u32( +void xnn_f16_vadd_ukernel__avx512fp16_u64( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,10 +33,21 @@ void xnn_f16_vadd_minmax_ukernel__avx512fp16_u32( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); + for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { + const __m512h va0 = _mm512_loadu_ph(a); + const __m512h va1 = _mm512_loadu_ph(a + 32); + a += 64; + __m512h vacc0 = _mm512_add_ph(va0, _mm512_loadu_ph(b)); + __m512h vacc1 = _mm512_add_ph(va1, _mm512_loadu_ph(b + 32)); + b += 64; + + + _mm512_storeu_ph(o, 
vacc0); + _mm512_storeu_ph(o + 32, vacc1); + o += 64; + } for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { const __m512h va = _mm512_loadu_ph(a); a += 32; @@ -45,9 +56,6 @@ void xnn_f16_vadd_minmax_ukernel__avx512fp16_u32( b += 32; - vacc = _mm512_max_ph(voutput_min, vacc); - vacc = _mm512_min_ph(voutput_max, vacc); - _mm512_storeu_ph(o, vacc); o += 32; } @@ -63,8 +71,6 @@ void xnn_f16_vadd_minmax_ukernel__avx512fp16_u32( __m512h vacc = _mm512_maskz_add_ph(vmask, va, _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, b))); - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); } #endif // defined(__AVX512FP16__) diff --git a/src/f16-vbinary/gen/f16-vadd-minmax-f16c-u16.c b/src/f16-vbinary/gen/f16-vadd-f16c-u16.c similarity index 79% rename from src/f16-vbinary/gen/f16-vadd-minmax-f16c-u16.c rename to src/f16-vbinary/gen/f16-vadd-f16c-u16.c index 40f6ff4ee28..25f4c83d94b 100644 --- a/src/f16-vbinary/gen/f16-vadd-minmax-f16c-u16.c +++ b/src/f16-vbinary/gen/f16-vadd-f16c-u16.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vadd_minmax_ukernel__f16c_u16( +void xnn_f16_vadd_ukernel__f16c_u16( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,11 +33,6 @@ void xnn_f16_vadd_minmax_ukernel__f16c_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vy_min); - 
XNN_FORCE_REALIZATION(vy_max); - for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b)); @@ -50,12 +45,6 @@ void xnn_f16_vadd_minmax_ukernel__f16c_u16( __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va456789AB, vb456789AB), _MM_FROUND_TO_NEAREST_INT)); - vy01234567 = _mm256_max_ps(vy01234567, vy_min); - vy456789AB = _mm256_max_ps(vy456789AB, vy_min); - - vy01234567 = _mm256_min_ps(vy01234567, vy_max); - vy456789AB = _mm256_min_ps(vy456789AB, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy01234567, _MM_FROUND_TO_NEAREST_INT)); _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_TO_NEAREST_INT)); o += 16; @@ -68,9 +57,6 @@ void xnn_f16_vadd_minmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -80,9 +66,6 @@ void xnn_f16_vadd_minmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vadd-minmax-f16c-u8.c b/src/f16-vbinary/gen/f16-vadd-f16c-u8.c similarity index 78% rename from src/f16-vbinary/gen/f16-vadd-minmax-f16c-u8.c rename to src/f16-vbinary/gen/f16-vadd-f16c-u8.c index 04047df0621..ee747e29e4b 100644 --- a/src/f16-vbinary/gen/f16-vadd-minmax-f16c-u8.c +++ b/src/f16-vbinary/gen/f16-vadd-f16c-u8.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vadd_minmax_ukernel__f16c_u8( +void 
xnn_f16_vadd_ukernel__f16c_u8( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,11 +33,6 @@ void xnn_f16_vadd_minmax_ukernel__f16c_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vy_min); - XNN_FORCE_REALIZATION(vy_max); - for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b)); @@ -46,9 +41,6 @@ void xnn_f16_vadd_minmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -58,9 +50,6 @@ void xnn_f16_vadd_minmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-u1.c b/src/f16-vbinary/gen/f16-vadd-fp16arith-u1.c similarity index 74% rename from src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-u1.c rename to src/f16-vbinary/gen/f16-vadd-fp16arith-u1.c index 85062e3cf59..b4fc2ff516f 100644 --- 
a/src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-u1.c +++ b/src/f16-vbinary/gen/f16-vadd-fp16arith-u1.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. #include -#include #include @@ -17,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vadd_minmax_ukernel__fp16arith_u1( +void xnn_f16_vadd_ukernel__fp16arith_u1( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -34,16 +33,10 @@ void xnn_f16_vadd_minmax_ukernel__fp16arith_u1( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - do { const float16_t va = *a++; const float16_t vb = *b++; float16_t vacc = vaddh_f16(va, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-u2.c b/src/f16-vbinary/gen/f16-vadd-fp16arith-u2.c similarity index 71% rename from src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-u2.c rename to src/f16-vbinary/gen/f16-vadd-fp16arith-u2.c index 0e435da66a2..7157f00463a 100644 --- a/src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-u2.c +++ b/src/f16-vbinary/gen/f16-vadd-fp16arith-u2.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. 
#include -#include #include @@ -17,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vadd_minmax_ukernel__fp16arith_u2( +void xnn_f16_vadd_ukernel__fp16arith_u2( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -34,10 +33,6 @@ void xnn_f16_vadd_minmax_ukernel__fp16arith_u2( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - for (; batch >= 2 * sizeof(float16_t); batch -= 2 * sizeof(float16_t)) { const float16_t va0 = *a++; const float16_t va1 = *a++; @@ -49,12 +44,6 @@ void xnn_f16_vadd_minmax_ukernel__fp16arith_u2( float16_t vacc1 = vaddh_f16(va1, vb1); - vacc0 = vmaxnmh_f16(vacc0, vy_min); - vacc1 = vmaxnmh_f16(vacc1, vy_min); - - vacc0 = vminnmh_f16(vacc0, vy_max); - vacc1 = vminnmh_f16(vacc1, vy_max); - *o++ = vacc0; *o++ = vacc1; } @@ -62,8 +51,6 @@ void xnn_f16_vadd_minmax_ukernel__fp16arith_u2( const float16_t va = *a; const float16_t vb = *b; float16_t vacc = vaddh_f16(va, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o = vacc; } } diff --git a/src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-u4.c b/src/f16-vbinary/gen/f16-vadd-fp16arith-u4.c similarity index 70% rename from src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-u4.c rename to src/f16-vbinary/gen/f16-vadd-fp16arith-u4.c index c82652baffd..b4851e2d4d6 100644 --- a/src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-u4.c +++ b/src/f16-vbinary/gen/f16-vadd-fp16arith-u4.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. 
#include -#include #include @@ -17,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vadd_minmax_ukernel__fp16arith_u4( +void xnn_f16_vadd_ukernel__fp16arith_u4( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -34,10 +33,6 @@ void xnn_f16_vadd_minmax_ukernel__fp16arith_u4( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - for (; batch >= 4 * sizeof(float16_t); batch -= 4 * sizeof(float16_t)) { const float16_t va0 = *a++; const float16_t va1 = *a++; @@ -55,16 +50,6 @@ void xnn_f16_vadd_minmax_ukernel__fp16arith_u4( float16_t vacc3 = vaddh_f16(va3, vb3); - vacc0 = vmaxnmh_f16(vacc0, vy_min); - vacc1 = vmaxnmh_f16(vacc1, vy_min); - vacc2 = vmaxnmh_f16(vacc2, vy_min); - vacc3 = vmaxnmh_f16(vacc3, vy_min); - - vacc0 = vminnmh_f16(vacc0, vy_max); - vacc1 = vminnmh_f16(vacc1, vy_max); - vacc2 = vminnmh_f16(vacc2, vy_max); - vacc3 = vminnmh_f16(vacc3, vy_max); - *o++ = vacc0; *o++ = vacc1; *o++ = vacc2; @@ -75,8 +60,6 @@ void xnn_f16_vadd_minmax_ukernel__fp16arith_u4( const float16_t va = *a++; const float16_t vb = *b++; float16_t vacc = vaddh_f16(va, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/gen/f16-vadd-minmax-avx512fp16-u64.c b/src/f16-vbinary/gen/f16-vadd-minmax-avx512fp16-u64.c deleted file mode 100644 index 5a78f1058c2..00000000000 --- a/src/f16-vbinary/gen/f16-vadd-minmax-avx512fp16-u64.c +++ /dev/null @@ -1,91 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f16-vbinary/vop-avx512fp16.c.in -// Generator: tools/xngen -// -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f16_vadd_minmax_ukernel__avx512fp16_u64( - size_t batch, - const xnn_float16* restrict input_a, - const xnn_float16* restrict input_b, - xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(uint16_t) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - -#if defined(__AVX512FP16__) - const uint16_t* a = (const uint16_t*) input_a; - const uint16_t* b = (const uint16_t*) input_b; - uint16_t* o = (uint16_t*) output; - - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - - - for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { - const __m512h va0 = _mm512_loadu_ph(a); - const __m512h va1 = _mm512_loadu_ph(a + 32); - a += 64; - - __m512h vacc0 = _mm512_add_ph(va0, _mm512_loadu_ph(b)); - __m512h vacc1 = _mm512_add_ph(va1, _mm512_loadu_ph(b + 32)); - b += 64; - - - vacc0 = _mm512_max_ph(voutput_min, vacc0); - vacc1 = _mm512_max_ph(voutput_min, vacc1); - - vacc0 = _mm512_min_ph(voutput_max, vacc0); - vacc1 = _mm512_min_ph(voutput_max, vacc1); - - _mm512_storeu_ph(o, vacc0); - _mm512_storeu_ph(o + 32, vacc1); - o += 64; - } - for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { - const __m512h va = _mm512_loadu_ph(a); - a += 32; - - __m512h vacc = _mm512_add_ph(va, _mm512_loadu_ph(b)); - b += 32; - - - vacc = _mm512_max_ph(voutput_min, vacc); - vacc = _mm512_min_ph(voutput_max, vacc); - - 
_mm512_storeu_ph(o, vacc); - o += 32; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch >= 1 * sizeof(uint16_t)); - assert(batch <= 31 * sizeof(uint16_t)); - // Prepare mask for valid 16-bit elements (depends on batch). - batch >>= XNN_LOG2_SIZEOF_HALF; - const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - const __m512h va = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, a)); - - __m512h vacc = _mm512_maskz_add_ph(vmask, va, _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, b))); - - - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); - _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); - } -#endif // defined(__AVX512FP16__) -} diff --git a/src/f16-vbinary/gen/f16-vadd-minmax-neonfp16arith-u16.c b/src/f16-vbinary/gen/f16-vadd-neonfp16arith-u16.c similarity index 77% rename from src/f16-vbinary/gen/f16-vadd-minmax-neonfp16arith-u16.c rename to src/f16-vbinary/gen/f16-vadd-neonfp16arith-u16.c index 35a126151a8..51f14ee8819 100644 --- a/src/f16-vbinary/gen/f16-vadd-minmax-neonfp16arith-u16.c +++ b/src/f16-vbinary/gen/f16-vadd-neonfp16arith-u16.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vadd_minmax_ukernel__neonfp16arith_u16( +void xnn_f16_vadd_ukernel__neonfp16arith_u16( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -32,9 +32,6 @@ void xnn_f16_vadd_minmax_ukernel__neonfp16arith_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = 
vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); b += 8; @@ -45,12 +42,6 @@ void xnn_f16_vadd_minmax_ukernel__neonfp16arith_u16( float16x8_t vy456789AB = vaddq_f16(va456789AB, vb456789AB); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy456789AB = vmaxq_f16(vy456789AB, vy_min); - - vy01234567 = vminq_f16(vy01234567, vy_max); - vy456789AB = vminq_f16(vy456789AB, vy_max); - vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; vst1q_u16(o, vreinterpretq_u16_f16(vy456789AB)); o += 8; } @@ -59,8 +50,6 @@ void xnn_f16_vadd_minmax_ukernel__neonfp16arith_u16( const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); b += 8; float16x8_t vy01234567 = vaddq_f16(va01234567, vb01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { @@ -68,8 +57,6 @@ void xnn_f16_vadd_minmax_ukernel__neonfp16arith_u16( const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); float16x8_t vy01234567 = vaddq_f16(va01234567, vb01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/gen/f16-vadd-minmax-neonfp16arith-u8.c b/src/f16-vbinary/gen/f16-vadd-neonfp16arith-u8.c similarity index 77% rename from src/f16-vbinary/gen/f16-vadd-minmax-neonfp16arith-u8.c rename to src/f16-vbinary/gen/f16-vadd-neonfp16arith-u8.c index 24a6a733906..2b28fdb9d32 100644 --- a/src/f16-vbinary/gen/f16-vadd-minmax-neonfp16arith-u8.c +++ b/src/f16-vbinary/gen/f16-vadd-neonfp16arith-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void 
xnn_f16_vadd_minmax_ukernel__neonfp16arith_u8( +void xnn_f16_vadd_ukernel__neonfp16arith_u8( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -32,16 +32,11 @@ void xnn_f16_vadd_minmax_ukernel__neonfp16arith_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); b += 8; float16x8_t vy01234567 = vaddq_f16(va01234567, vb01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { @@ -49,8 +44,6 @@ void xnn_f16_vadd_minmax_ukernel__neonfp16arith_u8( const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); float16x8_t vy01234567 = vaddq_f16(va01234567, vb01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/gen/f16-vaddc-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vaddc-avx512fp16-u32.c new file mode 100644 index 00000000000..b4ce5cfb92d --- /dev/null +++ b/src/f16-vbinary/gen/f16-vaddc-avx512fp16-u32.c @@ -0,0 +1,64 @@ +// Auto-generated file. Do not edit! 
+// Template: src/f16-vbinary/vopc-avx512fp16.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vbinary.h" + + +void xnn_f16_vaddc_ukernel__avx512fp16_u32( + size_t batch, + const xnn_float16* restrict input_a, + const xnn_float16* restrict input_b, + xnn_float16* restrict output, + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint16_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + +#if defined(__AVX512FP16__) + const uint16_t* a = (const uint16_t*) input_a; + const uint16_t* b = (const uint16_t*) input_b; + uint16_t* o = (uint16_t*) output; + + const __m512h vb = _mm512_castsi512_ph(_mm512_set1_epi16(*b)); + + + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + __m512h va0 = _mm512_loadu_ph(a); + a += 32; + + __m512h vacc0 = _mm512_add_ph(va0, vb); + + + _mm512_storeu_ph(o, vacc0); + o += 32; + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint16_t)); + assert(batch <= 31 * sizeof(uint16_t)); + // Prepare mask for valid 16-bit elements (depends on batch). 
+ batch >>= XNN_LOG2_SIZEOF_HALF; + const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + __m512h va = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, a)); + + __m512h vacc = _mm512_maskz_add_ph(vmask, va, vb); + + _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); + } +#endif // defined(__AVX512FP16__) +} diff --git a/src/f16-vbinary/gen/f16-vaddc-minmax-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vaddc-avx512fp16-u64.c similarity index 75% rename from src/f16-vbinary/gen/f16-vaddc-minmax-avx512fp16-u32.c rename to src/f16-vbinary/gen/f16-vaddc-avx512fp16-u64.c index 8eb2ed74d02..d05659204bf 100644 --- a/src/f16-vbinary/gen/f16-vaddc-minmax-avx512fp16-u32.c +++ b/src/f16-vbinary/gen/f16-vaddc-avx512fp16-u64.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vaddc_minmax_ukernel__avx512fp16_u32( +void xnn_f16_vaddc_ukernel__avx512fp16_u64( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -34,23 +34,29 @@ void xnn_f16_vaddc_minmax_ukernel__avx512fp16_u32( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); const __m512h vb = _mm512_castsi512_ph(_mm512_set1_epi16(*b)); - for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { __m512h va0 = _mm512_loadu_ph(a); - a += 32; + __m512h va1 = _mm512_loadu_ph(a + 32); + a += 64; __m512h vacc0 = _mm512_add_ph(va0, vb); + __m512h vacc1 = 
_mm512_add_ph(va1, vb); - vacc0 = _mm512_max_ph(voutput_min, vacc0); + _mm512_storeu_ph(o, vacc0); + _mm512_storeu_ph(o + 32, vacc1); + o += 64; + } + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + __m512h va = _mm512_loadu_ph(a); + a += 32; - vacc0 = _mm512_min_ph(voutput_max, vacc0); + __m512h vacc = _mm512_add_ph(va, vb); - _mm512_storeu_ph(o, vacc0); + _mm512_storeu_ph(o, vacc); o += 32; } if XNN_UNLIKELY(batch != 0) { @@ -64,8 +70,6 @@ void xnn_f16_vaddc_minmax_ukernel__avx512fp16_u32( __m512h vacc = _mm512_maskz_add_ph(vmask, va, vb); - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); } #endif // defined(__AVX512FP16__) diff --git a/src/f16-vbinary/gen/f16-vaddc-minmax-f16c-u16.c b/src/f16-vbinary/gen/f16-vaddc-f16c-u16.c similarity index 77% rename from src/f16-vbinary/gen/f16-vaddc-minmax-f16c-u16.c rename to src/f16-vbinary/gen/f16-vaddc-f16c-u16.c index 990122ef84d..6418a8d132f 100644 --- a/src/f16-vbinary/gen/f16-vaddc-minmax-f16c-u16.c +++ b/src/f16-vbinary/gen/f16-vaddc-f16c-u16.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vaddc_minmax_ukernel__f16c_u16( +void xnn_f16_vaddc_ukernel__f16c_u16( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,11 +33,6 @@ void xnn_f16_vaddc_minmax_ukernel__f16c_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - 
XNN_FORCE_REALIZATION(vy_min); - XNN_FORCE_REALIZATION(vy_max); - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -48,12 +43,6 @@ void xnn_f16_vaddc_minmax_ukernel__f16c_u16( __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va456789AB, vb), _MM_FROUND_TO_NEAREST_INT)); - vy01234567 = _mm256_max_ps(vy01234567, vy_min); - vy456789AB = _mm256_max_ps(vy456789AB, vy_min); - - vy01234567 = _mm256_min_ps(vy01234567, vy_max); - vy456789AB = _mm256_min_ps(vy456789AB, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy01234567, _MM_FROUND_TO_NEAREST_INT)); _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_TO_NEAREST_INT)); o += 16; @@ -64,9 +53,6 @@ void xnn_f16_vaddc_minmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -75,9 +61,6 @@ void xnn_f16_vaddc_minmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vaddc-minmax-f16c-u8.c b/src/f16-vbinary/gen/f16-vaddc-f16c-u8.c similarity index 77% rename from src/f16-vbinary/gen/f16-vaddc-minmax-f16c-u8.c rename to src/f16-vbinary/gen/f16-vaddc-f16c-u8.c index 32a9fcb7d9e..0d25a0c59e4 100644 --- a/src/f16-vbinary/gen/f16-vaddc-minmax-f16c-u8.c +++ b/src/f16-vbinary/gen/f16-vaddc-f16c-u8.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vaddc_minmax_ukernel__f16c_u8( 
+void xnn_f16_vaddc_ukernel__f16c_u8( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,11 +33,6 @@ void xnn_f16_vaddc_minmax_ukernel__f16c_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vy_min); - XNN_FORCE_REALIZATION(vy_max); - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -45,9 +40,6 @@ void xnn_f16_vaddc_minmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -56,9 +48,6 @@ void xnn_f16_vaddc_minmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-u1.c b/src/f16-vbinary/gen/f16-vaddc-fp16arith-u1.c similarity index 74% rename from src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-u1.c rename to src/f16-vbinary/gen/f16-vaddc-fp16arith-u1.c index ca1571aca1c..357cf0c35d8 100644 --- 
a/src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-u1.c +++ b/src/f16-vbinary/gen/f16-vaddc-fp16arith-u1.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. #include -#include #include @@ -18,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vaddc_minmax_ukernel__fp16arith_u1( +void xnn_f16_vaddc_ukernel__fp16arith_u1( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -35,16 +34,10 @@ void xnn_f16_vaddc_minmax_ukernel__fp16arith_u1( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - const float16_t vb = *b; do { float16_t vacc = *a++; vacc = vaddh_f16(vacc, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-u2.c b/src/f16-vbinary/gen/f16-vaddc-fp16arith-u2.c similarity index 70% rename from src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-u2.c rename to src/f16-vbinary/gen/f16-vaddc-fp16arith-u2.c index e5750e0eb5b..eb4fff21b90 100644 --- a/src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-u2.c +++ b/src/f16-vbinary/gen/f16-vaddc-fp16arith-u2.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. 
#include -#include #include @@ -18,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vaddc_minmax_ukernel__fp16arith_u2( +void xnn_f16_vaddc_ukernel__fp16arith_u2( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -35,10 +34,6 @@ void xnn_f16_vaddc_minmax_ukernel__fp16arith_u2( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - const float16_t vb = *b; for (; batch >= 2 * sizeof(float16_t); batch -= 2 * sizeof(float16_t)) { float16_t vacc0 = a[0]; @@ -49,12 +44,6 @@ void xnn_f16_vaddc_minmax_ukernel__fp16arith_u2( vacc1 = vaddh_f16(vacc1, vb); - vacc0 = vmaxnmh_f16(vacc0, vy_min); - vacc1 = vmaxnmh_f16(vacc1, vy_min); - - vacc0 = vminnmh_f16(vacc0, vy_max); - vacc1 = vminnmh_f16(vacc1, vy_max); - o[0] = vacc0; o[1] = vacc1; o += 2; @@ -62,8 +51,6 @@ void xnn_f16_vaddc_minmax_ukernel__fp16arith_u2( if XNN_UNLIKELY(batch != 0) { float16_t vacc = *a; vacc = vaddh_f16(vacc, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o = vacc; } } diff --git a/src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-u4.c b/src/f16-vbinary/gen/f16-vaddc-fp16arith-u4.c similarity index 68% rename from src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-u4.c rename to src/f16-vbinary/gen/f16-vaddc-fp16arith-u4.c index 8188d33ff64..f33c98d957c 100644 --- a/src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-u4.c +++ b/src/f16-vbinary/gen/f16-vaddc-fp16arith-u4.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. 
#include -#include #include @@ -18,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vaddc_minmax_ukernel__fp16arith_u4( +void xnn_f16_vaddc_ukernel__fp16arith_u4( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -35,10 +34,6 @@ void xnn_f16_vaddc_minmax_ukernel__fp16arith_u4( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - const float16_t vb = *b; for (; batch >= 4 * sizeof(float16_t); batch -= 4 * sizeof(float16_t)) { float16_t vacc0 = a[0]; @@ -53,16 +48,6 @@ void xnn_f16_vaddc_minmax_ukernel__fp16arith_u4( vacc3 = vaddh_f16(vacc3, vb); - vacc0 = vmaxnmh_f16(vacc0, vy_min); - vacc1 = vmaxnmh_f16(vacc1, vy_min); - vacc2 = vmaxnmh_f16(vacc2, vy_min); - vacc3 = vmaxnmh_f16(vacc3, vy_min); - - vacc0 = vminnmh_f16(vacc0, vy_max); - vacc1 = vminnmh_f16(vacc1, vy_max); - vacc2 = vminnmh_f16(vacc2, vy_max); - vacc3 = vminnmh_f16(vacc3, vy_max); - o[0] = vacc0; o[1] = vacc1; o[2] = vacc2; @@ -73,8 +58,6 @@ void xnn_f16_vaddc_minmax_ukernel__fp16arith_u4( do { float16_t vacc = *a++; vacc = vaddh_f16(vacc, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/gen/f16-vaddc-minmax-avx512fp16-u64.c b/src/f16-vbinary/gen/f16-vaddc-minmax-avx512fp16-u64.c deleted file mode 100644 index 43d2b70665e..00000000000 --- a/src/f16-vbinary/gen/f16-vaddc-minmax-avx512fp16-u64.c +++ /dev/null @@ -1,89 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f16-vbinary/vopc-avx512fp16.c.in -// Generator: tools/xngen -// -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vbinary.h" - - -void xnn_f16_vaddc_minmax_ukernel__avx512fp16_u64( - size_t batch, - const xnn_float16* restrict input_a, - const xnn_float16* restrict input_b, - xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(uint16_t) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - -#if defined(__AVX512FP16__) - const uint16_t* a = (const uint16_t*) input_a; - const uint16_t* b = (const uint16_t*) input_b; - uint16_t* o = (uint16_t*) output; - - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - const __m512h vb = _mm512_castsi512_ph(_mm512_set1_epi16(*b)); - - - for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { - __m512h va0 = _mm512_loadu_ph(a); - __m512h va1 = _mm512_loadu_ph(a + 32); - a += 64; - - __m512h vacc0 = _mm512_add_ph(va0, vb); - __m512h vacc1 = _mm512_add_ph(va1, vb); - - - vacc0 = _mm512_max_ph(voutput_min, vacc0); - vacc1 = _mm512_max_ph(voutput_min, vacc1); - - vacc0 = _mm512_min_ph(voutput_max, vacc0); - vacc1 = _mm512_min_ph(voutput_max, vacc1); - - _mm512_storeu_ph(o, vacc0); - _mm512_storeu_ph(o + 32, vacc1); - o += 64; - } - for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { - __m512h va = _mm512_loadu_ph(a); - a += 32; - - __m512h vacc = _mm512_add_ph(va, vb); - - vacc = _mm512_max_ph(voutput_min, vacc); - vacc = _mm512_min_ph(voutput_max, vacc); - - 
_mm512_storeu_ph(o, vacc); - o += 32; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch >= 1 * sizeof(uint16_t)); - assert(batch <= 31 * sizeof(uint16_t)); - // Prepare mask for valid 16-bit elements (depends on batch). - batch >>= XNN_LOG2_SIZEOF_HALF; - const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - __m512h va = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, a)); - - __m512h vacc = _mm512_maskz_add_ph(vmask, va, vb); - - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); - _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); - } -#endif // defined(__AVX512FP16__) -} diff --git a/src/f16-vbinary/gen/f16-vaddc-minmax-neonfp16arith-u16.c b/src/f16-vbinary/gen/f16-vaddc-neonfp16arith-u16.c similarity index 75% rename from src/f16-vbinary/gen/f16-vaddc-minmax-neonfp16arith-u16.c rename to src/f16-vbinary/gen/f16-vaddc-neonfp16arith-u16.c index a22ad12a45d..07e27e3e82e 100644 --- a/src/f16-vbinary/gen/f16-vaddc-minmax-neonfp16arith-u16.c +++ b/src/f16-vbinary/gen/f16-vaddc-neonfp16arith-u16.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vaddc_minmax_ukernel__neonfp16arith_u16( +void xnn_f16_vaddc_ukernel__neonfp16arith_u16( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -32,9 +32,6 @@ void xnn_f16_vaddc_minmax_ukernel__neonfp16arith_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - 
const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; @@ -44,12 +41,6 @@ void xnn_f16_vaddc_minmax_ukernel__neonfp16arith_u16( float16x8_t vy456789AB = vaddq_f16(va456789AB, vb); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy456789AB = vmaxq_f16(vy456789AB, vy_min); - - vy01234567 = vminq_f16(vy01234567, vy_max); - vy456789AB = vminq_f16(vy456789AB, vy_max); - vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; vst1q_u16(o, vreinterpretq_u16_f16(vy456789AB)); o += 8; } @@ -57,16 +48,12 @@ void xnn_f16_vaddc_minmax_ukernel__neonfp16arith_u16( const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; float16x8_t vy01234567 = vaddq_f16(va01234567, vb); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); float16x8_t vy01234567 = vaddq_f16(va01234567, vb); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/gen/f16-vaddc-minmax-neonfp16arith-u8.c b/src/f16-vbinary/gen/f16-vaddc-neonfp16arith-u8.c similarity index 76% rename from src/f16-vbinary/gen/f16-vaddc-minmax-neonfp16arith-u8.c rename to src/f16-vbinary/gen/f16-vaddc-neonfp16arith-u8.c index 01b28dd8f3f..958c7e1619a 100644 --- a/src/f16-vbinary/gen/f16-vaddc-minmax-neonfp16arith-u8.c +++ b/src/f16-vbinary/gen/f16-vaddc-neonfp16arith-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vaddc_minmax_ukernel__neonfp16arith_u8( +void xnn_f16_vaddc_ukernel__neonfp16arith_u8( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* 
restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -32,24 +32,17 @@ void xnn_f16_vaddc_minmax_ukernel__neonfp16arith_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; float16x8_t vy01234567 = vaddq_f16(va01234567, vb); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); float16x8_t vy01234567 = vaddq_f16(va01234567, vb); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/gen/f16-vdiv-minmax-aarch64-neonfp16arith-u16.c b/src/f16-vbinary/gen/f16-vdiv-aarch64-neonfp16arith-u16.c similarity index 77% rename from src/f16-vbinary/gen/f16-vdiv-minmax-aarch64-neonfp16arith-u16.c rename to src/f16-vbinary/gen/f16-vdiv-aarch64-neonfp16arith-u16.c index 0451c10fe1f..43f45c6c516 100644 --- a/src/f16-vbinary/gen/f16-vdiv-minmax-aarch64-neonfp16arith-u16.c +++ b/src/f16-vbinary/gen/f16-vdiv-aarch64-neonfp16arith-u16.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vdiv_minmax_ukernel__aarch64_neonfp16arith_u16( +void xnn_f16_vdiv_ukernel__aarch64_neonfp16arith_u16( 
size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -32,9 +32,6 @@ void xnn_f16_vdiv_minmax_ukernel__aarch64_neonfp16arith_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); b += 8; @@ -45,12 +42,6 @@ void xnn_f16_vdiv_minmax_ukernel__aarch64_neonfp16arith_u16( float16x8_t vy456789AB = vdivq_f16(va456789AB, vb456789AB); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy456789AB = vmaxq_f16(vy456789AB, vy_min); - - vy01234567 = vminq_f16(vy01234567, vy_max); - vy456789AB = vminq_f16(vy456789AB, vy_max); - vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; vst1q_u16(o, vreinterpretq_u16_f16(vy456789AB)); o += 8; } @@ -59,8 +50,6 @@ void xnn_f16_vdiv_minmax_ukernel__aarch64_neonfp16arith_u16( const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); b += 8; float16x8_t vy01234567 = vdivq_f16(va01234567, vb01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { @@ -68,8 +57,6 @@ void xnn_f16_vdiv_minmax_ukernel__aarch64_neonfp16arith_u16( const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); float16x8_t vy01234567 = vdivq_f16(va01234567, vb01234567); - 
vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/gen/f16-vdiv-minmax-aarch64-neonfp16arith-u8.c b/src/f16-vbinary/gen/f16-vdiv-aarch64-neonfp16arith-u8.c similarity index 77% rename from src/f16-vbinary/gen/f16-vdiv-minmax-aarch64-neonfp16arith-u8.c rename to src/f16-vbinary/gen/f16-vdiv-aarch64-neonfp16arith-u8.c index 9f61e4fc2e9..40e8e0566f2 100644 --- a/src/f16-vbinary/gen/f16-vdiv-minmax-aarch64-neonfp16arith-u8.c +++ b/src/f16-vbinary/gen/f16-vdiv-aarch64-neonfp16arith-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vdiv_minmax_ukernel__aarch64_neonfp16arith_u8( +void xnn_f16_vdiv_ukernel__aarch64_neonfp16arith_u8( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -32,16 +32,11 @@ void xnn_f16_vdiv_minmax_ukernel__aarch64_neonfp16arith_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); b += 8; float16x8_t vy01234567 = vdivq_f16(va01234567, vb01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { @@ 
-49,8 +44,6 @@ void xnn_f16_vdiv_minmax_ukernel__aarch64_neonfp16arith_u8( const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); float16x8_t vy01234567 = vdivq_f16(va01234567, vb01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/gen/f16-vdiv-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vdiv-avx512fp16-u32.c new file mode 100644 index 00000000000..c9a2041d960 --- /dev/null +++ b/src/f16-vbinary/gen/f16-vdiv-avx512fp16-u32.c @@ -0,0 +1,63 @@ +// Auto-generated file. Do not edit! +// Template: src/f16-vbinary/vop-avx512fp16.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/vbinary.h" + + +void xnn_f16_vdiv_ukernel__avx512fp16_u32( + size_t batch, + const xnn_float16* restrict input_a, + const xnn_float16* restrict input_b, + xnn_float16* restrict output, + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint16_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + +#if defined(__AVX512FP16__) + const uint16_t* a = (const uint16_t*) input_a; + const uint16_t* b = (const uint16_t*) input_b; + uint16_t* o = (uint16_t*) output; + + + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + const __m512h va = _mm512_loadu_ph(a); + a += 32; + + __m512h vacc = _mm512_div_ph(va, _mm512_loadu_ph(b)); + b += 32; + + + _mm512_storeu_ph(o, vacc); + o += 32; + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint16_t)); + assert(batch <= 31 * sizeof(uint16_t)); + // Prepare mask for valid 16-bit elements (depends on batch). 
+ batch >>= XNN_LOG2_SIZEOF_HALF; + const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + const __m512h va = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, a)); + + __m512h vacc = _mm512_maskz_div_ph(vmask, va, _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, b))); + + + _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); + } +#endif // defined(__AVX512FP16__) +} diff --git a/src/f16-vbinary/gen/f16-vdiv-minmax-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vdiv-avx512fp16-u64.c similarity index 75% rename from src/f16-vbinary/gen/f16-vdiv-minmax-avx512fp16-u32.c rename to src/f16-vbinary/gen/f16-vdiv-avx512fp16-u64.c index 8cb1a2fc084..6d42dd84cc1 100644 --- a/src/f16-vbinary/gen/f16-vdiv-minmax-avx512fp16-u32.c +++ b/src/f16-vbinary/gen/f16-vdiv-avx512fp16-u64.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vdiv_minmax_ukernel__avx512fp16_u32( +void xnn_f16_vdiv_ukernel__avx512fp16_u64( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,10 +33,21 @@ void xnn_f16_vdiv_minmax_ukernel__avx512fp16_u32( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); + for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { + const __m512h va0 = _mm512_loadu_ph(a); + const __m512h va1 = _mm512_loadu_ph(a + 32); + a += 64; + __m512h vacc0 = _mm512_div_ph(va0, _mm512_loadu_ph(b)); + __m512h vacc1 = _mm512_div_ph(va1, _mm512_loadu_ph(b + 32)); + b += 64; + + + _mm512_storeu_ph(o, 
vacc0); + _mm512_storeu_ph(o + 32, vacc1); + o += 64; + } for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { const __m512h va = _mm512_loadu_ph(a); a += 32; @@ -45,9 +56,6 @@ void xnn_f16_vdiv_minmax_ukernel__avx512fp16_u32( b += 32; - vacc = _mm512_max_ph(voutput_min, vacc); - vacc = _mm512_min_ph(voutput_max, vacc); - _mm512_storeu_ph(o, vacc); o += 32; } @@ -63,8 +71,6 @@ void xnn_f16_vdiv_minmax_ukernel__avx512fp16_u32( __m512h vacc = _mm512_maskz_div_ph(vmask, va, _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, b))); - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); } #endif // defined(__AVX512FP16__) diff --git a/src/f16-vbinary/gen/f16-vdiv-minmax-f16c-u16.c b/src/f16-vbinary/gen/f16-vdiv-f16c-u16.c similarity index 79% rename from src/f16-vbinary/gen/f16-vdiv-minmax-f16c-u16.c rename to src/f16-vbinary/gen/f16-vdiv-f16c-u16.c index ee35f9e710d..361512f8d90 100644 --- a/src/f16-vbinary/gen/f16-vdiv-minmax-f16c-u16.c +++ b/src/f16-vbinary/gen/f16-vdiv-f16c-u16.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vdiv_minmax_ukernel__f16c_u16( +void xnn_f16_vdiv_ukernel__f16c_u16( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,11 +33,6 @@ void xnn_f16_vdiv_minmax_ukernel__f16c_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vy_min); - 
XNN_FORCE_REALIZATION(vy_max); - for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b)); @@ -50,12 +45,6 @@ void xnn_f16_vdiv_minmax_ukernel__f16c_u16( __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va456789AB, vb456789AB), _MM_FROUND_TO_NEAREST_INT)); - vy01234567 = _mm256_max_ps(vy01234567, vy_min); - vy456789AB = _mm256_max_ps(vy456789AB, vy_min); - - vy01234567 = _mm256_min_ps(vy01234567, vy_max); - vy456789AB = _mm256_min_ps(vy456789AB, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy01234567, _MM_FROUND_TO_NEAREST_INT)); _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_TO_NEAREST_INT)); o += 16; @@ -68,9 +57,6 @@ void xnn_f16_vdiv_minmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -80,9 +66,6 @@ void xnn_f16_vdiv_minmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vdiv-minmax-f16c-u8.c b/src/f16-vbinary/gen/f16-vdiv-f16c-u8.c similarity index 78% rename from src/f16-vbinary/gen/f16-vdiv-minmax-f16c-u8.c rename to src/f16-vbinary/gen/f16-vdiv-f16c-u8.c index 51ea651ee57..c54a7ec70e1 100644 --- a/src/f16-vbinary/gen/f16-vdiv-minmax-f16c-u8.c +++ b/src/f16-vbinary/gen/f16-vdiv-f16c-u8.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vdiv_minmax_ukernel__f16c_u8( +void 
xnn_f16_vdiv_ukernel__f16c_u8( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,11 +33,6 @@ void xnn_f16_vdiv_minmax_ukernel__f16c_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vy_min); - XNN_FORCE_REALIZATION(vy_max); - for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b)); @@ -46,9 +41,6 @@ void xnn_f16_vdiv_minmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -58,9 +50,6 @@ void xnn_f16_vdiv_minmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-u1.c b/src/f16-vbinary/gen/f16-vdiv-fp16arith-u1.c similarity index 74% rename from src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-u1.c rename to src/f16-vbinary/gen/f16-vdiv-fp16arith-u1.c index e5480b2b233..95ea1023140 100644 --- 
a/src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-u1.c +++ b/src/f16-vbinary/gen/f16-vdiv-fp16arith-u1.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. #include -#include #include @@ -17,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vdiv_minmax_ukernel__fp16arith_u1( +void xnn_f16_vdiv_ukernel__fp16arith_u1( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -34,16 +33,10 @@ void xnn_f16_vdiv_minmax_ukernel__fp16arith_u1( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - do { const float16_t va = *a++; const float16_t vb = *b++; float16_t vacc = vdivh_f16(va, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-u2.c b/src/f16-vbinary/gen/f16-vdiv-fp16arith-u2.c similarity index 71% rename from src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-u2.c rename to src/f16-vbinary/gen/f16-vdiv-fp16arith-u2.c index 095db6386d8..bcb9c645118 100644 --- a/src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-u2.c +++ b/src/f16-vbinary/gen/f16-vdiv-fp16arith-u2.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. 
#include -#include #include @@ -17,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vdiv_minmax_ukernel__fp16arith_u2( +void xnn_f16_vdiv_ukernel__fp16arith_u2( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -34,10 +33,6 @@ void xnn_f16_vdiv_minmax_ukernel__fp16arith_u2( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - for (; batch >= 2 * sizeof(float16_t); batch -= 2 * sizeof(float16_t)) { const float16_t va0 = *a++; const float16_t va1 = *a++; @@ -49,12 +44,6 @@ void xnn_f16_vdiv_minmax_ukernel__fp16arith_u2( float16_t vacc1 = vdivh_f16(va1, vb1); - vacc0 = vmaxnmh_f16(vacc0, vy_min); - vacc1 = vmaxnmh_f16(vacc1, vy_min); - - vacc0 = vminnmh_f16(vacc0, vy_max); - vacc1 = vminnmh_f16(vacc1, vy_max); - *o++ = vacc0; *o++ = vacc1; } @@ -62,8 +51,6 @@ void xnn_f16_vdiv_minmax_ukernel__fp16arith_u2( const float16_t va = *a; const float16_t vb = *b; float16_t vacc = vdivh_f16(va, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o = vacc; } } diff --git a/src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-u4.c b/src/f16-vbinary/gen/f16-vdiv-fp16arith-u4.c similarity index 70% rename from src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-u4.c rename to src/f16-vbinary/gen/f16-vdiv-fp16arith-u4.c index 70eaf106205..c38aa5bbe29 100644 --- a/src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-u4.c +++ b/src/f16-vbinary/gen/f16-vdiv-fp16arith-u4.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. 
#include -#include #include @@ -17,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vdiv_minmax_ukernel__fp16arith_u4( +void xnn_f16_vdiv_ukernel__fp16arith_u4( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -34,10 +33,6 @@ void xnn_f16_vdiv_minmax_ukernel__fp16arith_u4( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - for (; batch >= 4 * sizeof(float16_t); batch -= 4 * sizeof(float16_t)) { const float16_t va0 = *a++; const float16_t va1 = *a++; @@ -55,16 +50,6 @@ void xnn_f16_vdiv_minmax_ukernel__fp16arith_u4( float16_t vacc3 = vdivh_f16(va3, vb3); - vacc0 = vmaxnmh_f16(vacc0, vy_min); - vacc1 = vmaxnmh_f16(vacc1, vy_min); - vacc2 = vmaxnmh_f16(vacc2, vy_min); - vacc3 = vmaxnmh_f16(vacc3, vy_min); - - vacc0 = vminnmh_f16(vacc0, vy_max); - vacc1 = vminnmh_f16(vacc1, vy_max); - vacc2 = vminnmh_f16(vacc2, vy_max); - vacc3 = vminnmh_f16(vacc3, vy_max); - *o++ = vacc0; *o++ = vacc1; *o++ = vacc2; @@ -75,8 +60,6 @@ void xnn_f16_vdiv_minmax_ukernel__fp16arith_u4( const float16_t va = *a++; const float16_t vb = *b++; float16_t vacc = vdivh_f16(va, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/gen/f16-vdiv-minmax-avx512fp16-u64.c b/src/f16-vbinary/gen/f16-vdiv-minmax-avx512fp16-u64.c deleted file mode 100644 index 7948dd6659c..00000000000 --- a/src/f16-vbinary/gen/f16-vdiv-minmax-avx512fp16-u64.c +++ /dev/null @@ -1,91 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f16-vbinary/vop-avx512fp16.c.in -// Generator: tools/xngen -// -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f16_vdiv_minmax_ukernel__avx512fp16_u64( - size_t batch, - const xnn_float16* restrict input_a, - const xnn_float16* restrict input_b, - xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(uint16_t) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - -#if defined(__AVX512FP16__) - const uint16_t* a = (const uint16_t*) input_a; - const uint16_t* b = (const uint16_t*) input_b; - uint16_t* o = (uint16_t*) output; - - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - - - for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { - const __m512h va0 = _mm512_loadu_ph(a); - const __m512h va1 = _mm512_loadu_ph(a + 32); - a += 64; - - __m512h vacc0 = _mm512_div_ph(va0, _mm512_loadu_ph(b)); - __m512h vacc1 = _mm512_div_ph(va1, _mm512_loadu_ph(b + 32)); - b += 64; - - - vacc0 = _mm512_max_ph(voutput_min, vacc0); - vacc1 = _mm512_max_ph(voutput_min, vacc1); - - vacc0 = _mm512_min_ph(voutput_max, vacc0); - vacc1 = _mm512_min_ph(voutput_max, vacc1); - - _mm512_storeu_ph(o, vacc0); - _mm512_storeu_ph(o + 32, vacc1); - o += 64; - } - for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { - const __m512h va = _mm512_loadu_ph(a); - a += 32; - - __m512h vacc = _mm512_div_ph(va, _mm512_loadu_ph(b)); - b += 32; - - - vacc = _mm512_max_ph(voutput_min, vacc); - vacc = _mm512_min_ph(voutput_max, vacc); - - 
_mm512_storeu_ph(o, vacc); - o += 32; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch >= 1 * sizeof(uint16_t)); - assert(batch <= 31 * sizeof(uint16_t)); - // Prepare mask for valid 16-bit elements (depends on batch). - batch >>= XNN_LOG2_SIZEOF_HALF; - const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - const __m512h va = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, a)); - - __m512h vacc = _mm512_maskz_div_ph(vmask, va, _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, b))); - - - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); - _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); - } -#endif // defined(__AVX512FP16__) -} diff --git a/src/f16-vbinary/gen/f16-vdivc-minmax-aarch64-neonfp16arith-u16.c b/src/f16-vbinary/gen/f16-vdivc-aarch64-neonfp16arith-u16.c similarity index 75% rename from src/f16-vbinary/gen/f16-vdivc-minmax-aarch64-neonfp16arith-u16.c rename to src/f16-vbinary/gen/f16-vdivc-aarch64-neonfp16arith-u16.c index acb2c2eef5a..9b752744087 100644 --- a/src/f16-vbinary/gen/f16-vdivc-minmax-aarch64-neonfp16arith-u16.c +++ b/src/f16-vbinary/gen/f16-vdivc-aarch64-neonfp16arith-u16.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vdivc_minmax_ukernel__aarch64_neonfp16arith_u16( +void xnn_f16_vdivc_ukernel__aarch64_neonfp16arith_u16( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -32,9 +32,6 @@ void xnn_f16_vdivc_minmax_ukernel__aarch64_neonfp16arith_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const 
uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; @@ -44,12 +41,6 @@ void xnn_f16_vdivc_minmax_ukernel__aarch64_neonfp16arith_u16( float16x8_t vy456789AB = vdivq_f16(va456789AB, vb); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy456789AB = vmaxq_f16(vy456789AB, vy_min); - - vy01234567 = vminq_f16(vy01234567, vy_max); - vy456789AB = vminq_f16(vy456789AB, vy_max); - vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; vst1q_u16(o, vreinterpretq_u16_f16(vy456789AB)); o += 8; } @@ -57,16 +48,12 @@ void xnn_f16_vdivc_minmax_ukernel__aarch64_neonfp16arith_u16( const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; float16x8_t vy01234567 = vdivq_f16(va01234567, vb); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); float16x8_t vy01234567 = vdivq_f16(va01234567, vb); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/gen/f16-vdivc-minmax-aarch64-neonfp16arith-u8.c b/src/f16-vbinary/gen/f16-vdivc-aarch64-neonfp16arith-u8.c similarity index 76% rename from src/f16-vbinary/gen/f16-vdivc-minmax-aarch64-neonfp16arith-u8.c rename to src/f16-vbinary/gen/f16-vdivc-aarch64-neonfp16arith-u8.c index ac7847b0fd6..f2c3abab07d 100644 --- a/src/f16-vbinary/gen/f16-vdivc-minmax-aarch64-neonfp16arith-u8.c +++ b/src/f16-vbinary/gen/f16-vdivc-aarch64-neonfp16arith-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void 
xnn_f16_vdivc_minmax_ukernel__aarch64_neonfp16arith_u8( +void xnn_f16_vdivc_ukernel__aarch64_neonfp16arith_u8( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -32,24 +32,17 @@ void xnn_f16_vdivc_minmax_ukernel__aarch64_neonfp16arith_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; float16x8_t vy01234567 = vdivq_f16(va01234567, vb); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); float16x8_t vy01234567 = vdivq_f16(va01234567, vb); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/gen/f16-vdivc-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vdivc-avx512fp16-u32.c new file mode 100644 index 00000000000..b2adf6c972c --- /dev/null +++ b/src/f16-vbinary/gen/f16-vdivc-avx512fp16-u32.c @@ -0,0 +1,64 @@ +// Auto-generated file. Do not edit! 
+// Template: src/f16-vbinary/vopc-avx512fp16.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vbinary.h" + + +void xnn_f16_vdivc_ukernel__avx512fp16_u32( + size_t batch, + const xnn_float16* restrict input_a, + const xnn_float16* restrict input_b, + xnn_float16* restrict output, + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint16_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + +#if defined(__AVX512FP16__) + const uint16_t* a = (const uint16_t*) input_a; + const uint16_t* b = (const uint16_t*) input_b; + uint16_t* o = (uint16_t*) output; + + const __m512h vb = _mm512_castsi512_ph(_mm512_set1_epi16(*b)); + + + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + __m512h va0 = _mm512_loadu_ph(a); + a += 32; + + __m512h vacc0 = _mm512_div_ph(va0, vb); + + + _mm512_storeu_ph(o, vacc0); + o += 32; + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint16_t)); + assert(batch <= 31 * sizeof(uint16_t)); + // Prepare mask for valid 16-bit elements (depends on batch). 
+ batch >>= XNN_LOG2_SIZEOF_HALF; + const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + __m512h va = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, a)); + + __m512h vacc = _mm512_maskz_div_ph(vmask, va, vb); + + _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); + } +#endif // defined(__AVX512FP16__) +} diff --git a/src/f16-vbinary/gen/f16-vdivc-minmax-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vdivc-avx512fp16-u64.c similarity index 75% rename from src/f16-vbinary/gen/f16-vdivc-minmax-avx512fp16-u32.c rename to src/f16-vbinary/gen/f16-vdivc-avx512fp16-u64.c index 1163b1d94f5..4026c96f08f 100644 --- a/src/f16-vbinary/gen/f16-vdivc-minmax-avx512fp16-u32.c +++ b/src/f16-vbinary/gen/f16-vdivc-avx512fp16-u64.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vdivc_minmax_ukernel__avx512fp16_u32( +void xnn_f16_vdivc_ukernel__avx512fp16_u64( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -34,23 +34,29 @@ void xnn_f16_vdivc_minmax_ukernel__avx512fp16_u32( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); const __m512h vb = _mm512_castsi512_ph(_mm512_set1_epi16(*b)); - for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { __m512h va0 = _mm512_loadu_ph(a); - a += 32; + __m512h va1 = _mm512_loadu_ph(a + 32); + a += 64; __m512h vacc0 = _mm512_div_ph(va0, vb); + __m512h vacc1 = 
_mm512_div_ph(va1, vb); - vacc0 = _mm512_max_ph(voutput_min, vacc0); + _mm512_storeu_ph(o, vacc0); + _mm512_storeu_ph(o + 32, vacc1); + o += 64; + } + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + __m512h va = _mm512_loadu_ph(a); + a += 32; - vacc0 = _mm512_min_ph(voutput_max, vacc0); + __m512h vacc = _mm512_div_ph(va, vb); - _mm512_storeu_ph(o, vacc0); + _mm512_storeu_ph(o, vacc); o += 32; } if XNN_UNLIKELY(batch != 0) { @@ -64,8 +70,6 @@ void xnn_f16_vdivc_minmax_ukernel__avx512fp16_u32( __m512h vacc = _mm512_maskz_div_ph(vmask, va, vb); - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); } #endif // defined(__AVX512FP16__) diff --git a/src/f16-vbinary/gen/f16-vdivc-minmax-f16c-u16.c b/src/f16-vbinary/gen/f16-vdivc-f16c-u16.c similarity index 77% rename from src/f16-vbinary/gen/f16-vdivc-minmax-f16c-u16.c rename to src/f16-vbinary/gen/f16-vdivc-f16c-u16.c index fb78721723b..4e22f29a893 100644 --- a/src/f16-vbinary/gen/f16-vdivc-minmax-f16c-u16.c +++ b/src/f16-vbinary/gen/f16-vdivc-f16c-u16.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vdivc_minmax_ukernel__f16c_u16( +void xnn_f16_vdivc_ukernel__f16c_u16( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,11 +33,6 @@ void xnn_f16_vdivc_minmax_ukernel__f16c_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - 
XNN_FORCE_REALIZATION(vy_min); - XNN_FORCE_REALIZATION(vy_max); - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -48,12 +43,6 @@ void xnn_f16_vdivc_minmax_ukernel__f16c_u16( __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va456789AB, vb), _MM_FROUND_TO_NEAREST_INT)); - vy01234567 = _mm256_max_ps(vy01234567, vy_min); - vy456789AB = _mm256_max_ps(vy456789AB, vy_min); - - vy01234567 = _mm256_min_ps(vy01234567, vy_max); - vy456789AB = _mm256_min_ps(vy456789AB, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy01234567, _MM_FROUND_TO_NEAREST_INT)); _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_TO_NEAREST_INT)); o += 16; @@ -64,9 +53,6 @@ void xnn_f16_vdivc_minmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -75,9 +61,6 @@ void xnn_f16_vdivc_minmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vdivc-minmax-f16c-u8.c b/src/f16-vbinary/gen/f16-vdivc-f16c-u8.c similarity index 77% rename from src/f16-vbinary/gen/f16-vdivc-minmax-f16c-u8.c rename to src/f16-vbinary/gen/f16-vdivc-f16c-u8.c index fc717e030eb..176fdc07c26 100644 --- a/src/f16-vbinary/gen/f16-vdivc-minmax-f16c-u8.c +++ b/src/f16-vbinary/gen/f16-vdivc-f16c-u8.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vdivc_minmax_ukernel__f16c_u8( 
+void xnn_f16_vdivc_ukernel__f16c_u8( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,11 +33,6 @@ void xnn_f16_vdivc_minmax_ukernel__f16c_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vy_min); - XNN_FORCE_REALIZATION(vy_max); - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -45,9 +40,6 @@ void xnn_f16_vdivc_minmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -56,9 +48,6 @@ void xnn_f16_vdivc_minmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-u1.c b/src/f16-vbinary/gen/f16-vdivc-fp16arith-u1.c similarity index 74% rename from src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-u1.c rename to src/f16-vbinary/gen/f16-vdivc-fp16arith-u1.c index cbe4b877517..ce785970e05 100644 --- 
a/src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-u1.c +++ b/src/f16-vbinary/gen/f16-vdivc-fp16arith-u1.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. #include -#include #include @@ -18,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vdivc_minmax_ukernel__fp16arith_u1( +void xnn_f16_vdivc_ukernel__fp16arith_u1( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -35,16 +34,10 @@ void xnn_f16_vdivc_minmax_ukernel__fp16arith_u1( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - const float16_t vb = *b; do { float16_t vacc = *a++; vacc = vdivh_f16(vacc, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-u2.c b/src/f16-vbinary/gen/f16-vdivc-fp16arith-u2.c similarity index 70% rename from src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-u2.c rename to src/f16-vbinary/gen/f16-vdivc-fp16arith-u2.c index 23edb7074dd..6acd396ecc8 100644 --- a/src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-u2.c +++ b/src/f16-vbinary/gen/f16-vdivc-fp16arith-u2.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. 
#include -#include #include @@ -18,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vdivc_minmax_ukernel__fp16arith_u2( +void xnn_f16_vdivc_ukernel__fp16arith_u2( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -35,10 +34,6 @@ void xnn_f16_vdivc_minmax_ukernel__fp16arith_u2( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - const float16_t vb = *b; for (; batch >= 2 * sizeof(float16_t); batch -= 2 * sizeof(float16_t)) { float16_t vacc0 = a[0]; @@ -49,12 +44,6 @@ void xnn_f16_vdivc_minmax_ukernel__fp16arith_u2( vacc1 = vdivh_f16(vacc1, vb); - vacc0 = vmaxnmh_f16(vacc0, vy_min); - vacc1 = vmaxnmh_f16(vacc1, vy_min); - - vacc0 = vminnmh_f16(vacc0, vy_max); - vacc1 = vminnmh_f16(vacc1, vy_max); - o[0] = vacc0; o[1] = vacc1; o += 2; @@ -62,8 +51,6 @@ void xnn_f16_vdivc_minmax_ukernel__fp16arith_u2( if XNN_UNLIKELY(batch != 0) { float16_t vacc = *a; vacc = vdivh_f16(vacc, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o = vacc; } } diff --git a/src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-u4.c b/src/f16-vbinary/gen/f16-vdivc-fp16arith-u4.c similarity index 68% rename from src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-u4.c rename to src/f16-vbinary/gen/f16-vdivc-fp16arith-u4.c index 933cfd27baa..df7e39208ba 100644 --- a/src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-u4.c +++ b/src/f16-vbinary/gen/f16-vdivc-fp16arith-u4.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. 
#include -#include #include @@ -18,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vdivc_minmax_ukernel__fp16arith_u4( +void xnn_f16_vdivc_ukernel__fp16arith_u4( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -35,10 +34,6 @@ void xnn_f16_vdivc_minmax_ukernel__fp16arith_u4( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - const float16_t vb = *b; for (; batch >= 4 * sizeof(float16_t); batch -= 4 * sizeof(float16_t)) { float16_t vacc0 = a[0]; @@ -53,16 +48,6 @@ void xnn_f16_vdivc_minmax_ukernel__fp16arith_u4( vacc3 = vdivh_f16(vacc3, vb); - vacc0 = vmaxnmh_f16(vacc0, vy_min); - vacc1 = vmaxnmh_f16(vacc1, vy_min); - vacc2 = vmaxnmh_f16(vacc2, vy_min); - vacc3 = vmaxnmh_f16(vacc3, vy_min); - - vacc0 = vminnmh_f16(vacc0, vy_max); - vacc1 = vminnmh_f16(vacc1, vy_max); - vacc2 = vminnmh_f16(vacc2, vy_max); - vacc3 = vminnmh_f16(vacc3, vy_max); - o[0] = vacc0; o[1] = vacc1; o[2] = vacc2; @@ -73,8 +58,6 @@ void xnn_f16_vdivc_minmax_ukernel__fp16arith_u4( do { float16_t vacc = *a++; vacc = vdivh_f16(vacc, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/gen/f16-vdivc-minmax-avx512fp16-u64.c b/src/f16-vbinary/gen/f16-vdivc-minmax-avx512fp16-u64.c deleted file mode 100644 index 64cfc28a491..00000000000 --- a/src/f16-vbinary/gen/f16-vdivc-minmax-avx512fp16-u64.c +++ /dev/null @@ -1,89 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f16-vbinary/vopc-avx512fp16.c.in -// Generator: tools/xngen -// -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vbinary.h" - - -void xnn_f16_vdivc_minmax_ukernel__avx512fp16_u64( - size_t batch, - const xnn_float16* restrict input_a, - const xnn_float16* restrict input_b, - xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(uint16_t) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - -#if defined(__AVX512FP16__) - const uint16_t* a = (const uint16_t*) input_a; - const uint16_t* b = (const uint16_t*) input_b; - uint16_t* o = (uint16_t*) output; - - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - const __m512h vb = _mm512_castsi512_ph(_mm512_set1_epi16(*b)); - - - for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { - __m512h va0 = _mm512_loadu_ph(a); - __m512h va1 = _mm512_loadu_ph(a + 32); - a += 64; - - __m512h vacc0 = _mm512_div_ph(va0, vb); - __m512h vacc1 = _mm512_div_ph(va1, vb); - - - vacc0 = _mm512_max_ph(voutput_min, vacc0); - vacc1 = _mm512_max_ph(voutput_min, vacc1); - - vacc0 = _mm512_min_ph(voutput_max, vacc0); - vacc1 = _mm512_min_ph(voutput_max, vacc1); - - _mm512_storeu_ph(o, vacc0); - _mm512_storeu_ph(o + 32, vacc1); - o += 64; - } - for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { - __m512h va = _mm512_loadu_ph(a); - a += 32; - - __m512h vacc = _mm512_div_ph(va, vb); - - vacc = _mm512_max_ph(voutput_min, vacc); - vacc = _mm512_min_ph(voutput_max, vacc); - - 
_mm512_storeu_ph(o, vacc); - o += 32; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch >= 1 * sizeof(uint16_t)); - assert(batch <= 31 * sizeof(uint16_t)); - // Prepare mask for valid 16-bit elements (depends on batch). - batch >>= XNN_LOG2_SIZEOF_HALF; - const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - __m512h va = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, a)); - - __m512h vacc = _mm512_maskz_div_ph(vmask, va, vb); - - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); - _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); - } -#endif // defined(__AVX512FP16__) -} diff --git a/src/f16-vbinary/gen/f16-vmax-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vmax-avx512fp16-u32.c index 918617789e1..425d32e5da1 100644 --- a/src/f16-vbinary/gen/f16-vmax-avx512fp16-u32.c +++ b/src/f16-vbinary/gen/f16-vmax-avx512fp16-u32.c @@ -34,7 +34,6 @@ void xnn_f16_vmax_ukernel__avx512fp16_u32( uint16_t* o = (uint16_t*) output; - for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { const __m512h va = _mm512_loadu_ph(a); a += 32; @@ -43,7 +42,6 @@ void xnn_f16_vmax_ukernel__avx512fp16_u32( b += 32; - _mm512_storeu_ph(o, vacc); o += 32; } diff --git a/src/f16-vbinary/gen/f16-vmax-avx512fp16-u64.c b/src/f16-vbinary/gen/f16-vmax-avx512fp16-u64.c index 1e1f8eb43fb..325718f435d 100644 --- a/src/f16-vbinary/gen/f16-vmax-avx512fp16-u64.c +++ b/src/f16-vbinary/gen/f16-vmax-avx512fp16-u64.c @@ -34,7 +34,6 @@ void xnn_f16_vmax_ukernel__avx512fp16_u64( uint16_t* o = (uint16_t*) output; - for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { const __m512h va0 = _mm512_loadu_ph(a); const __m512h va1 = _mm512_loadu_ph(a + 32); @@ -45,7 +44,6 @@ void xnn_f16_vmax_ukernel__avx512fp16_u64( b += 64; - _mm512_storeu_ph(o, vacc0); _mm512_storeu_ph(o + 32, vacc1); o += 64; @@ -58,7 +56,6 @@ void xnn_f16_vmax_ukernel__avx512fp16_u64( b += 32; - 
_mm512_storeu_ph(o, vacc); o += 32; } diff --git a/src/f16-vbinary/gen/f16-vmax-f16c-u16.c b/src/f16-vbinary/gen/f16-vmax-f16c-u16.c index 52832498c46..56b7c8fe5a0 100644 --- a/src/f16-vbinary/gen/f16-vmax-f16c-u16.c +++ b/src/f16-vbinary/gen/f16-vmax-f16c-u16.c @@ -33,7 +33,6 @@ void xnn_f16_vmax_ukernel__f16c_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b)); @@ -46,7 +45,6 @@ void xnn_f16_vmax_ukernel__f16c_u16( __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va456789AB, vb456789AB), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy01234567, _MM_FROUND_TO_NEAREST_INT)); _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_TO_NEAREST_INT)); o += 16; @@ -59,7 +57,6 @@ void xnn_f16_vmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -69,7 +66,6 @@ void xnn_f16_vmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vmax-f16c-u8.c b/src/f16-vbinary/gen/f16-vmax-f16c-u8.c index 8ddf945fdd6..159d05b1c05 100644 --- a/src/f16-vbinary/gen/f16-vmax-f16c-u8.c +++ b/src/f16-vbinary/gen/f16-vmax-f16c-u8.c @@ -33,7 +33,6 @@ void xnn_f16_vmax_ukernel__f16c_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const __m256 va = 
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b)); @@ -42,7 +41,6 @@ void xnn_f16_vmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -52,7 +50,6 @@ void xnn_f16_vmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vmax-fp16arith-u1.c b/src/f16-vbinary/gen/f16-vmax-fp16arith-u1.c index 3491febf7a4..53742e14604 100644 --- a/src/f16-vbinary/gen/f16-vmax-fp16arith-u1.c +++ b/src/f16-vbinary/gen/f16-vmax-fp16arith-u1.c @@ -33,7 +33,6 @@ void xnn_f16_vmax_ukernel__fp16arith_u1( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - do { const float16_t va = *a++; const float16_t vb = *b++; diff --git a/src/f16-vbinary/gen/f16-vmax-fp16arith-u2.c b/src/f16-vbinary/gen/f16-vmax-fp16arith-u2.c index 555d746e5e5..1fcb40c2916 100644 --- a/src/f16-vbinary/gen/f16-vmax-fp16arith-u2.c +++ b/src/f16-vbinary/gen/f16-vmax-fp16arith-u2.c @@ -33,7 +33,6 @@ void xnn_f16_vmax_ukernel__fp16arith_u2( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - for (; batch >= 2 * sizeof(float16_t); batch -= 2 * sizeof(float16_t)) { const float16_t va0 = *a++; const float16_t va1 = *a++; @@ -45,7 +44,6 @@ void xnn_f16_vmax_ukernel__fp16arith_u2( float16_t vacc1 = vmaxnmh_f16(va1, vb1); - *o++ = vacc0; *o++ = vacc1; } diff --git a/src/f16-vbinary/gen/f16-vmax-fp16arith-u4.c b/src/f16-vbinary/gen/f16-vmax-fp16arith-u4.c index 29aecc73761..5e667905cc6 100644 --- a/src/f16-vbinary/gen/f16-vmax-fp16arith-u4.c +++ b/src/f16-vbinary/gen/f16-vmax-fp16arith-u4.c @@ -33,7 
+33,6 @@ void xnn_f16_vmax_ukernel__fp16arith_u4( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - for (; batch >= 4 * sizeof(float16_t); batch -= 4 * sizeof(float16_t)) { const float16_t va0 = *a++; const float16_t va1 = *a++; @@ -51,7 +50,6 @@ void xnn_f16_vmax_ukernel__fp16arith_u4( float16_t vacc3 = vmaxnmh_f16(va3, vb3); - *o++ = vacc0; *o++ = vacc1; *o++ = vacc2; diff --git a/src/f16-vbinary/gen/f16-vmax-neonfp16arith-u16.c b/src/f16-vbinary/gen/f16-vmax-neonfp16arith-u16.c index ca7486858fb..3a874c2037d 100644 --- a/src/f16-vbinary/gen/f16-vmax-neonfp16arith-u16.c +++ b/src/f16-vbinary/gen/f16-vmax-neonfp16arith-u16.c @@ -32,7 +32,6 @@ void xnn_f16_vmax_ukernel__neonfp16arith_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); b += 8; @@ -43,7 +42,6 @@ void xnn_f16_vmax_ukernel__neonfp16arith_u16( float16x8_t vy456789AB = vmaxq_f16(va456789AB, vb456789AB); - vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; vst1q_u16(o, vreinterpretq_u16_f16(vy456789AB)); o += 8; } diff --git a/src/f16-vbinary/gen/f16-vmax-neonfp16arith-u8.c b/src/f16-vbinary/gen/f16-vmax-neonfp16arith-u8.c index 4a178eab0e4..b8b34aa3bc0 100644 --- a/src/f16-vbinary/gen/f16-vmax-neonfp16arith-u8.c +++ b/src/f16-vbinary/gen/f16-vmax-neonfp16arith-u8.c @@ -32,7 +32,6 @@ void xnn_f16_vmax_ukernel__neonfp16arith_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); b += 8; diff --git a/src/f16-vbinary/gen/f16-vmaxc-avx512fp16-u32.c 
b/src/f16-vbinary/gen/f16-vmaxc-avx512fp16-u32.c index a9fbaaa0da1..f747f8deba6 100644 --- a/src/f16-vbinary/gen/f16-vmaxc-avx512fp16-u32.c +++ b/src/f16-vbinary/gen/f16-vmaxc-avx512fp16-u32.c @@ -44,7 +44,6 @@ void xnn_f16_vmaxc_ukernel__avx512fp16_u32( __m512h vacc0 = _mm512_max_ph(va0, vb); - _mm512_storeu_ph(o, vacc0); o += 32; } diff --git a/src/f16-vbinary/gen/f16-vmaxc-avx512fp16-u64.c b/src/f16-vbinary/gen/f16-vmaxc-avx512fp16-u64.c index b4dfb4bd431..fa258196bf7 100644 --- a/src/f16-vbinary/gen/f16-vmaxc-avx512fp16-u64.c +++ b/src/f16-vbinary/gen/f16-vmaxc-avx512fp16-u64.c @@ -46,7 +46,6 @@ void xnn_f16_vmaxc_ukernel__avx512fp16_u64( __m512h vacc1 = _mm512_max_ph(va1, vb); - _mm512_storeu_ph(o, vacc0); _mm512_storeu_ph(o + 32, vacc1); o += 64; @@ -57,7 +56,6 @@ void xnn_f16_vmaxc_ukernel__avx512fp16_u64( __m512h vacc = _mm512_max_ph(va, vb); - _mm512_storeu_ph(o, vacc); o += 32; } diff --git a/src/f16-vbinary/gen/f16-vmaxc-f16c-u16.c b/src/f16-vbinary/gen/f16-vmaxc-f16c-u16.c index 574bee1b74b..60baf110c85 100644 --- a/src/f16-vbinary/gen/f16-vmaxc-f16c-u16.c +++ b/src/f16-vbinary/gen/f16-vmaxc-f16c-u16.c @@ -33,7 +33,6 @@ void xnn_f16_vmaxc_ukernel__f16c_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -44,7 +43,6 @@ void xnn_f16_vmaxc_ukernel__f16c_u16( __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va456789AB, vb), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy01234567, _MM_FROUND_TO_NEAREST_INT)); _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_TO_NEAREST_INT)); o += 16; @@ -55,7 +53,6 @@ void xnn_f16_vmaxc_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); 
- _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -64,7 +61,6 @@ void xnn_f16_vmaxc_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vmaxc-f16c-u8.c b/src/f16-vbinary/gen/f16-vmaxc-f16c-u8.c index 67e2439c136..efcf1f986a8 100644 --- a/src/f16-vbinary/gen/f16-vmaxc-f16c-u8.c +++ b/src/f16-vbinary/gen/f16-vmaxc-f16c-u8.c @@ -33,7 +33,6 @@ void xnn_f16_vmaxc_ukernel__f16c_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -41,7 +40,6 @@ void xnn_f16_vmaxc_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -50,7 +48,6 @@ void xnn_f16_vmaxc_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vmaxc-fp16arith-u1.c b/src/f16-vbinary/gen/f16-vmaxc-fp16arith-u1.c index 3dc5c4dc288..dfecea90a99 100644 --- a/src/f16-vbinary/gen/f16-vmaxc-fp16arith-u1.c +++ b/src/f16-vbinary/gen/f16-vmaxc-fp16arith-u1.c @@ -34,7 +34,6 @@ void xnn_f16_vmaxc_ukernel__fp16arith_u1( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - const float16_t vb = *b; do { float16_t vacc = *a++; diff --git a/src/f16-vbinary/gen/f16-vmaxc-fp16arith-u2.c 
b/src/f16-vbinary/gen/f16-vmaxc-fp16arith-u2.c index b7824942e4e..bbdd63847b0 100644 --- a/src/f16-vbinary/gen/f16-vmaxc-fp16arith-u2.c +++ b/src/f16-vbinary/gen/f16-vmaxc-fp16arith-u2.c @@ -34,7 +34,6 @@ void xnn_f16_vmaxc_ukernel__fp16arith_u2( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - const float16_t vb = *b; for (; batch >= 2 * sizeof(float16_t); batch -= 2 * sizeof(float16_t)) { float16_t vacc0 = a[0]; @@ -45,7 +44,6 @@ void xnn_f16_vmaxc_ukernel__fp16arith_u2( vacc1 = vmaxnmh_f16(vacc1, vb); - o[0] = vacc0; o[1] = vacc1; o += 2; diff --git a/src/f16-vbinary/gen/f16-vmaxc-fp16arith-u4.c b/src/f16-vbinary/gen/f16-vmaxc-fp16arith-u4.c index fbcda0efe90..9d4f93bbb71 100644 --- a/src/f16-vbinary/gen/f16-vmaxc-fp16arith-u4.c +++ b/src/f16-vbinary/gen/f16-vmaxc-fp16arith-u4.c @@ -34,7 +34,6 @@ void xnn_f16_vmaxc_ukernel__fp16arith_u4( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - const float16_t vb = *b; for (; batch >= 4 * sizeof(float16_t); batch -= 4 * sizeof(float16_t)) { float16_t vacc0 = a[0]; @@ -49,7 +48,6 @@ void xnn_f16_vmaxc_ukernel__fp16arith_u4( vacc3 = vmaxnmh_f16(vacc3, vb); - o[0] = vacc0; o[1] = vacc1; o[2] = vacc2; diff --git a/src/f16-vbinary/gen/f16-vmaxc-neonfp16arith-u16.c b/src/f16-vbinary/gen/f16-vmaxc-neonfp16arith-u16.c index 8efd61782f3..92fc359a21b 100644 --- a/src/f16-vbinary/gen/f16-vmaxc-neonfp16arith-u16.c +++ b/src/f16-vbinary/gen/f16-vmaxc-neonfp16arith-u16.c @@ -32,7 +32,6 @@ void xnn_f16_vmaxc_ukernel__neonfp16arith_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; @@ -42,7 +41,6 @@ void xnn_f16_vmaxc_ukernel__neonfp16arith_u16( float16x8_t vy456789AB = vmaxq_f16(va456789AB, vb); - vst1q_u16(o, 
vreinterpretq_u16_f16(vy01234567)); o += 8; vst1q_u16(o, vreinterpretq_u16_f16(vy456789AB)); o += 8; } diff --git a/src/f16-vbinary/gen/f16-vmaxc-neonfp16arith-u8.c b/src/f16-vbinary/gen/f16-vmaxc-neonfp16arith-u8.c index 20522fc90be..22d4d540369 100644 --- a/src/f16-vbinary/gen/f16-vmaxc-neonfp16arith-u8.c +++ b/src/f16-vbinary/gen/f16-vmaxc-neonfp16arith-u8.c @@ -32,7 +32,6 @@ void xnn_f16_vmaxc_ukernel__neonfp16arith_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; diff --git a/src/f16-vbinary/gen/f16-vmin-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vmin-avx512fp16-u32.c index a49a8cde1b6..bd87e5ac3fc 100644 --- a/src/f16-vbinary/gen/f16-vmin-avx512fp16-u32.c +++ b/src/f16-vbinary/gen/f16-vmin-avx512fp16-u32.c @@ -34,7 +34,6 @@ void xnn_f16_vmin_ukernel__avx512fp16_u32( uint16_t* o = (uint16_t*) output; - for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { const __m512h va = _mm512_loadu_ph(a); a += 32; @@ -43,7 +42,6 @@ void xnn_f16_vmin_ukernel__avx512fp16_u32( b += 32; - _mm512_storeu_ph(o, vacc); o += 32; } diff --git a/src/f16-vbinary/gen/f16-vmin-avx512fp16-u64.c b/src/f16-vbinary/gen/f16-vmin-avx512fp16-u64.c index dbcd42e3859..1c26929a7e2 100644 --- a/src/f16-vbinary/gen/f16-vmin-avx512fp16-u64.c +++ b/src/f16-vbinary/gen/f16-vmin-avx512fp16-u64.c @@ -34,7 +34,6 @@ void xnn_f16_vmin_ukernel__avx512fp16_u64( uint16_t* o = (uint16_t*) output; - for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { const __m512h va0 = _mm512_loadu_ph(a); const __m512h va1 = _mm512_loadu_ph(a + 32); @@ -45,7 +44,6 @@ void xnn_f16_vmin_ukernel__avx512fp16_u64( b += 64; - _mm512_storeu_ph(o, vacc0); _mm512_storeu_ph(o + 32, vacc1); o += 64; @@ -58,7 +56,6 @@ void 
xnn_f16_vmin_ukernel__avx512fp16_u64( b += 32; - _mm512_storeu_ph(o, vacc); o += 32; } diff --git a/src/f16-vbinary/gen/f16-vmin-f16c-u16.c b/src/f16-vbinary/gen/f16-vmin-f16c-u16.c index c09e3600699..f6f0ffc6657 100644 --- a/src/f16-vbinary/gen/f16-vmin-f16c-u16.c +++ b/src/f16-vbinary/gen/f16-vmin-f16c-u16.c @@ -33,7 +33,6 @@ void xnn_f16_vmin_ukernel__f16c_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b)); @@ -46,7 +45,6 @@ void xnn_f16_vmin_ukernel__f16c_u16( __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va456789AB, vb456789AB), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy01234567, _MM_FROUND_TO_NEAREST_INT)); _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_TO_NEAREST_INT)); o += 16; @@ -59,7 +57,6 @@ void xnn_f16_vmin_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -69,7 +66,6 @@ void xnn_f16_vmin_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vmin-f16c-u8.c b/src/f16-vbinary/gen/f16-vmin-f16c-u8.c index 1a9f142fe92..93dad2fa14d 100644 --- a/src/f16-vbinary/gen/f16-vmin-f16c-u8.c +++ b/src/f16-vbinary/gen/f16-vmin-f16c-u8.c @@ -33,7 +33,6 @@ void xnn_f16_vmin_ukernel__f16c_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * 
sizeof(uint16_t)) { const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b)); @@ -42,7 +41,6 @@ void xnn_f16_vmin_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -52,7 +50,6 @@ void xnn_f16_vmin_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vmin-fp16arith-u1.c b/src/f16-vbinary/gen/f16-vmin-fp16arith-u1.c index 2710e7d1908..70a9fe18903 100644 --- a/src/f16-vbinary/gen/f16-vmin-fp16arith-u1.c +++ b/src/f16-vbinary/gen/f16-vmin-fp16arith-u1.c @@ -33,7 +33,6 @@ void xnn_f16_vmin_ukernel__fp16arith_u1( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - do { const float16_t va = *a++; const float16_t vb = *b++; diff --git a/src/f16-vbinary/gen/f16-vmin-fp16arith-u2.c b/src/f16-vbinary/gen/f16-vmin-fp16arith-u2.c index 4103af5cb11..5095648c2a8 100644 --- a/src/f16-vbinary/gen/f16-vmin-fp16arith-u2.c +++ b/src/f16-vbinary/gen/f16-vmin-fp16arith-u2.c @@ -33,7 +33,6 @@ void xnn_f16_vmin_ukernel__fp16arith_u2( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - for (; batch >= 2 * sizeof(float16_t); batch -= 2 * sizeof(float16_t)) { const float16_t va0 = *a++; const float16_t va1 = *a++; @@ -45,7 +44,6 @@ void xnn_f16_vmin_ukernel__fp16arith_u2( float16_t vacc1 = vminnmh_f16(va1, vb1); - *o++ = vacc0; *o++ = vacc1; } diff --git a/src/f16-vbinary/gen/f16-vmin-fp16arith-u4.c b/src/f16-vbinary/gen/f16-vmin-fp16arith-u4.c index 2da90cd5800..1f3ac8532f4 100644 --- a/src/f16-vbinary/gen/f16-vmin-fp16arith-u4.c +++ 
b/src/f16-vbinary/gen/f16-vmin-fp16arith-u4.c @@ -33,7 +33,6 @@ void xnn_f16_vmin_ukernel__fp16arith_u4( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - for (; batch >= 4 * sizeof(float16_t); batch -= 4 * sizeof(float16_t)) { const float16_t va0 = *a++; const float16_t va1 = *a++; @@ -51,7 +50,6 @@ void xnn_f16_vmin_ukernel__fp16arith_u4( float16_t vacc3 = vminnmh_f16(va3, vb3); - *o++ = vacc0; *o++ = vacc1; *o++ = vacc2; diff --git a/src/f16-vbinary/gen/f16-vmin-neonfp16arith-u16.c b/src/f16-vbinary/gen/f16-vmin-neonfp16arith-u16.c index 2d0210d3f81..ee8c403fa75 100644 --- a/src/f16-vbinary/gen/f16-vmin-neonfp16arith-u16.c +++ b/src/f16-vbinary/gen/f16-vmin-neonfp16arith-u16.c @@ -32,7 +32,6 @@ void xnn_f16_vmin_ukernel__neonfp16arith_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); b += 8; @@ -43,7 +42,6 @@ void xnn_f16_vmin_ukernel__neonfp16arith_u16( float16x8_t vy456789AB = vminq_f16(va456789AB, vb456789AB); - vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; vst1q_u16(o, vreinterpretq_u16_f16(vy456789AB)); o += 8; } diff --git a/src/f16-vbinary/gen/f16-vmin-neonfp16arith-u8.c b/src/f16-vbinary/gen/f16-vmin-neonfp16arith-u8.c index 80de587f4d4..305b0fe7392 100644 --- a/src/f16-vbinary/gen/f16-vmin-neonfp16arith-u8.c +++ b/src/f16-vbinary/gen/f16-vmin-neonfp16arith-u8.c @@ -32,7 +32,6 @@ void xnn_f16_vmin_ukernel__neonfp16arith_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); b += 8; diff --git 
a/src/f16-vbinary/gen/f16-vminc-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vminc-avx512fp16-u32.c index 35f68f2c732..de34ee11427 100644 --- a/src/f16-vbinary/gen/f16-vminc-avx512fp16-u32.c +++ b/src/f16-vbinary/gen/f16-vminc-avx512fp16-u32.c @@ -44,7 +44,6 @@ void xnn_f16_vminc_ukernel__avx512fp16_u32( __m512h vacc0 = _mm512_min_ph(va0, vb); - _mm512_storeu_ph(o, vacc0); o += 32; } diff --git a/src/f16-vbinary/gen/f16-vminc-avx512fp16-u64.c b/src/f16-vbinary/gen/f16-vminc-avx512fp16-u64.c index ca8fadf664d..b7057056b4a 100644 --- a/src/f16-vbinary/gen/f16-vminc-avx512fp16-u64.c +++ b/src/f16-vbinary/gen/f16-vminc-avx512fp16-u64.c @@ -46,7 +46,6 @@ void xnn_f16_vminc_ukernel__avx512fp16_u64( __m512h vacc1 = _mm512_min_ph(va1, vb); - _mm512_storeu_ph(o, vacc0); _mm512_storeu_ph(o + 32, vacc1); o += 64; @@ -57,7 +56,6 @@ void xnn_f16_vminc_ukernel__avx512fp16_u64( __m512h vacc = _mm512_min_ph(va, vb); - _mm512_storeu_ph(o, vacc); o += 32; } diff --git a/src/f16-vbinary/gen/f16-vminc-f16c-u16.c b/src/f16-vbinary/gen/f16-vminc-f16c-u16.c index 654af840e57..55b77aa56e0 100644 --- a/src/f16-vbinary/gen/f16-vminc-f16c-u16.c +++ b/src/f16-vbinary/gen/f16-vminc-f16c-u16.c @@ -33,7 +33,6 @@ void xnn_f16_vminc_ukernel__f16c_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -44,7 +43,6 @@ void xnn_f16_vminc_ukernel__f16c_u16( __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va456789AB, vb), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy01234567, _MM_FROUND_TO_NEAREST_INT)); _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_TO_NEAREST_INT)); o += 16; @@ -55,7 +53,6 @@ void xnn_f16_vminc_ukernel__f16c_u16( __m256 vy = 
_mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -64,7 +61,6 @@ void xnn_f16_vminc_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vminc-f16c-u8.c b/src/f16-vbinary/gen/f16-vminc-f16c-u8.c index bb4d3e842ea..83ecebe7f7f 100644 --- a/src/f16-vbinary/gen/f16-vminc-f16c-u8.c +++ b/src/f16-vbinary/gen/f16-vminc-f16c-u8.c @@ -33,7 +33,6 @@ void xnn_f16_vminc_ukernel__f16c_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -41,7 +40,6 @@ void xnn_f16_vminc_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -50,7 +48,6 @@ void xnn_f16_vminc_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vminc-fp16arith-u1.c b/src/f16-vbinary/gen/f16-vminc-fp16arith-u1.c index 902ca402cfe..a0c3760614f 100644 --- a/src/f16-vbinary/gen/f16-vminc-fp16arith-u1.c +++ b/src/f16-vbinary/gen/f16-vminc-fp16arith-u1.c @@ -34,7 +34,6 @@ void xnn_f16_vminc_ukernel__fp16arith_u1( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - const float16_t vb = *b; do { float16_t 
vacc = *a++; diff --git a/src/f16-vbinary/gen/f16-vminc-fp16arith-u2.c b/src/f16-vbinary/gen/f16-vminc-fp16arith-u2.c index 25a6a2f12f3..5e08cbf4ebd 100644 --- a/src/f16-vbinary/gen/f16-vminc-fp16arith-u2.c +++ b/src/f16-vbinary/gen/f16-vminc-fp16arith-u2.c @@ -34,7 +34,6 @@ void xnn_f16_vminc_ukernel__fp16arith_u2( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - const float16_t vb = *b; for (; batch >= 2 * sizeof(float16_t); batch -= 2 * sizeof(float16_t)) { float16_t vacc0 = a[0]; @@ -45,7 +44,6 @@ void xnn_f16_vminc_ukernel__fp16arith_u2( vacc1 = vminnmh_f16(vacc1, vb); - o[0] = vacc0; o[1] = vacc1; o += 2; diff --git a/src/f16-vbinary/gen/f16-vminc-fp16arith-u4.c b/src/f16-vbinary/gen/f16-vminc-fp16arith-u4.c index 53f97acb9ad..de1b78b72eb 100644 --- a/src/f16-vbinary/gen/f16-vminc-fp16arith-u4.c +++ b/src/f16-vbinary/gen/f16-vminc-fp16arith-u4.c @@ -34,7 +34,6 @@ void xnn_f16_vminc_ukernel__fp16arith_u4( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - const float16_t vb = *b; for (; batch >= 4 * sizeof(float16_t); batch -= 4 * sizeof(float16_t)) { float16_t vacc0 = a[0]; @@ -49,7 +48,6 @@ void xnn_f16_vminc_ukernel__fp16arith_u4( vacc3 = vminnmh_f16(vacc3, vb); - o[0] = vacc0; o[1] = vacc1; o[2] = vacc2; diff --git a/src/f16-vbinary/gen/f16-vminc-neonfp16arith-u16.c b/src/f16-vbinary/gen/f16-vminc-neonfp16arith-u16.c index 4e2b22e260a..bbc31879504 100644 --- a/src/f16-vbinary/gen/f16-vminc-neonfp16arith-u16.c +++ b/src/f16-vbinary/gen/f16-vminc-neonfp16arith-u16.c @@ -32,7 +32,6 @@ void xnn_f16_vminc_ukernel__neonfp16arith_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; @@ -42,7 +41,6 @@ void xnn_f16_vminc_ukernel__neonfp16arith_u16( 
float16x8_t vy456789AB = vminq_f16(va456789AB, vb); - vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; vst1q_u16(o, vreinterpretq_u16_f16(vy456789AB)); o += 8; } diff --git a/src/f16-vbinary/gen/f16-vminc-neonfp16arith-u8.c b/src/f16-vbinary/gen/f16-vminc-neonfp16arith-u8.c index e2be12a826d..d8efd505476 100644 --- a/src/f16-vbinary/gen/f16-vminc-neonfp16arith-u8.c +++ b/src/f16-vbinary/gen/f16-vminc-neonfp16arith-u8.c @@ -32,7 +32,6 @@ void xnn_f16_vminc_ukernel__neonfp16arith_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; diff --git a/src/f16-vbinary/gen/f16-vmul-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vmul-avx512fp16-u32.c new file mode 100644 index 00000000000..5e03d75835a --- /dev/null +++ b/src/f16-vbinary/gen/f16-vmul-avx512fp16-u32.c @@ -0,0 +1,63 @@ +// Auto-generated file. Do not edit! +// Template: src/f16-vbinary/vop-avx512fp16.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/vbinary.h" + + +void xnn_f16_vmul_ukernel__avx512fp16_u32( + size_t batch, + const xnn_float16* restrict input_a, + const xnn_float16* restrict input_b, + xnn_float16* restrict output, + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint16_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + +#if defined(__AVX512FP16__) + const uint16_t* a = (const uint16_t*) input_a; + const uint16_t* b = (const uint16_t*) input_b; + uint16_t* o = (uint16_t*) output; + + + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + const __m512h va = _mm512_loadu_ph(a); + a += 32; + + __m512h vacc = _mm512_mul_ph(va, _mm512_loadu_ph(b)); + b += 32; + + + _mm512_storeu_ph(o, vacc); + o += 32; + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint16_t)); + assert(batch <= 31 * sizeof(uint16_t)); + // Prepare mask for valid 16-bit elements (depends on batch). 
+ batch >>= XNN_LOG2_SIZEOF_HALF; + const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + const __m512h va = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, a)); + + __m512h vacc = _mm512_maskz_mul_ph(vmask, va, _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, b))); + + + _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); + } +#endif // defined(__AVX512FP16__) +} diff --git a/src/f16-vbinary/gen/f16-vmul-minmax-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vmul-avx512fp16-u64.c similarity index 75% rename from src/f16-vbinary/gen/f16-vmul-minmax-avx512fp16-u32.c rename to src/f16-vbinary/gen/f16-vmul-avx512fp16-u64.c index 3c79d9b6cc5..d77e1daff86 100644 --- a/src/f16-vbinary/gen/f16-vmul-minmax-avx512fp16-u32.c +++ b/src/f16-vbinary/gen/f16-vmul-avx512fp16-u64.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vmul_minmax_ukernel__avx512fp16_u32( +void xnn_f16_vmul_ukernel__avx512fp16_u64( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,10 +33,21 @@ void xnn_f16_vmul_minmax_ukernel__avx512fp16_u32( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); + for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { + const __m512h va0 = _mm512_loadu_ph(a); + const __m512h va1 = _mm512_loadu_ph(a + 32); + a += 64; + __m512h vacc0 = _mm512_mul_ph(va0, _mm512_loadu_ph(b)); + __m512h vacc1 = _mm512_mul_ph(va1, _mm512_loadu_ph(b + 32)); + b += 64; + + + _mm512_storeu_ph(o, 
vacc0); + _mm512_storeu_ph(o + 32, vacc1); + o += 64; + } for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { const __m512h va = _mm512_loadu_ph(a); a += 32; @@ -45,9 +56,6 @@ void xnn_f16_vmul_minmax_ukernel__avx512fp16_u32( b += 32; - vacc = _mm512_max_ph(voutput_min, vacc); - vacc = _mm512_min_ph(voutput_max, vacc); - _mm512_storeu_ph(o, vacc); o += 32; } @@ -63,8 +71,6 @@ void xnn_f16_vmul_minmax_ukernel__avx512fp16_u32( __m512h vacc = _mm512_maskz_mul_ph(vmask, va, _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, b))); - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); } #endif // defined(__AVX512FP16__) diff --git a/src/f16-vbinary/gen/f16-vmul-minmax-f16c-u16.c b/src/f16-vbinary/gen/f16-vmul-f16c-u16.c similarity index 79% rename from src/f16-vbinary/gen/f16-vmul-minmax-f16c-u16.c rename to src/f16-vbinary/gen/f16-vmul-f16c-u16.c index caab65b07e9..748838a7e57 100644 --- a/src/f16-vbinary/gen/f16-vmul-minmax-f16c-u16.c +++ b/src/f16-vbinary/gen/f16-vmul-f16c-u16.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vmul_minmax_ukernel__f16c_u16( +void xnn_f16_vmul_ukernel__f16c_u16( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,11 +33,6 @@ void xnn_f16_vmul_minmax_ukernel__f16c_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vy_min); - 
XNN_FORCE_REALIZATION(vy_max); - for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b)); @@ -50,12 +45,6 @@ void xnn_f16_vmul_minmax_ukernel__f16c_u16( __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va456789AB, vb456789AB), _MM_FROUND_TO_NEAREST_INT)); - vy01234567 = _mm256_max_ps(vy01234567, vy_min); - vy456789AB = _mm256_max_ps(vy456789AB, vy_min); - - vy01234567 = _mm256_min_ps(vy01234567, vy_max); - vy456789AB = _mm256_min_ps(vy456789AB, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy01234567, _MM_FROUND_TO_NEAREST_INT)); _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_TO_NEAREST_INT)); o += 16; @@ -68,9 +57,6 @@ void xnn_f16_vmul_minmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -80,9 +66,6 @@ void xnn_f16_vmul_minmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vmul-minmax-f16c-u8.c b/src/f16-vbinary/gen/f16-vmul-f16c-u8.c similarity index 78% rename from src/f16-vbinary/gen/f16-vmul-minmax-f16c-u8.c rename to src/f16-vbinary/gen/f16-vmul-f16c-u8.c index f6961bdc2d5..1f79fb7c623 100644 --- a/src/f16-vbinary/gen/f16-vmul-minmax-f16c-u8.c +++ b/src/f16-vbinary/gen/f16-vmul-f16c-u8.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vmul_minmax_ukernel__f16c_u8( +void 
xnn_f16_vmul_ukernel__f16c_u8( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,11 +33,6 @@ void xnn_f16_vmul_minmax_ukernel__f16c_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vy_min); - XNN_FORCE_REALIZATION(vy_max); - for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b)); @@ -46,9 +41,6 @@ void xnn_f16_vmul_minmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -58,9 +50,6 @@ void xnn_f16_vmul_minmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-u1.c b/src/f16-vbinary/gen/f16-vmul-fp16arith-u1.c similarity index 74% rename from src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-u1.c rename to src/f16-vbinary/gen/f16-vmul-fp16arith-u1.c index bcde230442e..bb7920fc51f 100644 --- 
a/src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-u1.c +++ b/src/f16-vbinary/gen/f16-vmul-fp16arith-u1.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. #include -#include #include @@ -17,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vmul_minmax_ukernel__fp16arith_u1( +void xnn_f16_vmul_ukernel__fp16arith_u1( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -34,16 +33,10 @@ void xnn_f16_vmul_minmax_ukernel__fp16arith_u1( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - do { const float16_t va = *a++; const float16_t vb = *b++; float16_t vacc = vmulh_f16(va, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-u2.c b/src/f16-vbinary/gen/f16-vmul-fp16arith-u2.c similarity index 71% rename from src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-u2.c rename to src/f16-vbinary/gen/f16-vmul-fp16arith-u2.c index 0f4d19b70e8..9c341402fd1 100644 --- a/src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-u2.c +++ b/src/f16-vbinary/gen/f16-vmul-fp16arith-u2.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. 
#include -#include #include @@ -17,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vmul_minmax_ukernel__fp16arith_u2( +void xnn_f16_vmul_ukernel__fp16arith_u2( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -34,10 +33,6 @@ void xnn_f16_vmul_minmax_ukernel__fp16arith_u2( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - for (; batch >= 2 * sizeof(float16_t); batch -= 2 * sizeof(float16_t)) { const float16_t va0 = *a++; const float16_t va1 = *a++; @@ -49,12 +44,6 @@ void xnn_f16_vmul_minmax_ukernel__fp16arith_u2( float16_t vacc1 = vmulh_f16(va1, vb1); - vacc0 = vmaxnmh_f16(vacc0, vy_min); - vacc1 = vmaxnmh_f16(vacc1, vy_min); - - vacc0 = vminnmh_f16(vacc0, vy_max); - vacc1 = vminnmh_f16(vacc1, vy_max); - *o++ = vacc0; *o++ = vacc1; } @@ -62,8 +51,6 @@ void xnn_f16_vmul_minmax_ukernel__fp16arith_u2( const float16_t va = *a; const float16_t vb = *b; float16_t vacc = vmulh_f16(va, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o = vacc; } } diff --git a/src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-u4.c b/src/f16-vbinary/gen/f16-vmul-fp16arith-u4.c similarity index 70% rename from src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-u4.c rename to src/f16-vbinary/gen/f16-vmul-fp16arith-u4.c index 8e546c1da0d..9b62617c05e 100644 --- a/src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-u4.c +++ b/src/f16-vbinary/gen/f16-vmul-fp16arith-u4.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. 
#include -#include #include @@ -17,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vmul_minmax_ukernel__fp16arith_u4( +void xnn_f16_vmul_ukernel__fp16arith_u4( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -34,10 +33,6 @@ void xnn_f16_vmul_minmax_ukernel__fp16arith_u4( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - for (; batch >= 4 * sizeof(float16_t); batch -= 4 * sizeof(float16_t)) { const float16_t va0 = *a++; const float16_t va1 = *a++; @@ -55,16 +50,6 @@ void xnn_f16_vmul_minmax_ukernel__fp16arith_u4( float16_t vacc3 = vmulh_f16(va3, vb3); - vacc0 = vmaxnmh_f16(vacc0, vy_min); - vacc1 = vmaxnmh_f16(vacc1, vy_min); - vacc2 = vmaxnmh_f16(vacc2, vy_min); - vacc3 = vmaxnmh_f16(vacc3, vy_min); - - vacc0 = vminnmh_f16(vacc0, vy_max); - vacc1 = vminnmh_f16(vacc1, vy_max); - vacc2 = vminnmh_f16(vacc2, vy_max); - vacc3 = vminnmh_f16(vacc3, vy_max); - *o++ = vacc0; *o++ = vacc1; *o++ = vacc2; @@ -75,8 +60,6 @@ void xnn_f16_vmul_minmax_ukernel__fp16arith_u4( const float16_t va = *a++; const float16_t vb = *b++; float16_t vacc = vmulh_f16(va, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/gen/f16-vmul-minmax-avx512fp16-u64.c b/src/f16-vbinary/gen/f16-vmul-minmax-avx512fp16-u64.c deleted file mode 100644 index fc3aa6e1a5a..00000000000 --- a/src/f16-vbinary/gen/f16-vmul-minmax-avx512fp16-u64.c +++ /dev/null @@ -1,91 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f16-vbinary/vop-avx512fp16.c.in -// Generator: tools/xngen -// -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f16_vmul_minmax_ukernel__avx512fp16_u64( - size_t batch, - const xnn_float16* restrict input_a, - const xnn_float16* restrict input_b, - xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(uint16_t) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - -#if defined(__AVX512FP16__) - const uint16_t* a = (const uint16_t*) input_a; - const uint16_t* b = (const uint16_t*) input_b; - uint16_t* o = (uint16_t*) output; - - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - - - for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { - const __m512h va0 = _mm512_loadu_ph(a); - const __m512h va1 = _mm512_loadu_ph(a + 32); - a += 64; - - __m512h vacc0 = _mm512_mul_ph(va0, _mm512_loadu_ph(b)); - __m512h vacc1 = _mm512_mul_ph(va1, _mm512_loadu_ph(b + 32)); - b += 64; - - - vacc0 = _mm512_max_ph(voutput_min, vacc0); - vacc1 = _mm512_max_ph(voutput_min, vacc1); - - vacc0 = _mm512_min_ph(voutput_max, vacc0); - vacc1 = _mm512_min_ph(voutput_max, vacc1); - - _mm512_storeu_ph(o, vacc0); - _mm512_storeu_ph(o + 32, vacc1); - o += 64; - } - for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { - const __m512h va = _mm512_loadu_ph(a); - a += 32; - - __m512h vacc = _mm512_mul_ph(va, _mm512_loadu_ph(b)); - b += 32; - - - vacc = _mm512_max_ph(voutput_min, vacc); - vacc = _mm512_min_ph(voutput_max, vacc); - - 
_mm512_storeu_ph(o, vacc); - o += 32; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch >= 1 * sizeof(uint16_t)); - assert(batch <= 31 * sizeof(uint16_t)); - // Prepare mask for valid 16-bit elements (depends on batch). - batch >>= XNN_LOG2_SIZEOF_HALF; - const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - const __m512h va = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, a)); - - __m512h vacc = _mm512_maskz_mul_ph(vmask, va, _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, b))); - - - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); - _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); - } -#endif // defined(__AVX512FP16__) -} diff --git a/src/f16-vbinary/gen/f16-vmul-minmax-neonfp16arith-u16.c b/src/f16-vbinary/gen/f16-vmul-neonfp16arith-u16.c similarity index 77% rename from src/f16-vbinary/gen/f16-vmul-minmax-neonfp16arith-u16.c rename to src/f16-vbinary/gen/f16-vmul-neonfp16arith-u16.c index 32489c69376..a8f296d4787 100644 --- a/src/f16-vbinary/gen/f16-vmul-minmax-neonfp16arith-u16.c +++ b/src/f16-vbinary/gen/f16-vmul-neonfp16arith-u16.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vmul_minmax_ukernel__neonfp16arith_u16( +void xnn_f16_vmul_ukernel__neonfp16arith_u16( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -32,9 +32,6 @@ void xnn_f16_vmul_minmax_ukernel__neonfp16arith_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = 
vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); b += 8; @@ -45,12 +42,6 @@ void xnn_f16_vmul_minmax_ukernel__neonfp16arith_u16( float16x8_t vy456789AB = vmulq_f16(va456789AB, vb456789AB); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy456789AB = vmaxq_f16(vy456789AB, vy_min); - - vy01234567 = vminq_f16(vy01234567, vy_max); - vy456789AB = vminq_f16(vy456789AB, vy_max); - vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; vst1q_u16(o, vreinterpretq_u16_f16(vy456789AB)); o += 8; } @@ -59,8 +50,6 @@ void xnn_f16_vmul_minmax_ukernel__neonfp16arith_u16( const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); b += 8; float16x8_t vy01234567 = vmulq_f16(va01234567, vb01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { @@ -68,8 +57,6 @@ void xnn_f16_vmul_minmax_ukernel__neonfp16arith_u16( const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); float16x8_t vy01234567 = vmulq_f16(va01234567, vb01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/gen/f16-vmul-minmax-neonfp16arith-u8.c b/src/f16-vbinary/gen/f16-vmul-neonfp16arith-u8.c similarity index 77% rename from src/f16-vbinary/gen/f16-vmul-minmax-neonfp16arith-u8.c rename to src/f16-vbinary/gen/f16-vmul-neonfp16arith-u8.c index 149e60556ae..1443e6e4dc3 100644 --- a/src/f16-vbinary/gen/f16-vmul-minmax-neonfp16arith-u8.c +++ b/src/f16-vbinary/gen/f16-vmul-neonfp16arith-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void 
xnn_f16_vmul_minmax_ukernel__neonfp16arith_u8( +void xnn_f16_vmul_ukernel__neonfp16arith_u8( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -32,16 +32,11 @@ void xnn_f16_vmul_minmax_ukernel__neonfp16arith_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); b += 8; float16x8_t vy01234567 = vmulq_f16(va01234567, vb01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { @@ -49,8 +44,6 @@ void xnn_f16_vmul_minmax_ukernel__neonfp16arith_u8( const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); float16x8_t vy01234567 = vmulq_f16(va01234567, vb01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/gen/f16-vmulc-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vmulc-avx512fp16-u32.c new file mode 100644 index 00000000000..def97162b58 --- /dev/null +++ b/src/f16-vbinary/gen/f16-vmulc-avx512fp16-u32.c @@ -0,0 +1,64 @@ +// Auto-generated file. Do not edit! 
+// Template: src/f16-vbinary/vopc-avx512fp16.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vbinary.h" + + +void xnn_f16_vmulc_ukernel__avx512fp16_u32( + size_t batch, + const xnn_float16* restrict input_a, + const xnn_float16* restrict input_b, + xnn_float16* restrict output, + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint16_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + +#if defined(__AVX512FP16__) + const uint16_t* a = (const uint16_t*) input_a; + const uint16_t* b = (const uint16_t*) input_b; + uint16_t* o = (uint16_t*) output; + + const __m512h vb = _mm512_castsi512_ph(_mm512_set1_epi16(*b)); + + + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + __m512h va0 = _mm512_loadu_ph(a); + a += 32; + + __m512h vacc0 = _mm512_mul_ph(va0, vb); + + + _mm512_storeu_ph(o, vacc0); + o += 32; + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint16_t)); + assert(batch <= 31 * sizeof(uint16_t)); + // Prepare mask for valid 16-bit elements (depends on batch). 
+ batch >>= XNN_LOG2_SIZEOF_HALF; + const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + __m512h va = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, a)); + + __m512h vacc = _mm512_maskz_mul_ph(vmask, va, vb); + + _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); + } +#endif // defined(__AVX512FP16__) +} diff --git a/src/f16-vbinary/gen/f16-vmulc-minmax-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vmulc-avx512fp16-u64.c similarity index 75% rename from src/f16-vbinary/gen/f16-vmulc-minmax-avx512fp16-u32.c rename to src/f16-vbinary/gen/f16-vmulc-avx512fp16-u64.c index bb7c2e956aa..8be7fcde7cc 100644 --- a/src/f16-vbinary/gen/f16-vmulc-minmax-avx512fp16-u32.c +++ b/src/f16-vbinary/gen/f16-vmulc-avx512fp16-u64.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vmulc_minmax_ukernel__avx512fp16_u32( +void xnn_f16_vmulc_ukernel__avx512fp16_u64( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -34,23 +34,29 @@ void xnn_f16_vmulc_minmax_ukernel__avx512fp16_u32( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); const __m512h vb = _mm512_castsi512_ph(_mm512_set1_epi16(*b)); - for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { __m512h va0 = _mm512_loadu_ph(a); - a += 32; + __m512h va1 = _mm512_loadu_ph(a + 32); + a += 64; __m512h vacc0 = _mm512_mul_ph(va0, vb); + __m512h vacc1 = 
_mm512_mul_ph(va1, vb); - vacc0 = _mm512_max_ph(voutput_min, vacc0); + _mm512_storeu_ph(o, vacc0); + _mm512_storeu_ph(o + 32, vacc1); + o += 64; + } + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + __m512h va = _mm512_loadu_ph(a); + a += 32; - vacc0 = _mm512_min_ph(voutput_max, vacc0); + __m512h vacc = _mm512_mul_ph(va, vb); - _mm512_storeu_ph(o, vacc0); + _mm512_storeu_ph(o, vacc); o += 32; } if XNN_UNLIKELY(batch != 0) { @@ -64,8 +70,6 @@ void xnn_f16_vmulc_minmax_ukernel__avx512fp16_u32( __m512h vacc = _mm512_maskz_mul_ph(vmask, va, vb); - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); } #endif // defined(__AVX512FP16__) diff --git a/src/f16-vbinary/gen/f16-vmulc-minmax-f16c-u16.c b/src/f16-vbinary/gen/f16-vmulc-f16c-u16.c similarity index 77% rename from src/f16-vbinary/gen/f16-vmulc-minmax-f16c-u16.c rename to src/f16-vbinary/gen/f16-vmulc-f16c-u16.c index 17aa8c05a6a..dfd08d9392e 100644 --- a/src/f16-vbinary/gen/f16-vmulc-minmax-f16c-u16.c +++ b/src/f16-vbinary/gen/f16-vmulc-f16c-u16.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vmulc_minmax_ukernel__f16c_u16( +void xnn_f16_vmulc_ukernel__f16c_u16( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,11 +33,6 @@ void xnn_f16_vmulc_minmax_ukernel__f16c_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - 
XNN_FORCE_REALIZATION(vy_min); - XNN_FORCE_REALIZATION(vy_max); - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -48,12 +43,6 @@ void xnn_f16_vmulc_minmax_ukernel__f16c_u16( __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va456789AB, vb), _MM_FROUND_TO_NEAREST_INT)); - vy01234567 = _mm256_max_ps(vy01234567, vy_min); - vy456789AB = _mm256_max_ps(vy456789AB, vy_min); - - vy01234567 = _mm256_min_ps(vy01234567, vy_max); - vy456789AB = _mm256_min_ps(vy456789AB, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy01234567, _MM_FROUND_TO_NEAREST_INT)); _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_TO_NEAREST_INT)); o += 16; @@ -64,9 +53,6 @@ void xnn_f16_vmulc_minmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -75,9 +61,6 @@ void xnn_f16_vmulc_minmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vmulc-minmax-f16c-u8.c b/src/f16-vbinary/gen/f16-vmulc-f16c-u8.c similarity index 77% rename from src/f16-vbinary/gen/f16-vmulc-minmax-f16c-u8.c rename to src/f16-vbinary/gen/f16-vmulc-f16c-u8.c index e0713f241f4..e76ce1a7b4d 100644 --- a/src/f16-vbinary/gen/f16-vmulc-minmax-f16c-u8.c +++ b/src/f16-vbinary/gen/f16-vmulc-f16c-u8.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vmulc_minmax_ukernel__f16c_u8( 
+void xnn_f16_vmulc_ukernel__f16c_u8( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,11 +33,6 @@ void xnn_f16_vmulc_minmax_ukernel__f16c_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vy_min); - XNN_FORCE_REALIZATION(vy_max); - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -45,9 +40,6 @@ void xnn_f16_vmulc_minmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -56,9 +48,6 @@ void xnn_f16_vmulc_minmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-u1.c b/src/f16-vbinary/gen/f16-vmulc-fp16arith-u1.c similarity index 74% rename from src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-u1.c rename to src/f16-vbinary/gen/f16-vmulc-fp16arith-u1.c index 89fcf879746..ba0ab957b80 100644 --- 
a/src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-u1.c +++ b/src/f16-vbinary/gen/f16-vmulc-fp16arith-u1.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. #include -#include #include @@ -18,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vmulc_minmax_ukernel__fp16arith_u1( +void xnn_f16_vmulc_ukernel__fp16arith_u1( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -35,16 +34,10 @@ void xnn_f16_vmulc_minmax_ukernel__fp16arith_u1( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - const float16_t vb = *b; do { float16_t vacc = *a++; vacc = vmulh_f16(vacc, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-u2.c b/src/f16-vbinary/gen/f16-vmulc-fp16arith-u2.c similarity index 70% rename from src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-u2.c rename to src/f16-vbinary/gen/f16-vmulc-fp16arith-u2.c index 5dfe5e35c6e..2b18e5befcf 100644 --- a/src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-u2.c +++ b/src/f16-vbinary/gen/f16-vmulc-fp16arith-u2.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. 
#include -#include #include @@ -18,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vmulc_minmax_ukernel__fp16arith_u2( +void xnn_f16_vmulc_ukernel__fp16arith_u2( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -35,10 +34,6 @@ void xnn_f16_vmulc_minmax_ukernel__fp16arith_u2( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - const float16_t vb = *b; for (; batch >= 2 * sizeof(float16_t); batch -= 2 * sizeof(float16_t)) { float16_t vacc0 = a[0]; @@ -49,12 +44,6 @@ void xnn_f16_vmulc_minmax_ukernel__fp16arith_u2( vacc1 = vmulh_f16(vacc1, vb); - vacc0 = vmaxnmh_f16(vacc0, vy_min); - vacc1 = vmaxnmh_f16(vacc1, vy_min); - - vacc0 = vminnmh_f16(vacc0, vy_max); - vacc1 = vminnmh_f16(vacc1, vy_max); - o[0] = vacc0; o[1] = vacc1; o += 2; @@ -62,8 +51,6 @@ void xnn_f16_vmulc_minmax_ukernel__fp16arith_u2( if XNN_UNLIKELY(batch != 0) { float16_t vacc = *a; vacc = vmulh_f16(vacc, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o = vacc; } } diff --git a/src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-u4.c b/src/f16-vbinary/gen/f16-vmulc-fp16arith-u4.c similarity index 68% rename from src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-u4.c rename to src/f16-vbinary/gen/f16-vmulc-fp16arith-u4.c index ec915a7be8b..7211a01dab0 100644 --- a/src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-u4.c +++ b/src/f16-vbinary/gen/f16-vmulc-fp16arith-u4.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. 
#include -#include #include @@ -18,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vmulc_minmax_ukernel__fp16arith_u4( +void xnn_f16_vmulc_ukernel__fp16arith_u4( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -35,10 +34,6 @@ void xnn_f16_vmulc_minmax_ukernel__fp16arith_u4( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - const float16_t vb = *b; for (; batch >= 4 * sizeof(float16_t); batch -= 4 * sizeof(float16_t)) { float16_t vacc0 = a[0]; @@ -53,16 +48,6 @@ void xnn_f16_vmulc_minmax_ukernel__fp16arith_u4( vacc3 = vmulh_f16(vacc3, vb); - vacc0 = vmaxnmh_f16(vacc0, vy_min); - vacc1 = vmaxnmh_f16(vacc1, vy_min); - vacc2 = vmaxnmh_f16(vacc2, vy_min); - vacc3 = vmaxnmh_f16(vacc3, vy_min); - - vacc0 = vminnmh_f16(vacc0, vy_max); - vacc1 = vminnmh_f16(vacc1, vy_max); - vacc2 = vminnmh_f16(vacc2, vy_max); - vacc3 = vminnmh_f16(vacc3, vy_max); - o[0] = vacc0; o[1] = vacc1; o[2] = vacc2; @@ -73,8 +58,6 @@ void xnn_f16_vmulc_minmax_ukernel__fp16arith_u4( do { float16_t vacc = *a++; vacc = vmulh_f16(vacc, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/gen/f16-vmulc-minmax-avx512fp16-u64.c b/src/f16-vbinary/gen/f16-vmulc-minmax-avx512fp16-u64.c deleted file mode 100644 index 0dd5a8b4c3c..00000000000 --- a/src/f16-vbinary/gen/f16-vmulc-minmax-avx512fp16-u64.c +++ /dev/null @@ -1,89 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f16-vbinary/vopc-avx512fp16.c.in -// Generator: tools/xngen -// -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vbinary.h" - - -void xnn_f16_vmulc_minmax_ukernel__avx512fp16_u64( - size_t batch, - const xnn_float16* restrict input_a, - const xnn_float16* restrict input_b, - xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(uint16_t) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - -#if defined(__AVX512FP16__) - const uint16_t* a = (const uint16_t*) input_a; - const uint16_t* b = (const uint16_t*) input_b; - uint16_t* o = (uint16_t*) output; - - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - const __m512h vb = _mm512_castsi512_ph(_mm512_set1_epi16(*b)); - - - for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { - __m512h va0 = _mm512_loadu_ph(a); - __m512h va1 = _mm512_loadu_ph(a + 32); - a += 64; - - __m512h vacc0 = _mm512_mul_ph(va0, vb); - __m512h vacc1 = _mm512_mul_ph(va1, vb); - - - vacc0 = _mm512_max_ph(voutput_min, vacc0); - vacc1 = _mm512_max_ph(voutput_min, vacc1); - - vacc0 = _mm512_min_ph(voutput_max, vacc0); - vacc1 = _mm512_min_ph(voutput_max, vacc1); - - _mm512_storeu_ph(o, vacc0); - _mm512_storeu_ph(o + 32, vacc1); - o += 64; - } - for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { - __m512h va = _mm512_loadu_ph(a); - a += 32; - - __m512h vacc = _mm512_mul_ph(va, vb); - - vacc = _mm512_max_ph(voutput_min, vacc); - vacc = _mm512_min_ph(voutput_max, vacc); - - 
_mm512_storeu_ph(o, vacc); - o += 32; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch >= 1 * sizeof(uint16_t)); - assert(batch <= 31 * sizeof(uint16_t)); - // Prepare mask for valid 16-bit elements (depends on batch). - batch >>= XNN_LOG2_SIZEOF_HALF; - const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - __m512h va = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, a)); - - __m512h vacc = _mm512_maskz_mul_ph(vmask, va, vb); - - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); - _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); - } -#endif // defined(__AVX512FP16__) -} diff --git a/src/f16-vbinary/gen/f16-vmulc-minmax-neonfp16arith-u16.c b/src/f16-vbinary/gen/f16-vmulc-neonfp16arith-u16.c similarity index 75% rename from src/f16-vbinary/gen/f16-vmulc-minmax-neonfp16arith-u16.c rename to src/f16-vbinary/gen/f16-vmulc-neonfp16arith-u16.c index ec298273828..9a9cc4008ad 100644 --- a/src/f16-vbinary/gen/f16-vmulc-minmax-neonfp16arith-u16.c +++ b/src/f16-vbinary/gen/f16-vmulc-neonfp16arith-u16.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vmulc_minmax_ukernel__neonfp16arith_u16( +void xnn_f16_vmulc_ukernel__neonfp16arith_u16( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -32,9 +32,6 @@ void xnn_f16_vmulc_minmax_ukernel__neonfp16arith_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - 
const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; @@ -44,12 +41,6 @@ void xnn_f16_vmulc_minmax_ukernel__neonfp16arith_u16( float16x8_t vy456789AB = vmulq_f16(va456789AB, vb); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy456789AB = vmaxq_f16(vy456789AB, vy_min); - - vy01234567 = vminq_f16(vy01234567, vy_max); - vy456789AB = vminq_f16(vy456789AB, vy_max); - vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; vst1q_u16(o, vreinterpretq_u16_f16(vy456789AB)); o += 8; } @@ -57,16 +48,12 @@ void xnn_f16_vmulc_minmax_ukernel__neonfp16arith_u16( const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; float16x8_t vy01234567 = vmulq_f16(va01234567, vb); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); float16x8_t vy01234567 = vmulq_f16(va01234567, vb); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/gen/f16-vmulc-minmax-neonfp16arith-u8.c b/src/f16-vbinary/gen/f16-vmulc-neonfp16arith-u8.c similarity index 76% rename from src/f16-vbinary/gen/f16-vmulc-minmax-neonfp16arith-u8.c rename to src/f16-vbinary/gen/f16-vmulc-neonfp16arith-u8.c index a877fa5958d..84f504da647 100644 --- a/src/f16-vbinary/gen/f16-vmulc-minmax-neonfp16arith-u8.c +++ b/src/f16-vbinary/gen/f16-vmulc-neonfp16arith-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vmulc_minmax_ukernel__neonfp16arith_u8( +void xnn_f16_vmulc_ukernel__neonfp16arith_u8( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* 
restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -32,24 +32,17 @@ void xnn_f16_vmulc_minmax_ukernel__neonfp16arith_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; float16x8_t vy01234567 = vmulq_f16(va01234567, vb); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); float16x8_t vy01234567 = vmulq_f16(va01234567, vb); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/gen/f16-vprelu-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vprelu-avx512fp16-u32.c index 6b99bb047a8..d86d2489cb4 100644 --- a/src/f16-vbinary/gen/f16-vprelu-avx512fp16-u32.c +++ b/src/f16-vbinary/gen/f16-vprelu-avx512fp16-u32.c @@ -33,7 +33,6 @@ void xnn_f16_vprelu_ukernel__avx512fp16_u32( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m512h vzero = _mm512_setzero_ph(); for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { @@ -44,7 +43,6 @@ void xnn_f16_vprelu_ukernel__avx512fp16_u32( __m512h vacc = _mm512_mask_mul_ph(va, vsign, va, 
_mm512_loadu_ph(b)); b += 32; - _mm512_storeu_ph(o, vacc); o += 32; } diff --git a/src/f16-vbinary/gen/f16-vprelu-avx512fp16-u64.c b/src/f16-vbinary/gen/f16-vprelu-avx512fp16-u64.c index a30f5adb427..31720e3bb67 100644 --- a/src/f16-vbinary/gen/f16-vprelu-avx512fp16-u64.c +++ b/src/f16-vbinary/gen/f16-vprelu-avx512fp16-u64.c @@ -33,7 +33,6 @@ void xnn_f16_vprelu_ukernel__avx512fp16_u64( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m512h vzero = _mm512_setzero_ph(); for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { @@ -47,7 +46,6 @@ void xnn_f16_vprelu_ukernel__avx512fp16_u64( __m512h vacc1 = _mm512_mask_mul_ph(va1, vsign1, va1, _mm512_loadu_ph(b + 32)); b += 64; - _mm512_storeu_ph(o, vacc0); _mm512_storeu_ph(o + 32, vacc1); o += 64; @@ -60,7 +58,6 @@ void xnn_f16_vprelu_ukernel__avx512fp16_u64( __m512h vacc = _mm512_mask_mul_ph(va, vsign, va, _mm512_loadu_ph(b)); b += 32; - _mm512_storeu_ph(o, vacc); o += 32; } diff --git a/src/f16-vbinary/gen/f16-vprelu-f16c-u16.c b/src/f16-vbinary/gen/f16-vprelu-f16c-u16.c index 21801568949..00d86d37e3a 100644 --- a/src/f16-vbinary/gen/f16-vprelu-f16c-u16.c +++ b/src/f16-vbinary/gen/f16-vprelu-f16c-u16.c @@ -33,7 +33,6 @@ void xnn_f16_vprelu_ukernel__f16c_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b)); @@ -48,7 +47,6 @@ void xnn_f16_vprelu_ukernel__f16c_u16( vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(va01234567, vy01234567, va01234567), _MM_FROUND_TO_NEAREST_INT)); vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(va456789AB, vy456789AB, va456789AB), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy01234567, 
_MM_FROUND_TO_NEAREST_INT)); _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_TO_NEAREST_INT)); o += 16; @@ -62,7 +60,6 @@ void xnn_f16_vprelu_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(va, vy, va), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -73,7 +70,6 @@ void xnn_f16_vprelu_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(va, vy, va), _MM_FROUND_TO_NEAREST_INT)); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vprelu-f16c-u8.c b/src/f16-vbinary/gen/f16-vprelu-f16c-u8.c index f8fd93a3db5..7d3ad813df4 100644 --- a/src/f16-vbinary/gen/f16-vprelu-f16c-u8.c +++ b/src/f16-vbinary/gen/f16-vprelu-f16c-u8.c @@ -33,7 +33,6 @@ void xnn_f16_vprelu_ukernel__f16c_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b)); @@ -43,7 +42,6 @@ void xnn_f16_vprelu_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(va, vy, va), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -54,7 +52,6 @@ void xnn_f16_vprelu_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(va, vy, 
va), _MM_FROUND_TO_NEAREST_INT)); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u16.c b/src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u16.c index ef2e0094f52..e23ffe8712d 100644 --- a/src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u16.c +++ b/src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u16.c @@ -32,7 +32,6 @@ void xnn_f16_vprelu_ukernel__neonfp16arith_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); b += 8; @@ -48,7 +47,6 @@ void xnn_f16_vprelu_ukernel__neonfp16arith_u16( vy01234567 = vbslq_f16(vm01234567, vy01234567, va01234567); vy456789AB = vbslq_f16(vm456789AB, vy456789AB, va456789AB); - vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; vst1q_u16(o, vreinterpretq_u16_f16(vy456789AB)); o += 8; } diff --git a/src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u8.c b/src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u8.c index b61456b1282..6368a224f28 100644 --- a/src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u8.c +++ b/src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u8.c @@ -32,7 +32,6 @@ void xnn_f16_vprelu_ukernel__neonfp16arith_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); b += 8; diff --git a/src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u32.c index fb8850444b7..fe1b57c2bbd 100644 --- a/src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u32.c +++ 
b/src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u32.c @@ -45,7 +45,6 @@ void xnn_f16_vpreluc_ukernel__avx512fp16_u32( const __mmask32 vsign0 = _mm512_cmp_ph_mask(va0, vzero, _CMP_LT_OQ); __m512h vacc0 = _mm512_mask_mul_ph(va0, vsign0, va0, vb); - _mm512_storeu_ph(o, vacc0); o += 32; } diff --git a/src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u64.c b/src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u64.c index 5d2487f9b6a..616f7cf0e39 100644 --- a/src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u64.c +++ b/src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u64.c @@ -48,7 +48,6 @@ void xnn_f16_vpreluc_ukernel__avx512fp16_u64( const __mmask32 vsign1 = _mm512_cmp_ph_mask(va1, vzero, _CMP_LT_OQ); __m512h vacc1 = _mm512_mask_mul_ph(va1, vsign1, va1, vb); - _mm512_storeu_ph(o, vacc0); _mm512_storeu_ph(o + 32, vacc1); o += 64; @@ -60,7 +59,6 @@ void xnn_f16_vpreluc_ukernel__avx512fp16_u64( const __mmask32 vsign = _mm512_cmp_ph_mask(va, vzero, _CMP_LT_OQ); __m512h vacc = _mm512_mask_mul_ph(va, vsign, va, vb); - _mm512_storeu_ph(o, vacc); o += 32; } diff --git a/src/f16-vbinary/gen/f16-vpreluc-f16c-u16.c b/src/f16-vbinary/gen/f16-vpreluc-f16c-u16.c index 00a84928d2a..533a92e058e 100644 --- a/src/f16-vbinary/gen/f16-vpreluc-f16c-u16.c +++ b/src/f16-vbinary/gen/f16-vpreluc-f16c-u16.c @@ -33,7 +33,6 @@ void xnn_f16_vpreluc_ukernel__f16c_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -46,7 +45,6 @@ void xnn_f16_vpreluc_ukernel__f16c_u16( vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(va01234567, vy01234567, va01234567), _MM_FROUND_TO_NEAREST_INT)); vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(va456789AB, vy456789AB, va456789AB), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, 
_mm256_cvtps_ph(vy01234567, _MM_FROUND_TO_NEAREST_INT)); _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_TO_NEAREST_INT)); o += 16; @@ -58,7 +56,6 @@ void xnn_f16_vpreluc_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(va, vy, va), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -68,7 +65,6 @@ void xnn_f16_vpreluc_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(va, vy, va), _MM_FROUND_TO_NEAREST_INT)); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vpreluc-f16c-u8.c b/src/f16-vbinary/gen/f16-vpreluc-f16c-u8.c index 49e1517289c..45a8a77a56f 100644 --- a/src/f16-vbinary/gen/f16-vpreluc-f16c-u8.c +++ b/src/f16-vbinary/gen/f16-vpreluc-f16c-u8.c @@ -33,7 +33,6 @@ void xnn_f16_vpreluc_ukernel__f16c_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -42,7 +41,6 @@ void xnn_f16_vpreluc_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(va, vy, va), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -52,7 +50,6 @@ void xnn_f16_vpreluc_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); vy = 
_mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(va, vy, va), _MM_FROUND_TO_NEAREST_INT)); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u16.c b/src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u16.c index b1234066724..7ddf499c146 100644 --- a/src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u16.c +++ b/src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u16.c @@ -32,7 +32,6 @@ void xnn_f16_vpreluc_ukernel__neonfp16arith_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; @@ -47,7 +46,6 @@ void xnn_f16_vpreluc_ukernel__neonfp16arith_u16( vy01234567 = vbslq_f16(vm01234567, vy01234567, va01234567); vy456789AB = vbslq_f16(vm456789AB, vy456789AB, va456789AB); - vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; vst1q_u16(o, vreinterpretq_u16_f16(vy456789AB)); o += 8; } diff --git a/src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u8.c b/src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u8.c index 3b6528e857f..8ea18714e4f 100644 --- a/src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u8.c +++ b/src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u8.c @@ -32,7 +32,6 @@ void xnn_f16_vpreluc_ukernel__neonfp16arith_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; diff --git a/src/f16-vbinary/gen/f16-vrdivc-minmax-aarch64-neonfp16arith-u16.c b/src/f16-vbinary/gen/f16-vrdivc-aarch64-neonfp16arith-u16.c similarity index 75% rename from 
src/f16-vbinary/gen/f16-vrdivc-minmax-aarch64-neonfp16arith-u16.c rename to src/f16-vbinary/gen/f16-vrdivc-aarch64-neonfp16arith-u16.c index 3c82dc5dff3..a66dc2845c6 100644 --- a/src/f16-vbinary/gen/f16-vrdivc-minmax-aarch64-neonfp16arith-u16.c +++ b/src/f16-vbinary/gen/f16-vrdivc-aarch64-neonfp16arith-u16.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vrdivc_minmax_ukernel__aarch64_neonfp16arith_u16( +void xnn_f16_vrdivc_ukernel__aarch64_neonfp16arith_u16( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -32,9 +32,6 @@ void xnn_f16_vrdivc_minmax_ukernel__aarch64_neonfp16arith_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; @@ -44,12 +41,6 @@ void xnn_f16_vrdivc_minmax_ukernel__aarch64_neonfp16arith_u16( float16x8_t vy456789AB = vdivq_f16(vb, va456789AB); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy456789AB = vmaxq_f16(vy456789AB, vy_min); - - vy01234567 = vminq_f16(vy01234567, vy_max); - vy456789AB = vminq_f16(vy456789AB, vy_max); - vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; vst1q_u16(o, vreinterpretq_u16_f16(vy456789AB)); o += 8; } @@ -57,16 +48,12 @@ void xnn_f16_vrdivc_minmax_ukernel__aarch64_neonfp16arith_u16( const float16x8_t va01234567 = 
vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; float16x8_t vy01234567 = vdivq_f16(vb, va01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); float16x8_t vy01234567 = vdivq_f16(vb, va01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/gen/f16-vrdivc-minmax-aarch64-neonfp16arith-u8.c b/src/f16-vbinary/gen/f16-vrdivc-aarch64-neonfp16arith-u8.c similarity index 76% rename from src/f16-vbinary/gen/f16-vrdivc-minmax-aarch64-neonfp16arith-u8.c rename to src/f16-vbinary/gen/f16-vrdivc-aarch64-neonfp16arith-u8.c index a496ed308ef..fa7ed76fabc 100644 --- a/src/f16-vbinary/gen/f16-vrdivc-minmax-aarch64-neonfp16arith-u8.c +++ b/src/f16-vbinary/gen/f16-vrdivc-aarch64-neonfp16arith-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vrdivc_minmax_ukernel__aarch64_neonfp16arith_u8( +void xnn_f16_vrdivc_ukernel__aarch64_neonfp16arith_u8( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -32,24 +32,17 @@ void xnn_f16_vrdivc_minmax_ukernel__aarch64_neonfp16arith_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - const float16x8_t vb = 
vreinterpretq_f16_u16(vld1q_dup_u16(b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; float16x8_t vy01234567 = vdivq_f16(vb, va01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); float16x8_t vy01234567 = vdivq_f16(vb, va01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u32.c new file mode 100644 index 00000000000..8cf4902eb9a --- /dev/null +++ b/src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u32.c @@ -0,0 +1,64 @@ +// Auto-generated file. Do not edit! +// Template: src/f16-vbinary/vopc-avx512fp16.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vbinary.h" + + +void xnn_f16_vrdivc_ukernel__avx512fp16_u32( + size_t batch, + const xnn_float16* restrict input_a, + const xnn_float16* restrict input_b, + xnn_float16* restrict output, + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint16_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + +#if defined(__AVX512FP16__) + const uint16_t* a = (const uint16_t*) input_a; + const uint16_t* b = (const uint16_t*) input_b; + uint16_t* o = (uint16_t*) output; + + const __m512h vb = _mm512_castsi512_ph(_mm512_set1_epi16(*b)); + + + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + __m512h va0 = _mm512_loadu_ph(a); + a += 32; + + __m512h vacc0 = _mm512_div_ph(vb, va0); + + + _mm512_storeu_ph(o, vacc0); + o += 32; + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint16_t)); + assert(batch <= 31 * sizeof(uint16_t)); + // Prepare mask for valid 16-bit elements (depends on batch). 
+ batch >>= XNN_LOG2_SIZEOF_HALF; + const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + __m512h va = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, a)); + + __m512h vacc = _mm512_maskz_div_ph(vmask, vb, va); + + _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); + } +#endif // defined(__AVX512FP16__) +} diff --git a/src/f16-vbinary/gen/f16-vrdivc-minmax-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u64.c similarity index 75% rename from src/f16-vbinary/gen/f16-vrdivc-minmax-avx512fp16-u32.c rename to src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u64.c index b0f02578ebf..df081a38ee3 100644 --- a/src/f16-vbinary/gen/f16-vrdivc-minmax-avx512fp16-u32.c +++ b/src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u64.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vrdivc_minmax_ukernel__avx512fp16_u32( +void xnn_f16_vrdivc_ukernel__avx512fp16_u64( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -34,23 +34,29 @@ void xnn_f16_vrdivc_minmax_ukernel__avx512fp16_u32( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); const __m512h vb = _mm512_castsi512_ph(_mm512_set1_epi16(*b)); - for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { __m512h va0 = _mm512_loadu_ph(a); - a += 32; + __m512h va1 = _mm512_loadu_ph(a + 32); + a += 64; __m512h vacc0 = _mm512_div_ph(vb, va0); + __m512h vacc1 = 
_mm512_div_ph(vb, va1); - vacc0 = _mm512_max_ph(voutput_min, vacc0); + _mm512_storeu_ph(o, vacc0); + _mm512_storeu_ph(o + 32, vacc1); + o += 64; + } + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + __m512h va = _mm512_loadu_ph(a); + a += 32; - vacc0 = _mm512_min_ph(voutput_max, vacc0); + __m512h vacc = _mm512_div_ph(vb, va); - _mm512_storeu_ph(o, vacc0); + _mm512_storeu_ph(o, vacc); o += 32; } if XNN_UNLIKELY(batch != 0) { @@ -64,8 +70,6 @@ void xnn_f16_vrdivc_minmax_ukernel__avx512fp16_u32( __m512h vacc = _mm512_maskz_div_ph(vmask, vb, va); - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); } #endif // defined(__AVX512FP16__) diff --git a/src/f16-vbinary/gen/f16-vrdivc-minmax-f16c-u16.c b/src/f16-vbinary/gen/f16-vrdivc-f16c-u16.c similarity index 77% rename from src/f16-vbinary/gen/f16-vrdivc-minmax-f16c-u16.c rename to src/f16-vbinary/gen/f16-vrdivc-f16c-u16.c index 6b7f3dfd5cf..5c082b27084 100644 --- a/src/f16-vbinary/gen/f16-vrdivc-minmax-f16c-u16.c +++ b/src/f16-vbinary/gen/f16-vrdivc-f16c-u16.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vrdivc_minmax_ukernel__f16c_u16( +void xnn_f16_vrdivc_ukernel__f16c_u16( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,11 +33,6 @@ void xnn_f16_vrdivc_minmax_ukernel__f16c_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - 
XNN_FORCE_REALIZATION(vy_min); - XNN_FORCE_REALIZATION(vy_max); - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -48,12 +43,6 @@ void xnn_f16_vrdivc_minmax_ukernel__f16c_u16( __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(vb, va456789AB), _MM_FROUND_TO_NEAREST_INT)); - vy01234567 = _mm256_max_ps(vy01234567, vy_min); - vy456789AB = _mm256_max_ps(vy456789AB, vy_min); - - vy01234567 = _mm256_min_ps(vy01234567, vy_max); - vy456789AB = _mm256_min_ps(vy456789AB, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy01234567, _MM_FROUND_TO_NEAREST_INT)); _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_TO_NEAREST_INT)); o += 16; @@ -64,9 +53,6 @@ void xnn_f16_vrdivc_minmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(vb, va), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -75,9 +61,6 @@ void xnn_f16_vrdivc_minmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(vb, va), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vrdivc-minmax-f16c-u8.c b/src/f16-vbinary/gen/f16-vrdivc-f16c-u8.c similarity index 77% rename from src/f16-vbinary/gen/f16-vrdivc-minmax-f16c-u8.c rename to src/f16-vbinary/gen/f16-vrdivc-f16c-u8.c index a83d9e3d727..4f8a4466e2a 100644 --- a/src/f16-vbinary/gen/f16-vrdivc-minmax-f16c-u8.c +++ b/src/f16-vbinary/gen/f16-vrdivc-f16c-u8.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void 
xnn_f16_vrdivc_minmax_ukernel__f16c_u8( +void xnn_f16_vrdivc_ukernel__f16c_u8( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,11 +33,6 @@ void xnn_f16_vrdivc_minmax_ukernel__f16c_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vy_min); - XNN_FORCE_REALIZATION(vy_max); - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -45,9 +40,6 @@ void xnn_f16_vrdivc_minmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(vb, va), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -56,9 +48,6 @@ void xnn_f16_vrdivc_minmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(vb, va), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-u1.c b/src/f16-vbinary/gen/f16-vrdivc-fp16arith-u1.c similarity index 74% rename from src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-u1.c rename to src/f16-vbinary/gen/f16-vrdivc-fp16arith-u1.c index 
b32d41ca3dc..7df22cdefa0 100644 --- a/src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-u1.c +++ b/src/f16-vbinary/gen/f16-vrdivc-fp16arith-u1.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. #include -#include #include @@ -18,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vrdivc_minmax_ukernel__fp16arith_u1( +void xnn_f16_vrdivc_ukernel__fp16arith_u1( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -35,16 +34,10 @@ void xnn_f16_vrdivc_minmax_ukernel__fp16arith_u1( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - const float16_t vb = *b; do { float16_t vacc = *a++; vacc = vdivh_f16(vb, vacc); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-u2.c b/src/f16-vbinary/gen/f16-vrdivc-fp16arith-u2.c similarity index 70% rename from src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-u2.c rename to src/f16-vbinary/gen/f16-vrdivc-fp16arith-u2.c index cc289a0be85..a2072d0eb21 100644 --- a/src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-u2.c +++ b/src/f16-vbinary/gen/f16-vrdivc-fp16arith-u2.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. 
#include -#include #include @@ -18,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vrdivc_minmax_ukernel__fp16arith_u2( +void xnn_f16_vrdivc_ukernel__fp16arith_u2( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -35,10 +34,6 @@ void xnn_f16_vrdivc_minmax_ukernel__fp16arith_u2( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - const float16_t vb = *b; for (; batch >= 2 * sizeof(float16_t); batch -= 2 * sizeof(float16_t)) { float16_t vacc0 = a[0]; @@ -49,12 +44,6 @@ void xnn_f16_vrdivc_minmax_ukernel__fp16arith_u2( vacc1 = vdivh_f16(vb, vacc1); - vacc0 = vmaxnmh_f16(vacc0, vy_min); - vacc1 = vmaxnmh_f16(vacc1, vy_min); - - vacc0 = vminnmh_f16(vacc0, vy_max); - vacc1 = vminnmh_f16(vacc1, vy_max); - o[0] = vacc0; o[1] = vacc1; o += 2; @@ -62,8 +51,6 @@ void xnn_f16_vrdivc_minmax_ukernel__fp16arith_u2( if XNN_UNLIKELY(batch != 0) { float16_t vacc = *a; vacc = vdivh_f16(vb, vacc); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o = vacc; } } diff --git a/src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-u4.c b/src/f16-vbinary/gen/f16-vrdivc-fp16arith-u4.c similarity index 68% rename from src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-u4.c rename to src/f16-vbinary/gen/f16-vrdivc-fp16arith-u4.c index 336b2eb4201..7d8046b7e0e 100644 --- a/src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-u4.c +++ b/src/f16-vbinary/gen/f16-vrdivc-fp16arith-u4.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. 
#include -#include #include @@ -18,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vrdivc_minmax_ukernel__fp16arith_u4( +void xnn_f16_vrdivc_ukernel__fp16arith_u4( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -35,10 +34,6 @@ void xnn_f16_vrdivc_minmax_ukernel__fp16arith_u4( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - const float16_t vb = *b; for (; batch >= 4 * sizeof(float16_t); batch -= 4 * sizeof(float16_t)) { float16_t vacc0 = a[0]; @@ -53,16 +48,6 @@ void xnn_f16_vrdivc_minmax_ukernel__fp16arith_u4( vacc3 = vdivh_f16(vb, vacc3); - vacc0 = vmaxnmh_f16(vacc0, vy_min); - vacc1 = vmaxnmh_f16(vacc1, vy_min); - vacc2 = vmaxnmh_f16(vacc2, vy_min); - vacc3 = vmaxnmh_f16(vacc3, vy_min); - - vacc0 = vminnmh_f16(vacc0, vy_max); - vacc1 = vminnmh_f16(vacc1, vy_max); - vacc2 = vminnmh_f16(vacc2, vy_max); - vacc3 = vminnmh_f16(vacc3, vy_max); - o[0] = vacc0; o[1] = vacc1; o[2] = vacc2; @@ -73,8 +58,6 @@ void xnn_f16_vrdivc_minmax_ukernel__fp16arith_u4( do { float16_t vacc = *a++; vacc = vdivh_f16(vb, vacc); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/gen/f16-vrdivc-minmax-avx512fp16-u64.c b/src/f16-vbinary/gen/f16-vrdivc-minmax-avx512fp16-u64.c deleted file mode 100644 index 85b70b4bc7b..00000000000 --- a/src/f16-vbinary/gen/f16-vrdivc-minmax-avx512fp16-u64.c +++ /dev/null @@ -1,89 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f16-vbinary/vopc-avx512fp16.c.in -// Generator: tools/xngen -// -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vbinary.h" - - -void xnn_f16_vrdivc_minmax_ukernel__avx512fp16_u64( - size_t batch, - const xnn_float16* restrict input_a, - const xnn_float16* restrict input_b, - xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(uint16_t) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - -#if defined(__AVX512FP16__) - const uint16_t* a = (const uint16_t*) input_a; - const uint16_t* b = (const uint16_t*) input_b; - uint16_t* o = (uint16_t*) output; - - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - const __m512h vb = _mm512_castsi512_ph(_mm512_set1_epi16(*b)); - - - for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { - __m512h va0 = _mm512_loadu_ph(a); - __m512h va1 = _mm512_loadu_ph(a + 32); - a += 64; - - __m512h vacc0 = _mm512_div_ph(vb, va0); - __m512h vacc1 = _mm512_div_ph(vb, va1); - - - vacc0 = _mm512_max_ph(voutput_min, vacc0); - vacc1 = _mm512_max_ph(voutput_min, vacc1); - - vacc0 = _mm512_min_ph(voutput_max, vacc0); - vacc1 = _mm512_min_ph(voutput_max, vacc1); - - _mm512_storeu_ph(o, vacc0); - _mm512_storeu_ph(o + 32, vacc1); - o += 64; - } - for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { - __m512h va = _mm512_loadu_ph(a); - a += 32; - - __m512h vacc = _mm512_div_ph(vb, va); - - vacc = _mm512_max_ph(voutput_min, vacc); - vacc = _mm512_min_ph(voutput_max, vacc); - - 
_mm512_storeu_ph(o, vacc); - o += 32; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch >= 1 * sizeof(uint16_t)); - assert(batch <= 31 * sizeof(uint16_t)); - // Prepare mask for valid 16-bit elements (depends on batch). - batch >>= XNN_LOG2_SIZEOF_HALF; - const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - __m512h va = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, a)); - - __m512h vacc = _mm512_maskz_div_ph(vmask, vb, va); - - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); - _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); - } -#endif // defined(__AVX512FP16__) -} diff --git a/src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u32.c index 82b2e2cbfbc..6fed63b0ea8 100644 --- a/src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u32.c +++ b/src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u32.c @@ -45,7 +45,6 @@ void xnn_f16_vrpreluc_ukernel__avx512fp16_u32( __m512h vacc0 = _mm512_mask_mul_ph(vb, vsign, va0, vb); - _mm512_storeu_ph(o, vacc0); o += 32; } diff --git a/src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u64.c b/src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u64.c index 616ec3373ae..919b5b28481 100644 --- a/src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u64.c +++ b/src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u64.c @@ -47,7 +47,6 @@ void xnn_f16_vrpreluc_ukernel__avx512fp16_u64( __m512h vacc0 = _mm512_mask_mul_ph(vb, vsign, va0, vb); __m512h vacc1 = _mm512_mask_mul_ph(vb, vsign, va1, vb); - _mm512_storeu_ph(o, vacc0); _mm512_storeu_ph(o + 32, vacc1); o += 64; @@ -58,7 +57,6 @@ void xnn_f16_vrpreluc_ukernel__avx512fp16_u64( __m512h vacc = _mm512_mask_mul_ph(vb, vsign, va, vb); - _mm512_storeu_ph(o, vacc); o += 32; } diff --git a/src/f16-vbinary/gen/f16-vrpreluc-f16c-u16.c b/src/f16-vbinary/gen/f16-vrpreluc-f16c-u16.c index a884d87e613..78b5fad7d2d 100644 --- a/src/f16-vbinary/gen/f16-vrpreluc-f16c-u16.c +++ 
b/src/f16-vbinary/gen/f16-vrpreluc-f16c-u16.c @@ -33,7 +33,6 @@ void xnn_f16_vrpreluc_ukernel__f16c_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -46,7 +45,6 @@ void xnn_f16_vrpreluc_ukernel__f16c_u16( vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(vb, vy01234567, vb), _MM_FROUND_TO_NEAREST_INT)); vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(vb, vy456789AB, vb), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy01234567, _MM_FROUND_TO_NEAREST_INT)); _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_TO_NEAREST_INT)); o += 16; @@ -58,7 +56,6 @@ void xnn_f16_vrpreluc_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(vb, vy, vb), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -68,7 +65,6 @@ void xnn_f16_vrpreluc_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(vb, vy, vb), _MM_FROUND_TO_NEAREST_INT)); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vrpreluc-f16c-u8.c b/src/f16-vbinary/gen/f16-vrpreluc-f16c-u8.c index 395308a79e0..27136b944e4 100644 --- a/src/f16-vbinary/gen/f16-vrpreluc-f16c-u8.c +++ b/src/f16-vbinary/gen/f16-vrpreluc-f16c-u8.c @@ -33,7 +33,6 @@ void xnn_f16_vrpreluc_ukernel__f16c_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const 
__m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -42,7 +41,6 @@ void xnn_f16_vrpreluc_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(vb, vy, vb), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -52,7 +50,6 @@ void xnn_f16_vrpreluc_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(vb, vy, vb), _MM_FROUND_TO_NEAREST_INT)); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u16.c b/src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u16.c index b512c4be5e2..b6db12ceeba 100644 --- a/src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u16.c +++ b/src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u16.c @@ -32,7 +32,6 @@ void xnn_f16_vrpreluc_ukernel__neonfp16arith_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); const uint16x8_t vm = vcltq_s16(vreinterpretq_s16_f16(vb), vmovq_n_s16(0)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { @@ -45,7 +44,6 @@ void xnn_f16_vrpreluc_ukernel__neonfp16arith_u16( vy01234567 = vbslq_f16(vm, vy01234567, vb); vy456789AB = vbslq_f16(vm, vy456789AB, vb); - vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; vst1q_u16(o, vreinterpretq_u16_f16(vy456789AB)); o += 8; } diff --git a/src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u8.c b/src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u8.c index 85c0bcb3ac6..f0a92c6f648 
100644 --- a/src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u8.c +++ b/src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u8.c @@ -32,7 +32,6 @@ void xnn_f16_vrpreluc_ukernel__neonfp16arith_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); const uint16x8_t vm = vcltq_s16(vreinterpretq_s16_f16(vb), vmovq_n_s16(0)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u32.c new file mode 100644 index 00000000000..3f3e487b9cc --- /dev/null +++ b/src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u32.c @@ -0,0 +1,64 @@ +// Auto-generated file. Do not edit! +// Template: src/f16-vbinary/vopc-avx512fp16.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vbinary.h" + + +void xnn_f16_vrsubc_ukernel__avx512fp16_u32( + size_t batch, + const xnn_float16* restrict input_a, + const xnn_float16* restrict input_b, + xnn_float16* restrict output, + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint16_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + +#if defined(__AVX512FP16__) + const uint16_t* a = (const uint16_t*) input_a; + const uint16_t* b = (const uint16_t*) input_b; + uint16_t* o = (uint16_t*) output; + + const __m512h vb = _mm512_castsi512_ph(_mm512_set1_epi16(*b)); + + + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + __m512h va0 = _mm512_loadu_ph(a); + a += 32; + + __m512h vacc0 = _mm512_sub_ph(vb, va0); + + + _mm512_storeu_ph(o, vacc0); + o += 32; + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint16_t)); + assert(batch <= 31 * sizeof(uint16_t)); + // Prepare mask for valid 16-bit elements (depends on batch). 
+ batch >>= XNN_LOG2_SIZEOF_HALF; + const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + __m512h va = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, a)); + + __m512h vacc = _mm512_maskz_sub_ph(vmask, vb, va); + + _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); + } +#endif // defined(__AVX512FP16__) +} diff --git a/src/f16-vbinary/gen/f16-vrsubc-minmax-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u64.c similarity index 75% rename from src/f16-vbinary/gen/f16-vrsubc-minmax-avx512fp16-u32.c rename to src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u64.c index 54ccea3f733..34dc85370e0 100644 --- a/src/f16-vbinary/gen/f16-vrsubc-minmax-avx512fp16-u32.c +++ b/src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u64.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vrsubc_minmax_ukernel__avx512fp16_u32( +void xnn_f16_vrsubc_ukernel__avx512fp16_u64( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -34,23 +34,29 @@ void xnn_f16_vrsubc_minmax_ukernel__avx512fp16_u32( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); const __m512h vb = _mm512_castsi512_ph(_mm512_set1_epi16(*b)); - for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { __m512h va0 = _mm512_loadu_ph(a); - a += 32; + __m512h va1 = _mm512_loadu_ph(a + 32); + a += 64; __m512h vacc0 = _mm512_sub_ph(vb, va0); + __m512h vacc1 = 
_mm512_sub_ph(vb, va1); - vacc0 = _mm512_max_ph(voutput_min, vacc0); + _mm512_storeu_ph(o, vacc0); + _mm512_storeu_ph(o + 32, vacc1); + o += 64; + } + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + __m512h va = _mm512_loadu_ph(a); + a += 32; - vacc0 = _mm512_min_ph(voutput_max, vacc0); + __m512h vacc = _mm512_sub_ph(vb, va); - _mm512_storeu_ph(o, vacc0); + _mm512_storeu_ph(o, vacc); o += 32; } if XNN_UNLIKELY(batch != 0) { @@ -64,8 +70,6 @@ void xnn_f16_vrsubc_minmax_ukernel__avx512fp16_u32( __m512h vacc = _mm512_maskz_sub_ph(vmask, vb, va); - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); } #endif // defined(__AVX512FP16__) diff --git a/src/f16-vbinary/gen/f16-vrsubc-minmax-f16c-u16.c b/src/f16-vbinary/gen/f16-vrsubc-f16c-u16.c similarity index 77% rename from src/f16-vbinary/gen/f16-vrsubc-minmax-f16c-u16.c rename to src/f16-vbinary/gen/f16-vrsubc-f16c-u16.c index 5048ffedbce..be3c9975bf8 100644 --- a/src/f16-vbinary/gen/f16-vrsubc-minmax-f16c-u16.c +++ b/src/f16-vbinary/gen/f16-vrsubc-f16c-u16.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vrsubc_minmax_ukernel__f16c_u16( +void xnn_f16_vrsubc_ukernel__f16c_u16( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,11 +33,6 @@ void xnn_f16_vrsubc_minmax_ukernel__f16c_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - 
XNN_FORCE_REALIZATION(vy_min); - XNN_FORCE_REALIZATION(vy_max); - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -48,12 +43,6 @@ void xnn_f16_vrsubc_minmax_ukernel__f16c_u16( __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vb, va456789AB), _MM_FROUND_TO_NEAREST_INT)); - vy01234567 = _mm256_max_ps(vy01234567, vy_min); - vy456789AB = _mm256_max_ps(vy456789AB, vy_min); - - vy01234567 = _mm256_min_ps(vy01234567, vy_max); - vy456789AB = _mm256_min_ps(vy456789AB, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy01234567, _MM_FROUND_TO_NEAREST_INT)); _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_TO_NEAREST_INT)); o += 16; @@ -64,9 +53,6 @@ void xnn_f16_vrsubc_minmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vb, va), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -75,9 +61,6 @@ void xnn_f16_vrsubc_minmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vb, va), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vrsubc-minmax-f16c-u8.c b/src/f16-vbinary/gen/f16-vrsubc-f16c-u8.c similarity index 77% rename from src/f16-vbinary/gen/f16-vrsubc-minmax-f16c-u8.c rename to src/f16-vbinary/gen/f16-vrsubc-f16c-u8.c index 4f647af2077..377239e1b83 100644 --- a/src/f16-vbinary/gen/f16-vrsubc-minmax-f16c-u8.c +++ b/src/f16-vbinary/gen/f16-vrsubc-f16c-u8.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void 
xnn_f16_vrsubc_minmax_ukernel__f16c_u8( +void xnn_f16_vrsubc_ukernel__f16c_u8( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,11 +33,6 @@ void xnn_f16_vrsubc_minmax_ukernel__f16c_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vy_min); - XNN_FORCE_REALIZATION(vy_max); - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -45,9 +40,6 @@ void xnn_f16_vrsubc_minmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vb, va), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -56,9 +48,6 @@ void xnn_f16_vrsubc_minmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vb, va), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-u1.c b/src/f16-vbinary/gen/f16-vrsubc-fp16arith-u1.c similarity index 74% rename from src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-u1.c rename to src/f16-vbinary/gen/f16-vrsubc-fp16arith-u1.c index 
d941bf61904..dec18fdaddc 100644 --- a/src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-u1.c +++ b/src/f16-vbinary/gen/f16-vrsubc-fp16arith-u1.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. #include -#include #include @@ -18,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vrsubc_minmax_ukernel__fp16arith_u1( +void xnn_f16_vrsubc_ukernel__fp16arith_u1( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -35,16 +34,10 @@ void xnn_f16_vrsubc_minmax_ukernel__fp16arith_u1( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - const float16_t vb = *b; do { float16_t vacc = *a++; vacc = vsubh_f16(vb, vacc); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-u2.c b/src/f16-vbinary/gen/f16-vrsubc-fp16arith-u2.c similarity index 70% rename from src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-u2.c rename to src/f16-vbinary/gen/f16-vrsubc-fp16arith-u2.c index f8d1e6810d7..c68b18845b8 100644 --- a/src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-u2.c +++ b/src/f16-vbinary/gen/f16-vrsubc-fp16arith-u2.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. 
#include -#include #include @@ -18,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vrsubc_minmax_ukernel__fp16arith_u2( +void xnn_f16_vrsubc_ukernel__fp16arith_u2( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -35,10 +34,6 @@ void xnn_f16_vrsubc_minmax_ukernel__fp16arith_u2( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - const float16_t vb = *b; for (; batch >= 2 * sizeof(float16_t); batch -= 2 * sizeof(float16_t)) { float16_t vacc0 = a[0]; @@ -49,12 +44,6 @@ void xnn_f16_vrsubc_minmax_ukernel__fp16arith_u2( vacc1 = vsubh_f16(vb, vacc1); - vacc0 = vmaxnmh_f16(vacc0, vy_min); - vacc1 = vmaxnmh_f16(vacc1, vy_min); - - vacc0 = vminnmh_f16(vacc0, vy_max); - vacc1 = vminnmh_f16(vacc1, vy_max); - o[0] = vacc0; o[1] = vacc1; o += 2; @@ -62,8 +51,6 @@ void xnn_f16_vrsubc_minmax_ukernel__fp16arith_u2( if XNN_UNLIKELY(batch != 0) { float16_t vacc = *a; vacc = vsubh_f16(vb, vacc); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o = vacc; } } diff --git a/src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-u4.c b/src/f16-vbinary/gen/f16-vrsubc-fp16arith-u4.c similarity index 68% rename from src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-u4.c rename to src/f16-vbinary/gen/f16-vrsubc-fp16arith-u4.c index e30fc2bfce2..69a34852a74 100644 --- a/src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-u4.c +++ b/src/f16-vbinary/gen/f16-vrsubc-fp16arith-u4.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. 
#include -#include #include @@ -18,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vrsubc_minmax_ukernel__fp16arith_u4( +void xnn_f16_vrsubc_ukernel__fp16arith_u4( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -35,10 +34,6 @@ void xnn_f16_vrsubc_minmax_ukernel__fp16arith_u4( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - const float16_t vb = *b; for (; batch >= 4 * sizeof(float16_t); batch -= 4 * sizeof(float16_t)) { float16_t vacc0 = a[0]; @@ -53,16 +48,6 @@ void xnn_f16_vrsubc_minmax_ukernel__fp16arith_u4( vacc3 = vsubh_f16(vb, vacc3); - vacc0 = vmaxnmh_f16(vacc0, vy_min); - vacc1 = vmaxnmh_f16(vacc1, vy_min); - vacc2 = vmaxnmh_f16(vacc2, vy_min); - vacc3 = vmaxnmh_f16(vacc3, vy_min); - - vacc0 = vminnmh_f16(vacc0, vy_max); - vacc1 = vminnmh_f16(vacc1, vy_max); - vacc2 = vminnmh_f16(vacc2, vy_max); - vacc3 = vminnmh_f16(vacc3, vy_max); - o[0] = vacc0; o[1] = vacc1; o[2] = vacc2; @@ -73,8 +58,6 @@ void xnn_f16_vrsubc_minmax_ukernel__fp16arith_u4( do { float16_t vacc = *a++; vacc = vsubh_f16(vb, vacc); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/gen/f16-vrsubc-minmax-avx512fp16-u64.c b/src/f16-vbinary/gen/f16-vrsubc-minmax-avx512fp16-u64.c deleted file mode 100644 index 13f0ff45d66..00000000000 --- a/src/f16-vbinary/gen/f16-vrsubc-minmax-avx512fp16-u64.c +++ /dev/null @@ -1,89 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f16-vbinary/vopc-avx512fp16.c.in -// Generator: tools/xngen -// -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vbinary.h" - - -void xnn_f16_vrsubc_minmax_ukernel__avx512fp16_u64( - size_t batch, - const xnn_float16* restrict input_a, - const xnn_float16* restrict input_b, - xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(uint16_t) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - -#if defined(__AVX512FP16__) - const uint16_t* a = (const uint16_t*) input_a; - const uint16_t* b = (const uint16_t*) input_b; - uint16_t* o = (uint16_t*) output; - - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - const __m512h vb = _mm512_castsi512_ph(_mm512_set1_epi16(*b)); - - - for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { - __m512h va0 = _mm512_loadu_ph(a); - __m512h va1 = _mm512_loadu_ph(a + 32); - a += 64; - - __m512h vacc0 = _mm512_sub_ph(vb, va0); - __m512h vacc1 = _mm512_sub_ph(vb, va1); - - - vacc0 = _mm512_max_ph(voutput_min, vacc0); - vacc1 = _mm512_max_ph(voutput_min, vacc1); - - vacc0 = _mm512_min_ph(voutput_max, vacc0); - vacc1 = _mm512_min_ph(voutput_max, vacc1); - - _mm512_storeu_ph(o, vacc0); - _mm512_storeu_ph(o + 32, vacc1); - o += 64; - } - for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { - __m512h va = _mm512_loadu_ph(a); - a += 32; - - __m512h vacc = _mm512_sub_ph(vb, va); - - vacc = _mm512_max_ph(voutput_min, vacc); - vacc = _mm512_min_ph(voutput_max, vacc); - - 
_mm512_storeu_ph(o, vacc); - o += 32; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch >= 1 * sizeof(uint16_t)); - assert(batch <= 31 * sizeof(uint16_t)); - // Prepare mask for valid 16-bit elements (depends on batch). - batch >>= XNN_LOG2_SIZEOF_HALF; - const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - __m512h va = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, a)); - - __m512h vacc = _mm512_maskz_sub_ph(vmask, vb, va); - - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); - _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); - } -#endif // defined(__AVX512FP16__) -} diff --git a/src/f16-vbinary/gen/f16-vrsubc-minmax-neonfp16arith-u16.c b/src/f16-vbinary/gen/f16-vrsubc-neonfp16arith-u16.c similarity index 75% rename from src/f16-vbinary/gen/f16-vrsubc-minmax-neonfp16arith-u16.c rename to src/f16-vbinary/gen/f16-vrsubc-neonfp16arith-u16.c index 6cc9aea3626..7a2e7e812a9 100644 --- a/src/f16-vbinary/gen/f16-vrsubc-minmax-neonfp16arith-u16.c +++ b/src/f16-vbinary/gen/f16-vrsubc-neonfp16arith-u16.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vrsubc_minmax_ukernel__neonfp16arith_u16( +void xnn_f16_vrsubc_ukernel__neonfp16arith_u16( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -32,9 +32,6 @@ void xnn_f16_vrsubc_minmax_ukernel__neonfp16arith_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) 
¶ms->scalar.max)); - const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; @@ -44,12 +41,6 @@ void xnn_f16_vrsubc_minmax_ukernel__neonfp16arith_u16( float16x8_t vy456789AB = vsubq_f16(vb, va456789AB); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy456789AB = vmaxq_f16(vy456789AB, vy_min); - - vy01234567 = vminq_f16(vy01234567, vy_max); - vy456789AB = vminq_f16(vy456789AB, vy_max); - vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; vst1q_u16(o, vreinterpretq_u16_f16(vy456789AB)); o += 8; } @@ -57,16 +48,12 @@ void xnn_f16_vrsubc_minmax_ukernel__neonfp16arith_u16( const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; float16x8_t vy01234567 = vsubq_f16(vb, va01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); float16x8_t vy01234567 = vsubq_f16(vb, va01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/gen/f16-vrsubc-minmax-neonfp16arith-u8.c b/src/f16-vbinary/gen/f16-vrsubc-neonfp16arith-u8.c similarity index 76% rename from src/f16-vbinary/gen/f16-vrsubc-minmax-neonfp16arith-u8.c rename to src/f16-vbinary/gen/f16-vrsubc-neonfp16arith-u8.c index 072d4dc1cd5..94de746dca7 100644 --- a/src/f16-vbinary/gen/f16-vrsubc-minmax-neonfp16arith-u8.c +++ b/src/f16-vbinary/gen/f16-vrsubc-neonfp16arith-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vrsubc_minmax_ukernel__neonfp16arith_u8( +void xnn_f16_vrsubc_ukernel__neonfp16arith_u8( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* 
restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -32,24 +32,17 @@ void xnn_f16_vrsubc_minmax_ukernel__neonfp16arith_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; float16x8_t vy01234567 = vsubq_f16(vb, va01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); float16x8_t vy01234567 = vsubq_f16(vb, va01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u32.c index 8080d42dff1..fe17be558ad 100644 --- a/src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u32.c +++ b/src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u32.c @@ -34,7 +34,6 @@ void xnn_f16_vsqrdiff_ukernel__avx512fp16_u32( uint16_t* o = (uint16_t*) output; - for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { const __m512h va = _mm512_loadu_ph(a); a += 32; @@ -44,7 +43,6 @@ void xnn_f16_vsqrdiff_ukernel__avx512fp16_u32( vacc = _mm512_mul_ph(vacc, vacc); - _mm512_storeu_ph(o, 
vacc); o += 32; } diff --git a/src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u64.c b/src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u64.c index 689af4e4fcf..fc25ab2bd20 100644 --- a/src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u64.c +++ b/src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u64.c @@ -34,7 +34,6 @@ void xnn_f16_vsqrdiff_ukernel__avx512fp16_u64( uint16_t* o = (uint16_t*) output; - for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { const __m512h va0 = _mm512_loadu_ph(a); const __m512h va1 = _mm512_loadu_ph(a + 32); @@ -47,7 +46,6 @@ void xnn_f16_vsqrdiff_ukernel__avx512fp16_u64( vacc0 = _mm512_mul_ph(vacc0, vacc0); vacc1 = _mm512_mul_ph(vacc1, vacc1); - _mm512_storeu_ph(o, vacc0); _mm512_storeu_ph(o + 32, vacc1); o += 64; @@ -61,7 +59,6 @@ void xnn_f16_vsqrdiff_ukernel__avx512fp16_u64( vacc = _mm512_mul_ph(vacc, vacc); - _mm512_storeu_ph(o, vacc); o += 32; } diff --git a/src/f16-vbinary/gen/f16-vsqrdiff-f16c-u16.c b/src/f16-vbinary/gen/f16-vsqrdiff-f16c-u16.c index 4eaf65c6f74..f0f800641c2 100644 --- a/src/f16-vbinary/gen/f16-vsqrdiff-f16c-u16.c +++ b/src/f16-vbinary/gen/f16-vsqrdiff-f16c-u16.c @@ -33,7 +33,6 @@ void xnn_f16_vsqrdiff_ukernel__f16c_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b)); @@ -48,7 +47,6 @@ void xnn_f16_vsqrdiff_ukernel__f16c_u16( vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy01234567, vy01234567), _MM_FROUND_TO_NEAREST_INT)); vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy456789AB, vy456789AB), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy01234567, _MM_FROUND_TO_NEAREST_INT)); _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_TO_NEAREST_INT)); o += 16; @@ -62,7 
+60,6 @@ void xnn_f16_vsqrdiff_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy, vy), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -73,7 +70,6 @@ void xnn_f16_vsqrdiff_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy, vy), _MM_FROUND_TO_NEAREST_INT)); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vsqrdiff-f16c-u8.c b/src/f16-vbinary/gen/f16-vsqrdiff-f16c-u8.c index f849e9c5871..7a8da225412 100644 --- a/src/f16-vbinary/gen/f16-vsqrdiff-f16c-u8.c +++ b/src/f16-vbinary/gen/f16-vsqrdiff-f16c-u8.c @@ -33,7 +33,6 @@ void xnn_f16_vsqrdiff_ukernel__f16c_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b)); @@ -43,7 +42,6 @@ void xnn_f16_vsqrdiff_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy, vy), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -54,7 +52,6 @@ void xnn_f16_vsqrdiff_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy, vy), _MM_FROUND_TO_NEAREST_INT)); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { 
_mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u1.c b/src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u1.c index c81a94c569c..b49561b05c9 100644 --- a/src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u1.c +++ b/src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u1.c @@ -33,7 +33,6 @@ void xnn_f16_vsqrdiff_ukernel__fp16arith_u1( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - do { const float16_t va = *a++; const float16_t vb = *b++; diff --git a/src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u2.c b/src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u2.c index 3dc81fabdd7..bd4ad5193e7 100644 --- a/src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u2.c +++ b/src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u2.c @@ -33,7 +33,6 @@ void xnn_f16_vsqrdiff_ukernel__fp16arith_u2( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - for (; batch >= 2 * sizeof(float16_t); batch -= 2 * sizeof(float16_t)) { const float16_t va0 = *a++; const float16_t va1 = *a++; @@ -47,7 +46,6 @@ void xnn_f16_vsqrdiff_ukernel__fp16arith_u2( vacc0 = vmulh_f16(vacc0, vacc0); vacc1 = vmulh_f16(vacc1, vacc1); - *o++ = vacc0; *o++ = vacc1; } diff --git a/src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u4.c b/src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u4.c index 4f9167e42aa..d509c8e8bd4 100644 --- a/src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u4.c +++ b/src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-u4.c @@ -33,7 +33,6 @@ void xnn_f16_vsqrdiff_ukernel__fp16arith_u4( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - for (; batch >= 4 * sizeof(float16_t); batch -= 4 * sizeof(float16_t)) { const float16_t va0 = *a++; const float16_t va1 = *a++; @@ -55,7 +54,6 @@ void xnn_f16_vsqrdiff_ukernel__fp16arith_u4( vacc2 = vmulh_f16(vacc2, vacc2); vacc3 = vmulh_f16(vacc3, vacc3); - *o++ = vacc0; *o++ = vacc1; *o++ = vacc2; diff --git a/src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u16.c 
b/src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u16.c index c4f3a25638b..0f6c911ed3d 100644 --- a/src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u16.c +++ b/src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u16.c @@ -32,7 +32,6 @@ void xnn_f16_vsqrdiff_ukernel__neonfp16arith_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); b += 8; @@ -45,7 +44,6 @@ void xnn_f16_vsqrdiff_ukernel__neonfp16arith_u16( vy01234567 = vmulq_f16(vy01234567, vy01234567); vy456789AB = vmulq_f16(vy456789AB, vy456789AB); - vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; vst1q_u16(o, vreinterpretq_u16_f16(vy456789AB)); o += 8; } diff --git a/src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u8.c b/src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u8.c index 556d603224d..88825c64afb 100644 --- a/src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u8.c +++ b/src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u8.c @@ -32,7 +32,6 @@ void xnn_f16_vsqrdiff_ukernel__neonfp16arith_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); b += 8; diff --git a/src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u32.c index 8c87e9d0f72..0a9abe9815a 100644 --- a/src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u32.c +++ b/src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u32.c @@ -45,7 +45,6 @@ void xnn_f16_vsqrdiffc_ukernel__avx512fp16_u32( vacc0 = _mm512_mul_ph(vacc0, vacc0); - _mm512_storeu_ph(o, vacc0); o += 32; } diff --git a/src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u64.c 
b/src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u64.c index c336e2c7211..81756348913 100644 --- a/src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u64.c +++ b/src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u64.c @@ -48,7 +48,6 @@ void xnn_f16_vsqrdiffc_ukernel__avx512fp16_u64( vacc0 = _mm512_mul_ph(vacc0, vacc0); vacc1 = _mm512_mul_ph(vacc1, vacc1); - _mm512_storeu_ph(o, vacc0); _mm512_storeu_ph(o + 32, vacc1); o += 64; @@ -60,7 +59,6 @@ void xnn_f16_vsqrdiffc_ukernel__avx512fp16_u64( __m512h vacc = _mm512_sub_ph(va, vb); vacc = _mm512_mul_ph(vacc, vacc); - _mm512_storeu_ph(o, vacc); o += 32; } diff --git a/src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u16.c b/src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u16.c index 24dc3852c1f..7fdfba0265e 100644 --- a/src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u16.c +++ b/src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u16.c @@ -33,7 +33,6 @@ void xnn_f16_vsqrdiffc_ukernel__f16c_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -46,7 +45,6 @@ void xnn_f16_vsqrdiffc_ukernel__f16c_u16( vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy01234567, vy01234567), _MM_FROUND_TO_NEAREST_INT)); vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy456789AB, vy456789AB), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy01234567, _MM_FROUND_TO_NEAREST_INT)); _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_TO_NEAREST_INT)); o += 16; @@ -58,7 +56,6 @@ void xnn_f16_vsqrdiffc_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy, vy), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, 
_MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -68,7 +65,6 @@ void xnn_f16_vsqrdiffc_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy, vy), _MM_FROUND_TO_NEAREST_INT)); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u8.c b/src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u8.c index 749f8fd3113..58378c45993 100644 --- a/src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u8.c +++ b/src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u8.c @@ -33,7 +33,6 @@ void xnn_f16_vsqrdiffc_ukernel__f16c_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -42,7 +41,6 @@ void xnn_f16_vsqrdiffc_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy, vy), _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -52,7 +50,6 @@ void xnn_f16_vsqrdiffc_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy, vy), _MM_FROUND_TO_NEAREST_INT)); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u1.c b/src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u1.c index bca6e19c39e..35e81c73f14 100644 --- a/src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u1.c +++ b/src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u1.c @@ -34,7 
+34,6 @@ void xnn_f16_vsqrdiffc_ukernel__fp16arith_u1( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - const float16_t vb = *b; do { float16_t vacc = *a++; diff --git a/src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u2.c b/src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u2.c index 5b4dad8a79a..bfafee982e1 100644 --- a/src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u2.c +++ b/src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u2.c @@ -34,7 +34,6 @@ void xnn_f16_vsqrdiffc_ukernel__fp16arith_u2( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - const float16_t vb = *b; for (; batch >= 2 * sizeof(float16_t); batch -= 2 * sizeof(float16_t)) { float16_t vacc0 = a[0]; @@ -47,7 +46,6 @@ void xnn_f16_vsqrdiffc_ukernel__fp16arith_u2( vacc0 = vmulh_f16(vacc0, vacc0); vacc1 = vmulh_f16(vacc1, vacc1); - o[0] = vacc0; o[1] = vacc1; o += 2; diff --git a/src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u4.c b/src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u4.c index 8ce21d07736..e4db2294b74 100644 --- a/src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u4.c +++ b/src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-u4.c @@ -34,7 +34,6 @@ void xnn_f16_vsqrdiffc_ukernel__fp16arith_u4( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - const float16_t vb = *b; for (; batch >= 4 * sizeof(float16_t); batch -= 4 * sizeof(float16_t)) { float16_t vacc0 = a[0]; @@ -53,7 +52,6 @@ void xnn_f16_vsqrdiffc_ukernel__fp16arith_u4( vacc2 = vmulh_f16(vacc2, vacc2); vacc3 = vmulh_f16(vacc3, vacc3); - o[0] = vacc0; o[1] = vacc1; o[2] = vacc2; diff --git a/src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u16.c b/src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u16.c index 4e4e44ecb1d..9c9e22a0ba1 100644 --- a/src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u16.c +++ b/src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u16.c @@ -32,7 +32,6 @@ void xnn_f16_vsqrdiffc_ukernel__neonfp16arith_u16( const uint16_t* b = (const uint16_t*) 
input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; @@ -44,7 +43,6 @@ void xnn_f16_vsqrdiffc_ukernel__neonfp16arith_u16( vy01234567 = vmulq_f16(vy01234567, vy01234567); vy456789AB = vmulq_f16(vy456789AB, vy456789AB); - vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; vst1q_u16(o, vreinterpretq_u16_f16(vy456789AB)); o += 8; } diff --git a/src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u8.c b/src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u8.c index cb9e36598ce..63fdedc0318 100644 --- a/src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u8.c +++ b/src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u8.c @@ -32,7 +32,6 @@ void xnn_f16_vsqrdiffc_ukernel__neonfp16arith_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; diff --git a/src/f16-vbinary/gen/f16-vsub-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vsub-avx512fp16-u32.c new file mode 100644 index 00000000000..74f33e51a4b --- /dev/null +++ b/src/f16-vbinary/gen/f16-vsub-avx512fp16-u32.c @@ -0,0 +1,63 @@ +// Auto-generated file. Do not edit! +// Template: src/f16-vbinary/vop-avx512fp16.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/vbinary.h" + + +void xnn_f16_vsub_ukernel__avx512fp16_u32( + size_t batch, + const xnn_float16* restrict input_a, + const xnn_float16* restrict input_b, + xnn_float16* restrict output, + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint16_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + +#if defined(__AVX512FP16__) + const uint16_t* a = (const uint16_t*) input_a; + const uint16_t* b = (const uint16_t*) input_b; + uint16_t* o = (uint16_t*) output; + + + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + const __m512h va = _mm512_loadu_ph(a); + a += 32; + + __m512h vacc = _mm512_sub_ph(va, _mm512_loadu_ph(b)); + b += 32; + + + _mm512_storeu_ph(o, vacc); + o += 32; + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint16_t)); + assert(batch <= 31 * sizeof(uint16_t)); + // Prepare mask for valid 16-bit elements (depends on batch). 
+ batch >>= XNN_LOG2_SIZEOF_HALF; + const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + const __m512h va = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, a)); + + __m512h vacc = _mm512_maskz_sub_ph(vmask, va, _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, b))); + + + _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); + } +#endif // defined(__AVX512FP16__) +} diff --git a/src/f16-vbinary/gen/f16-vsub-minmax-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vsub-avx512fp16-u64.c similarity index 75% rename from src/f16-vbinary/gen/f16-vsub-minmax-avx512fp16-u32.c rename to src/f16-vbinary/gen/f16-vsub-avx512fp16-u64.c index 534b3c6df6f..7f51a57f67f 100644 --- a/src/f16-vbinary/gen/f16-vsub-minmax-avx512fp16-u32.c +++ b/src/f16-vbinary/gen/f16-vsub-avx512fp16-u64.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vsub_minmax_ukernel__avx512fp16_u32( +void xnn_f16_vsub_ukernel__avx512fp16_u64( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,10 +33,21 @@ void xnn_f16_vsub_minmax_ukernel__avx512fp16_u32( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); + for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { + const __m512h va0 = _mm512_loadu_ph(a); + const __m512h va1 = _mm512_loadu_ph(a + 32); + a += 64; + __m512h vacc0 = _mm512_sub_ph(va0, _mm512_loadu_ph(b)); + __m512h vacc1 = _mm512_sub_ph(va1, _mm512_loadu_ph(b + 32)); + b += 64; + + + _mm512_storeu_ph(o, 
vacc0); + _mm512_storeu_ph(o + 32, vacc1); + o += 64; + } for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { const __m512h va = _mm512_loadu_ph(a); a += 32; @@ -45,9 +56,6 @@ void xnn_f16_vsub_minmax_ukernel__avx512fp16_u32( b += 32; - vacc = _mm512_max_ph(voutput_min, vacc); - vacc = _mm512_min_ph(voutput_max, vacc); - _mm512_storeu_ph(o, vacc); o += 32; } @@ -63,8 +71,6 @@ void xnn_f16_vsub_minmax_ukernel__avx512fp16_u32( __m512h vacc = _mm512_maskz_sub_ph(vmask, va, _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, b))); - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); } #endif // defined(__AVX512FP16__) diff --git a/src/f16-vbinary/gen/f16-vsub-minmax-f16c-u16.c b/src/f16-vbinary/gen/f16-vsub-f16c-u16.c similarity index 79% rename from src/f16-vbinary/gen/f16-vsub-minmax-f16c-u16.c rename to src/f16-vbinary/gen/f16-vsub-f16c-u16.c index c41ea07b52b..975df0d187c 100644 --- a/src/f16-vbinary/gen/f16-vsub-minmax-f16c-u16.c +++ b/src/f16-vbinary/gen/f16-vsub-f16c-u16.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vsub_minmax_ukernel__f16c_u16( +void xnn_f16_vsub_ukernel__f16c_u16( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,11 +33,6 @@ void xnn_f16_vsub_minmax_ukernel__f16c_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vy_min); - 
XNN_FORCE_REALIZATION(vy_max); - for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b)); @@ -50,12 +45,6 @@ void xnn_f16_vsub_minmax_ukernel__f16c_u16( __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va456789AB, vb456789AB), _MM_FROUND_TO_NEAREST_INT)); - vy01234567 = _mm256_max_ps(vy01234567, vy_min); - vy456789AB = _mm256_max_ps(vy456789AB, vy_min); - - vy01234567 = _mm256_min_ps(vy01234567, vy_max); - vy456789AB = _mm256_min_ps(vy456789AB, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy01234567, _MM_FROUND_TO_NEAREST_INT)); _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_TO_NEAREST_INT)); o += 16; @@ -68,9 +57,6 @@ void xnn_f16_vsub_minmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -80,9 +66,6 @@ void xnn_f16_vsub_minmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vsub-minmax-f16c-u8.c b/src/f16-vbinary/gen/f16-vsub-f16c-u8.c similarity index 78% rename from src/f16-vbinary/gen/f16-vsub-minmax-f16c-u8.c rename to src/f16-vbinary/gen/f16-vsub-f16c-u8.c index c0f0c473e91..ddf4d838d39 100644 --- a/src/f16-vbinary/gen/f16-vsub-minmax-f16c-u8.c +++ b/src/f16-vbinary/gen/f16-vsub-f16c-u8.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vsub_minmax_ukernel__f16c_u8( +void 
xnn_f16_vsub_ukernel__f16c_u8( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,11 +33,6 @@ void xnn_f16_vsub_minmax_ukernel__f16c_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vy_min); - XNN_FORCE_REALIZATION(vy_max); - for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b)); @@ -46,9 +41,6 @@ void xnn_f16_vsub_minmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -58,9 +50,6 @@ void xnn_f16_vsub_minmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-u1.c b/src/f16-vbinary/gen/f16-vsub-fp16arith-u1.c similarity index 74% rename from src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-u1.c rename to src/f16-vbinary/gen/f16-vsub-fp16arith-u1.c index 4cb61d3fe8f..14bba24249e 100644 --- 
a/src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-u1.c +++ b/src/f16-vbinary/gen/f16-vsub-fp16arith-u1.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. #include -#include #include @@ -17,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vsub_minmax_ukernel__fp16arith_u1( +void xnn_f16_vsub_ukernel__fp16arith_u1( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -34,16 +33,10 @@ void xnn_f16_vsub_minmax_ukernel__fp16arith_u1( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - do { const float16_t va = *a++; const float16_t vb = *b++; float16_t vacc = vsubh_f16(va, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-u2.c b/src/f16-vbinary/gen/f16-vsub-fp16arith-u2.c similarity index 71% rename from src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-u2.c rename to src/f16-vbinary/gen/f16-vsub-fp16arith-u2.c index a84539fac40..4f85109cc4b 100644 --- a/src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-u2.c +++ b/src/f16-vbinary/gen/f16-vsub-fp16arith-u2.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. 
#include -#include #include @@ -17,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vsub_minmax_ukernel__fp16arith_u2( +void xnn_f16_vsub_ukernel__fp16arith_u2( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -34,10 +33,6 @@ void xnn_f16_vsub_minmax_ukernel__fp16arith_u2( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - for (; batch >= 2 * sizeof(float16_t); batch -= 2 * sizeof(float16_t)) { const float16_t va0 = *a++; const float16_t va1 = *a++; @@ -49,12 +44,6 @@ void xnn_f16_vsub_minmax_ukernel__fp16arith_u2( float16_t vacc1 = vsubh_f16(va1, vb1); - vacc0 = vmaxnmh_f16(vacc0, vy_min); - vacc1 = vmaxnmh_f16(vacc1, vy_min); - - vacc0 = vminnmh_f16(vacc0, vy_max); - vacc1 = vminnmh_f16(vacc1, vy_max); - *o++ = vacc0; *o++ = vacc1; } @@ -62,8 +51,6 @@ void xnn_f16_vsub_minmax_ukernel__fp16arith_u2( const float16_t va = *a; const float16_t vb = *b; float16_t vacc = vsubh_f16(va, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o = vacc; } } diff --git a/src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-u4.c b/src/f16-vbinary/gen/f16-vsub-fp16arith-u4.c similarity index 70% rename from src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-u4.c rename to src/f16-vbinary/gen/f16-vsub-fp16arith-u4.c index 87e447bf225..7bff1507c0f 100644 --- a/src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-u4.c +++ b/src/f16-vbinary/gen/f16-vsub-fp16arith-u4.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. 
#include -#include #include @@ -17,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vsub_minmax_ukernel__fp16arith_u4( +void xnn_f16_vsub_ukernel__fp16arith_u4( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -34,10 +33,6 @@ void xnn_f16_vsub_minmax_ukernel__fp16arith_u4( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - for (; batch >= 4 * sizeof(float16_t); batch -= 4 * sizeof(float16_t)) { const float16_t va0 = *a++; const float16_t va1 = *a++; @@ -55,16 +50,6 @@ void xnn_f16_vsub_minmax_ukernel__fp16arith_u4( float16_t vacc3 = vsubh_f16(va3, vb3); - vacc0 = vmaxnmh_f16(vacc0, vy_min); - vacc1 = vmaxnmh_f16(vacc1, vy_min); - vacc2 = vmaxnmh_f16(vacc2, vy_min); - vacc3 = vmaxnmh_f16(vacc3, vy_min); - - vacc0 = vminnmh_f16(vacc0, vy_max); - vacc1 = vminnmh_f16(vacc1, vy_max); - vacc2 = vminnmh_f16(vacc2, vy_max); - vacc3 = vminnmh_f16(vacc3, vy_max); - *o++ = vacc0; *o++ = vacc1; *o++ = vacc2; @@ -75,8 +60,6 @@ void xnn_f16_vsub_minmax_ukernel__fp16arith_u4( const float16_t va = *a++; const float16_t vb = *b++; float16_t vacc = vsubh_f16(va, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/gen/f16-vsub-minmax-avx512fp16-u64.c b/src/f16-vbinary/gen/f16-vsub-minmax-avx512fp16-u64.c deleted file mode 100644 index 45ee39e58da..00000000000 --- a/src/f16-vbinary/gen/f16-vsub-minmax-avx512fp16-u64.c +++ /dev/null @@ -1,91 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f16-vbinary/vop-avx512fp16.c.in -// Generator: tools/xngen -// -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f16_vsub_minmax_ukernel__avx512fp16_u64( - size_t batch, - const xnn_float16* restrict input_a, - const xnn_float16* restrict input_b, - xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(uint16_t) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - -#if defined(__AVX512FP16__) - const uint16_t* a = (const uint16_t*) input_a; - const uint16_t* b = (const uint16_t*) input_b; - uint16_t* o = (uint16_t*) output; - - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - - - for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { - const __m512h va0 = _mm512_loadu_ph(a); - const __m512h va1 = _mm512_loadu_ph(a + 32); - a += 64; - - __m512h vacc0 = _mm512_sub_ph(va0, _mm512_loadu_ph(b)); - __m512h vacc1 = _mm512_sub_ph(va1, _mm512_loadu_ph(b + 32)); - b += 64; - - - vacc0 = _mm512_max_ph(voutput_min, vacc0); - vacc1 = _mm512_max_ph(voutput_min, vacc1); - - vacc0 = _mm512_min_ph(voutput_max, vacc0); - vacc1 = _mm512_min_ph(voutput_max, vacc1); - - _mm512_storeu_ph(o, vacc0); - _mm512_storeu_ph(o + 32, vacc1); - o += 64; - } - for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { - const __m512h va = _mm512_loadu_ph(a); - a += 32; - - __m512h vacc = _mm512_sub_ph(va, _mm512_loadu_ph(b)); - b += 32; - - - vacc = _mm512_max_ph(voutput_min, vacc); - vacc = _mm512_min_ph(voutput_max, vacc); - - 
_mm512_storeu_ph(o, vacc); - o += 32; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch >= 1 * sizeof(uint16_t)); - assert(batch <= 31 * sizeof(uint16_t)); - // Prepare mask for valid 16-bit elements (depends on batch). - batch >>= XNN_LOG2_SIZEOF_HALF; - const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - const __m512h va = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, a)); - - __m512h vacc = _mm512_maskz_sub_ph(vmask, va, _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, b))); - - - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); - _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); - } -#endif // defined(__AVX512FP16__) -} diff --git a/src/f16-vbinary/gen/f16-vsub-minmax-neonfp16arith-u16.c b/src/f16-vbinary/gen/f16-vsub-neonfp16arith-u16.c similarity index 77% rename from src/f16-vbinary/gen/f16-vsub-minmax-neonfp16arith-u16.c rename to src/f16-vbinary/gen/f16-vsub-neonfp16arith-u16.c index f2ce72c9e66..842edc2b105 100644 --- a/src/f16-vbinary/gen/f16-vsub-minmax-neonfp16arith-u16.c +++ b/src/f16-vbinary/gen/f16-vsub-neonfp16arith-u16.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vsub_minmax_ukernel__neonfp16arith_u16( +void xnn_f16_vsub_ukernel__neonfp16arith_u16( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -32,9 +32,6 @@ void xnn_f16_vsub_minmax_ukernel__neonfp16arith_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = 
vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); b += 8; @@ -45,12 +42,6 @@ void xnn_f16_vsub_minmax_ukernel__neonfp16arith_u16( float16x8_t vy456789AB = vsubq_f16(va456789AB, vb456789AB); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy456789AB = vmaxq_f16(vy456789AB, vy_min); - - vy01234567 = vminq_f16(vy01234567, vy_max); - vy456789AB = vminq_f16(vy456789AB, vy_max); - vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; vst1q_u16(o, vreinterpretq_u16_f16(vy456789AB)); o += 8; } @@ -59,8 +50,6 @@ void xnn_f16_vsub_minmax_ukernel__neonfp16arith_u16( const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); b += 8; float16x8_t vy01234567 = vsubq_f16(va01234567, vb01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { @@ -68,8 +57,6 @@ void xnn_f16_vsub_minmax_ukernel__neonfp16arith_u16( const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); float16x8_t vy01234567 = vsubq_f16(va01234567, vb01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/gen/f16-vsub-minmax-neonfp16arith-u8.c b/src/f16-vbinary/gen/f16-vsub-neonfp16arith-u8.c similarity index 77% rename from src/f16-vbinary/gen/f16-vsub-minmax-neonfp16arith-u8.c rename to src/f16-vbinary/gen/f16-vsub-neonfp16arith-u8.c index 42a89d03c77..e4dc684177b 100644 --- a/src/f16-vbinary/gen/f16-vsub-minmax-neonfp16arith-u8.c +++ b/src/f16-vbinary/gen/f16-vsub-neonfp16arith-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void 
xnn_f16_vsub_minmax_ukernel__neonfp16arith_u8( +void xnn_f16_vsub_ukernel__neonfp16arith_u8( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -32,16 +32,11 @@ void xnn_f16_vsub_minmax_ukernel__neonfp16arith_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); b += 8; float16x8_t vy01234567 = vsubq_f16(va01234567, vb01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { @@ -49,8 +44,6 @@ void xnn_f16_vsub_minmax_ukernel__neonfp16arith_u8( const float16x8_t vb01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); float16x8_t vy01234567 = vsubq_f16(va01234567, vb01234567); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/gen/f16-vsubc-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vsubc-avx512fp16-u32.c new file mode 100644 index 00000000000..6f4505b663c --- /dev/null +++ b/src/f16-vbinary/gen/f16-vsubc-avx512fp16-u32.c @@ -0,0 +1,64 @@ +// Auto-generated file. Do not edit! 
+// Template: src/f16-vbinary/vopc-avx512fp16.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vbinary.h" + + +void xnn_f16_vsubc_ukernel__avx512fp16_u32( + size_t batch, + const xnn_float16* restrict input_a, + const xnn_float16* restrict input_b, + xnn_float16* restrict output, + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint16_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + +#if defined(__AVX512FP16__) + const uint16_t* a = (const uint16_t*) input_a; + const uint16_t* b = (const uint16_t*) input_b; + uint16_t* o = (uint16_t*) output; + + const __m512h vb = _mm512_castsi512_ph(_mm512_set1_epi16(*b)); + + + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + __m512h va0 = _mm512_loadu_ph(a); + a += 32; + + __m512h vacc0 = _mm512_sub_ph(va0, vb); + + + _mm512_storeu_ph(o, vacc0); + o += 32; + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint16_t)); + assert(batch <= 31 * sizeof(uint16_t)); + // Prepare mask for valid 16-bit elements (depends on batch). 
+ batch >>= XNN_LOG2_SIZEOF_HALF; + const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + __m512h va = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, a)); + + __m512h vacc = _mm512_maskz_sub_ph(vmask, va, vb); + + _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); + } +#endif // defined(__AVX512FP16__) +} diff --git a/src/f16-vbinary/gen/f16-vsubc-minmax-avx512fp16-u32.c b/src/f16-vbinary/gen/f16-vsubc-avx512fp16-u64.c similarity index 75% rename from src/f16-vbinary/gen/f16-vsubc-minmax-avx512fp16-u32.c rename to src/f16-vbinary/gen/f16-vsubc-avx512fp16-u64.c index 516ad7d1193..05744c14753 100644 --- a/src/f16-vbinary/gen/f16-vsubc-minmax-avx512fp16-u32.c +++ b/src/f16-vbinary/gen/f16-vsubc-avx512fp16-u64.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vsubc_minmax_ukernel__avx512fp16_u32( +void xnn_f16_vsubc_ukernel__avx512fp16_u64( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -34,23 +34,29 @@ void xnn_f16_vsubc_minmax_ukernel__avx512fp16_u32( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); const __m512h vb = _mm512_castsi512_ph(_mm512_set1_epi16(*b)); - for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { __m512h va0 = _mm512_loadu_ph(a); - a += 32; + __m512h va1 = _mm512_loadu_ph(a + 32); + a += 64; __m512h vacc0 = _mm512_sub_ph(va0, vb); + __m512h vacc1 = 
_mm512_sub_ph(va1, vb); - vacc0 = _mm512_max_ph(voutput_min, vacc0); + _mm512_storeu_ph(o, vacc0); + _mm512_storeu_ph(o + 32, vacc1); + o += 64; + } + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + __m512h va = _mm512_loadu_ph(a); + a += 32; - vacc0 = _mm512_min_ph(voutput_max, vacc0); + __m512h vacc = _mm512_sub_ph(va, vb); - _mm512_storeu_ph(o, vacc0); + _mm512_storeu_ph(o, vacc); o += 32; } if XNN_UNLIKELY(batch != 0) { @@ -64,8 +70,6 @@ void xnn_f16_vsubc_minmax_ukernel__avx512fp16_u32( __m512h vacc = _mm512_maskz_sub_ph(vmask, va, vb); - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); } #endif // defined(__AVX512FP16__) diff --git a/src/f16-vbinary/gen/f16-vsubc-minmax-f16c-u16.c b/src/f16-vbinary/gen/f16-vsubc-f16c-u16.c similarity index 77% rename from src/f16-vbinary/gen/f16-vsubc-minmax-f16c-u16.c rename to src/f16-vbinary/gen/f16-vsubc-f16c-u16.c index 0cf462af396..1aaebc1314e 100644 --- a/src/f16-vbinary/gen/f16-vsubc-minmax-f16c-u16.c +++ b/src/f16-vbinary/gen/f16-vsubc-f16c-u16.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vsubc_minmax_ukernel__f16c_u16( +void xnn_f16_vsubc_ukernel__f16c_u16( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,11 +33,6 @@ void xnn_f16_vsubc_minmax_ukernel__f16c_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - 
XNN_FORCE_REALIZATION(vy_min); - XNN_FORCE_REALIZATION(vy_max); - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -48,12 +43,6 @@ void xnn_f16_vsubc_minmax_ukernel__f16c_u16( __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va456789AB, vb), _MM_FROUND_TO_NEAREST_INT)); - vy01234567 = _mm256_max_ps(vy01234567, vy_min); - vy456789AB = _mm256_max_ps(vy456789AB, vy_min); - - vy01234567 = _mm256_min_ps(vy01234567, vy_max); - vy456789AB = _mm256_min_ps(vy456789AB, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy01234567, _MM_FROUND_TO_NEAREST_INT)); _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_TO_NEAREST_INT)); o += 16; @@ -64,9 +53,6 @@ void xnn_f16_vsubc_minmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -75,9 +61,6 @@ void xnn_f16_vsubc_minmax_ukernel__f16c_u16( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vsubc-minmax-f16c-u8.c b/src/f16-vbinary/gen/f16-vsubc-f16c-u8.c similarity index 77% rename from src/f16-vbinary/gen/f16-vsubc-minmax-f16c-u8.c rename to src/f16-vbinary/gen/f16-vsubc-f16c-u8.c index 2f2382c5578..a27eda2771c 100644 --- a/src/f16-vbinary/gen/f16-vsubc-minmax-f16c-u8.c +++ b/src/f16-vbinary/gen/f16-vsubc-f16c-u8.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vsubc_minmax_ukernel__f16c_u8( 
+void xnn_f16_vsubc_ukernel__f16c_u8( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -33,11 +33,6 @@ void xnn_f16_vsubc_minmax_ukernel__f16c_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vy_min); - XNN_FORCE_REALIZATION(vy_max); - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -45,9 +40,6 @@ void xnn_f16_vsubc_minmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -56,9 +48,6 @@ void xnn_f16_vsubc_minmax_ukernel__f16c_u8( __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_TO_NEAREST_INT)); - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-u1.c b/src/f16-vbinary/gen/f16-vsubc-fp16arith-u1.c similarity index 74% rename from src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-u1.c rename to src/f16-vbinary/gen/f16-vsubc-fp16arith-u1.c index db26819e3d2..00335b55cc0 100644 --- 
a/src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-u1.c +++ b/src/f16-vbinary/gen/f16-vsubc-fp16arith-u1.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. #include -#include #include @@ -18,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vsubc_minmax_ukernel__fp16arith_u1( +void xnn_f16_vsubc_ukernel__fp16arith_u1( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -35,16 +34,10 @@ void xnn_f16_vsubc_minmax_ukernel__fp16arith_u1( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - const float16_t vb = *b; do { float16_t vacc = *a++; vacc = vsubh_f16(vacc, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-u2.c b/src/f16-vbinary/gen/f16-vsubc-fp16arith-u2.c similarity index 70% rename from src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-u2.c rename to src/f16-vbinary/gen/f16-vsubc-fp16arith-u2.c index c579fab1171..b5878d9b1f0 100644 --- a/src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-u2.c +++ b/src/f16-vbinary/gen/f16-vsubc-fp16arith-u2.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. 
#include -#include #include @@ -18,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vsubc_minmax_ukernel__fp16arith_u2( +void xnn_f16_vsubc_ukernel__fp16arith_u2( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -35,10 +34,6 @@ void xnn_f16_vsubc_minmax_ukernel__fp16arith_u2( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - const float16_t vb = *b; for (; batch >= 2 * sizeof(float16_t); batch -= 2 * sizeof(float16_t)) { float16_t vacc0 = a[0]; @@ -49,12 +44,6 @@ void xnn_f16_vsubc_minmax_ukernel__fp16arith_u2( vacc1 = vsubh_f16(vacc1, vb); - vacc0 = vmaxnmh_f16(vacc0, vy_min); - vacc1 = vmaxnmh_f16(vacc1, vy_min); - - vacc0 = vminnmh_f16(vacc0, vy_max); - vacc1 = vminnmh_f16(vacc1, vy_max); - o[0] = vacc0; o[1] = vacc1; o += 2; @@ -62,8 +51,6 @@ void xnn_f16_vsubc_minmax_ukernel__fp16arith_u2( if XNN_UNLIKELY(batch != 0) { float16_t vacc = *a; vacc = vsubh_f16(vacc, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o = vacc; } } diff --git a/src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-u4.c b/src/f16-vbinary/gen/f16-vsubc-fp16arith-u4.c similarity index 68% rename from src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-u4.c rename to src/f16-vbinary/gen/f16-vsubc-fp16arith-u4.c index 8c6553402e6..4c76e348e3b 100644 --- a/src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-u4.c +++ b/src/f16-vbinary/gen/f16-vsubc-fp16arith-u4.c @@ -8,7 +8,6 @@ // LICENSE file in the root directory of this source tree. 
#include -#include #include @@ -18,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vsubc_minmax_ukernel__fp16arith_u4( +void xnn_f16_vsubc_ukernel__fp16arith_u4( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -35,10 +34,6 @@ void xnn_f16_vsubc_minmax_ukernel__fp16arith_u4( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - const float16_t vb = *b; for (; batch >= 4 * sizeof(float16_t); batch -= 4 * sizeof(float16_t)) { float16_t vacc0 = a[0]; @@ -53,16 +48,6 @@ void xnn_f16_vsubc_minmax_ukernel__fp16arith_u4( vacc3 = vsubh_f16(vacc3, vb); - vacc0 = vmaxnmh_f16(vacc0, vy_min); - vacc1 = vmaxnmh_f16(vacc1, vy_min); - vacc2 = vmaxnmh_f16(vacc2, vy_min); - vacc3 = vmaxnmh_f16(vacc3, vy_min); - - vacc0 = vminnmh_f16(vacc0, vy_max); - vacc1 = vminnmh_f16(vacc1, vy_max); - vacc2 = vminnmh_f16(vacc2, vy_max); - vacc3 = vminnmh_f16(vacc3, vy_max); - o[0] = vacc0; o[1] = vacc1; o[2] = vacc2; @@ -73,8 +58,6 @@ void xnn_f16_vsubc_minmax_ukernel__fp16arith_u4( do { float16_t vacc = *a++; vacc = vsubh_f16(vacc, vb); - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/gen/f16-vsubc-minmax-avx512fp16-u64.c b/src/f16-vbinary/gen/f16-vsubc-minmax-avx512fp16-u64.c deleted file mode 100644 index 456cc2f88d7..00000000000 --- a/src/f16-vbinary/gen/f16-vsubc-minmax-avx512fp16-u64.c +++ /dev/null @@ -1,89 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f16-vbinary/vopc-avx512fp16.c.in -// Generator: tools/xngen -// -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vbinary.h" - - -void xnn_f16_vsubc_minmax_ukernel__avx512fp16_u64( - size_t batch, - const xnn_float16* restrict input_a, - const xnn_float16* restrict input_b, - xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(uint16_t) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - -#if defined(__AVX512FP16__) - const uint16_t* a = (const uint16_t*) input_a; - const uint16_t* b = (const uint16_t*) input_b; - uint16_t* o = (uint16_t*) output; - - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - const __m512h vb = _mm512_castsi512_ph(_mm512_set1_epi16(*b)); - - - for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { - __m512h va0 = _mm512_loadu_ph(a); - __m512h va1 = _mm512_loadu_ph(a + 32); - a += 64; - - __m512h vacc0 = _mm512_sub_ph(va0, vb); - __m512h vacc1 = _mm512_sub_ph(va1, vb); - - - vacc0 = _mm512_max_ph(voutput_min, vacc0); - vacc1 = _mm512_max_ph(voutput_min, vacc1); - - vacc0 = _mm512_min_ph(voutput_max, vacc0); - vacc1 = _mm512_min_ph(voutput_max, vacc1); - - _mm512_storeu_ph(o, vacc0); - _mm512_storeu_ph(o + 32, vacc1); - o += 64; - } - for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { - __m512h va = _mm512_loadu_ph(a); - a += 32; - - __m512h vacc = _mm512_sub_ph(va, vb); - - vacc = _mm512_max_ph(voutput_min, vacc); - vacc = _mm512_min_ph(voutput_max, vacc); - - 
_mm512_storeu_ph(o, vacc); - o += 32; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch >= 1 * sizeof(uint16_t)); - assert(batch <= 31 * sizeof(uint16_t)); - // Prepare mask for valid 16-bit elements (depends on batch). - batch >>= XNN_LOG2_SIZEOF_HALF; - const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - __m512h va = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, a)); - - __m512h vacc = _mm512_maskz_sub_ph(vmask, va, vb); - - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); - _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); - } -#endif // defined(__AVX512FP16__) -} diff --git a/src/f16-vbinary/gen/f16-vsubc-minmax-neonfp16arith-u16.c b/src/f16-vbinary/gen/f16-vsubc-neonfp16arith-u16.c similarity index 75% rename from src/f16-vbinary/gen/f16-vsubc-minmax-neonfp16arith-u16.c rename to src/f16-vbinary/gen/f16-vsubc-neonfp16arith-u16.c index 9490b7a4d12..e023e42dd08 100644 --- a/src/f16-vbinary/gen/f16-vsubc-minmax-neonfp16arith-u16.c +++ b/src/f16-vbinary/gen/f16-vsubc-neonfp16arith-u16.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vsubc_minmax_ukernel__neonfp16arith_u16( +void xnn_f16_vsubc_ukernel__neonfp16arith_u16( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -32,9 +32,6 @@ void xnn_f16_vsubc_minmax_ukernel__neonfp16arith_u16( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - 
const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; @@ -44,12 +41,6 @@ void xnn_f16_vsubc_minmax_ukernel__neonfp16arith_u16( float16x8_t vy456789AB = vsubq_f16(va456789AB, vb); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy456789AB = vmaxq_f16(vy456789AB, vy_min); - - vy01234567 = vminq_f16(vy01234567, vy_max); - vy456789AB = vminq_f16(vy456789AB, vy_max); - vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; vst1q_u16(o, vreinterpretq_u16_f16(vy456789AB)); o += 8; } @@ -57,16 +48,12 @@ void xnn_f16_vsubc_minmax_ukernel__neonfp16arith_u16( const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; float16x8_t vy01234567 = vsubq_f16(va01234567, vb); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); float16x8_t vy01234567 = vsubq_f16(va01234567, vb); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/gen/f16-vsubc-minmax-neonfp16arith-u8.c b/src/f16-vbinary/gen/f16-vsubc-neonfp16arith-u8.c similarity index 76% rename from src/f16-vbinary/gen/f16-vsubc-minmax-neonfp16arith-u8.c rename to src/f16-vbinary/gen/f16-vsubc-neonfp16arith-u8.c index 0a8d26bce04..16efdaddd53 100644 --- a/src/f16-vbinary/gen/f16-vsubc-minmax-neonfp16arith-u8.c +++ b/src/f16-vbinary/gen/f16-vsubc-neonfp16arith-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f16_vsubc_minmax_ukernel__neonfp16arith_u8( +void xnn_f16_vsubc_ukernel__neonfp16arith_u8( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* 
restrict output, - const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -32,24 +32,17 @@ void xnn_f16_vsubc_minmax_ukernel__neonfp16arith_u8( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - const float16x8_t vb = vreinterpretq_f16_u16(vld1q_dup_u16(b)); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); a += 8; float16x8_t vy01234567 = vsubq_f16(va01234567, vb); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { const float16x8_t va01234567 = vreinterpretq_f16_u16(vld1q_u16(a)); float16x8_t vy01234567 = vsubq_f16(va01234567, vb); - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/vop-avx512fp16.c.in b/src/f16-vbinary/vop-avx512fp16.c.in index ac72def1f19..df8c8e55ea0 100644 --- a/src/f16-vbinary/vop-avx512fp16.c.in +++ b/src/f16-vbinary/vop-avx512fp16.c.in @@ -8,7 +8,6 @@ $assert BATCH_TILE >= 32 $SIMD_TILE = BATCH_TILE // 32 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $assert OP in ["ADD", "DIV", "MAX", "MIN", "MUL", "SUB", "SQRDIFF", "PRELU"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include #include @@ -37,14 +36,12 @@ $ "SUB": "_mm512_maskz_sub_ph", $ "SQRDIFF": "_mm512_maskz_sub_ph", $ "PRELU": "_mm512_maskz_mul_ph", $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": 
"_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f16_default_params", "MINMAX": "union xnn_f16_minmax_params"}[ACTIVATION] -void xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__avx512fp16_u${BATCH_TILE}( +void xnn_f16_v${OP.lower()}_ukernel__avx512fp16_u${BATCH_TILE}( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -57,10 +54,6 @@ void xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__avx512fp16_u${BATCH_TILE}( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - $if ACTIVATION == "MINMAX": - const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - $if OP == "PRELU": const __m512h vzero = _mm512_setzero_ph(); @@ -86,13 +79,6 @@ void xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__avx512fp16_u${BATCH_TILE}( $for N in range(SIMD_TILE): vacc${ABC[N]} = _mm512_mul_ph(vacc${ABC[N]}, vacc${ABC[N]}); - $if ACTIVATION == "MINMAX": - $for N in range(SIMD_TILE): - vacc${ABC[N]} = _mm512_max_ph(voutput_min, vacc${ABC[N]}); - - $for N in range(SIMD_TILE): - vacc${ABC[N]} = _mm512_min_ph(voutput_max, vacc${ABC[N]}); - _mm512_storeu_ph(o, vacc${ABC[0]}); $for N in range(1, SIMD_TILE): _mm512_storeu_ph(o + ${N * 32}, vacc${ABC[N]}); @@ -113,10 +99,6 @@ void xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__avx512fp16_u${BATCH_TILE}( $if OP == "SQRDIFF": vacc = _mm512_mul_ph(vacc, vacc); - $if ACTIVATION == "MINMAX": - vacc = _mm512_max_ph(voutput_min, vacc); - vacc = _mm512_min_ph(voutput_max, vacc); - _mm512_storeu_ph(o, vacc); o += 32; } @@ -138,9 +120,6 @@ void xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__avx512fp16_u${BATCH_TILE}( $if OP == "SQRDIFF": vacc = 
_mm512_maskz_mul_ph(vmask, vacc, vacc); - $if ACTIVATION == "MINMAX": - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); } #endif // defined(__AVX512FP16__) diff --git a/src/f16-vbinary/vop-f16c.c.in b/src/f16-vbinary/vop-f16c.c.in index a5208c50988..69a1958ac6f 100644 --- a/src/f16-vbinary/vop-f16c.c.in +++ b/src/f16-vbinary/vop-f16c.c.in @@ -7,7 +7,6 @@ $assert BATCH_TILE % 8 == 0 $assert BATCH_TILE >= 8 $ABC = "01234567456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $assert OP in ["ADD", "DIV", "MAX", "MIN", "MUL", "SUB", "SQRDIFF", "PRELU"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include #include @@ -27,14 +26,12 @@ $ "SUB": lambda x, y: "_mm256_sub_ps(%s, %s)" % (x, y), $ "SQRDIFF": lambda x, y: "_mm256_sub_ps(%s, %s)" % (x, y), $ "PRELU": lambda x, y: "_mm256_mul_ps(%s, %s)" % (x, y), $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f16_default_params", "MINMAX": "union xnn_f16_minmax_params"}[ACTIVATION] -void xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__f16c_u${BATCH_TILE}( +void xnn_f16_v${OP.lower()}_ukernel__f16c_u${BATCH_TILE}( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -46,12 +43,6 @@ void xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__f16c_u${BATCH_TILE}( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - $if ACTIVATION == "MINMAX": - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vy_min); - 
XNN_FORCE_REALIZATION(vy_max); - $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(uint16_t); batch -= ${BATCH_TILE} * sizeof(uint16_t)) { const __m256 va${ABC[0:8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a)); @@ -72,13 +63,6 @@ void xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__f16c_u${BATCH_TILE}( $for N in range(0, BATCH_TILE, 8): vy${ABC[N:N+8]} = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(va${ABC[N:N+8]}, vy${ABC[N:N+8]}, va${ABC[N:N+8]}), _MM_FROUND_TO_NEAREST_INT)); - $if ACTIVATION == "MINMAX": - $for N in range(0, BATCH_TILE, 8): - vy${ABC[N:N+8]} = _mm256_max_ps(vy${ABC[N:N+8]}, vy_min); - - $for N in range(0, BATCH_TILE, 8): - vy${ABC[N:N+8]} = _mm256_min_ps(vy${ABC[N:N+8]}, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy${ABC[0:8]}, _MM_FROUND_TO_NEAREST_INT)); $for N in range(8, BATCH_TILE, 8): _mm_storeu_si128((__m128i*) (o + ${N}), _mm256_cvtps_ph(vy${ABC[N:N+8]}, _MM_FROUND_TO_NEAREST_INT)); @@ -96,10 +80,6 @@ void xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__f16c_u${BATCH_TILE}( $elif OP == "PRELU": vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(va, vy, va), _MM_FROUND_TO_NEAREST_INT)); - $if ACTIVATION == "MINMAX": - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -113,10 +93,6 @@ void xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__f16c_u${BATCH_TILE}( $elif OP == "PRELU": vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(va, vy, va), _MM_FROUND_TO_NEAREST_INT)); - $if ACTIVATION == "MINMAX": - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/vop-fp16arith.c.in b/src/f16-vbinary/vop-fp16arith.c.in index f8a52899a7e..92bfd993e43 100644 --- a/src/f16-vbinary/vop-fp16arith.c.in +++ b/src/f16-vbinary/vop-fp16arith.c.in @@ 
-6,10 +6,7 @@ $assert BATCH_TILE >= 1 $ABC = "01234567456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $assert OP in ["ADD", "DIV", "MAX", "MIN", "MUL", "SUB", "SQRDIFF"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include -$if ACTIVATION == "MINMAX": - #include #include @@ -27,14 +24,12 @@ $ "MUL": lambda x, y: "vmulh_f16(%s, %s)" % (x, y), $ "SUB": lambda x, y: "vsubh_f16(%s, %s)" % (x, y), $ "SQRDIFF": lambda x, y: "vsubh_f16(%s, %s)" % (x, y), $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f16_default_params", "MINMAX": "union xnn_f16_minmax_params"}[ACTIVATION] -void xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__fp16arith_u${BATCH_TILE}( +void xnn_f16_v${OP.lower()}_ukernel__fp16arith_u${BATCH_TILE}( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -46,11 +41,6 @@ void xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__fp16arith_u${BATCH_TILE}( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - $if ACTIVATION == "MINMAX": - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - $if BATCH_TILE > 1: for (; batch >= ${BATCH_TILE} * sizeof(float16_t); batch -= ${BATCH_TILE} * sizeof(float16_t)) { $for N in range(BATCH_TILE): @@ -66,13 +56,6 @@ void xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__fp16arith_u${BATCH_TILE}( $for N in range(BATCH_TILE): vacc${ABC[N]} = vmulh_f16(vacc${ABC[N]}, vacc${ABC[N]}); - $if ACTIVATION == "MINMAX": - $for N in range(BATCH_TILE): - vacc${ABC[N]} = vmaxnmh_f16(vacc${ABC[N]}, vy_min); - - $for N in range(BATCH_TILE): - vacc${ABC[N]} = vminnmh_f16(vacc${ABC[N]}, vy_max); - $for N in range(BATCH_TILE): *o++ = vacc${ABC[N]}; } @@ -84,9 +67,6 @@ 
void xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__fp16arith_u${BATCH_TILE}( float16_t vacc = ${VOPH_F16("va", "vb")}; $if OP == "SQRDIFF": vacc = vmulh_f16(vacc, vacc); - $if ACTIVATION == "MINMAX": - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); @@ -96,9 +76,6 @@ void xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__fp16arith_u${BATCH_TILE}( float16_t vacc = ${VOPH_F16("va", "vb")}; $if OP == "SQRDIFF": vacc = vmulh_f16(vacc, vacc); - $if ACTIVATION == "MINMAX": - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o = vacc; } $else: @@ -108,9 +85,6 @@ void xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__fp16arith_u${BATCH_TILE}( float16_t vacc = ${VOPH_F16("va", "vb")}; $if OP == "SQRDIFF": vacc = vmulh_f16(vacc, vacc); - $if ACTIVATION == "MINMAX": - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/vop-neonfp16arith.c.in b/src/f16-vbinary/vop-neonfp16arith.c.in index a284825d324..9114ff17cf6 100644 --- a/src/f16-vbinary/vop-neonfp16arith.c.in +++ b/src/f16-vbinary/vop-neonfp16arith.c.in @@ -7,7 +7,6 @@ $assert BATCH_TILE % 8 == 0 $assert BATCH_TILE >= 8 $ABC = "01234567456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $assert OP in ["ADD", "DIV", "MAX", "MIN", "MUL", "SUB", "SQRDIFF", "PRELU"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include #include @@ -26,15 +25,13 @@ $ "SUB": lambda x, y: "vsubq_f16(%s, %s)" % (x, y), $ "SQRDIFF": lambda x, y: "vsubq_f16(%s, %s)" % (x, y), $ "PRELU": lambda x, y: "vmulq_f16(%s, %s)" % (x, y), $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f16_default_params", "MINMAX": "union xnn_f16_minmax_params"}[ACTIVATION] $ISA = "aarch64_neonfp16arith" if OP == "DIV" else "neonfp16arith" -void xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( +void 
xnn_f16_v${OP.lower()}_ukernel__${ISA}_u${BATCH_TILE}( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -46,10 +43,6 @@ void xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - $if ACTIVATION == "MINMAX": - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(uint16_t); batch -= ${BATCH_TILE} * sizeof(uint16_t)) { $for N in range(0, BATCH_TILE, 8): @@ -69,13 +62,6 @@ void xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( $for N in range(0, BATCH_TILE, 8): vy${ABC[N:N+8]} = vbslq_f16(vm${ABC[N:N+8]}, vy${ABC[N:N+8]}, va${ABC[N:N+8]}); - $if ACTIVATION == "MINMAX": - $for N in range(0, BATCH_TILE, 8): - vy${ABC[N:N+8]} = vmaxq_f16(vy${ABC[N:N+8]}, vy_min); - - $for N in range(0, BATCH_TILE, 8): - vy${ABC[N:N+8]} = vminq_f16(vy${ABC[N:N+8]}, vy_max); - $for N in range(0, BATCH_TILE, 8): vst1q_u16(o, vreinterpretq_u16_f16(vy${ABC[N:N+8]})); o += 8; } @@ -89,9 +75,6 @@ void xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( $elif OP == "PRELU": const uint16x8_t vm01234567 = vcltq_s16(vreinterpretq_s16_f16(va01234567), vmovq_n_s16(0)); vy01234567 = vbslq_f16(vm01234567, vy01234567, va01234567); - $if ACTIVATION == "MINMAX": - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { @@ -104,9 +87,6 @@ void 
xnn_f16_v${OP.lower()}${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( $elif OP == "PRELU": const uint16x8_t vm01234567 = vcltq_s16(vreinterpretq_s16_f16(va01234567), vmovq_n_s16(0)); vy01234567 = vbslq_f16(vm01234567, vy01234567, va01234567); - $if ACTIVATION == "MINMAX": - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f16-vbinary/vopc-avx512fp16.c.in b/src/f16-vbinary/vopc-avx512fp16.c.in index 9f23bab06cc..6fe805b952f 100644 --- a/src/f16-vbinary/vopc-avx512fp16.c.in +++ b/src/f16-vbinary/vopc-avx512fp16.c.in @@ -8,7 +8,6 @@ $assert BATCH_TILE >= 32 $SIMD_TILE = BATCH_TILE // 32 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $assert OP in ["ADD", "DIV", "RDIV", "MAX", "MIN", "MUL", "SUB", "RSUB", "SQRDIFF", "PRELU", "RPRELU"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include #include @@ -44,14 +43,12 @@ $ "SQRDIFF": lambda m, x: "_mm512_maskz_sub_ph(%s, %s, vb)" % (m, x), $ "PRELU": lambda m, x: "_mm512_maskz_mul_ph(%s, %s, vb)" % (m, x), $ "RPRELU": lambda m, x: "_mm512_maskz_mul_ph(%s, %s, vb)" % (m, x), $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f16_default_params", "MINMAX": "union xnn_f16_minmax_params"}[ACTIVATION] -void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__avx512fp16_u${BATCH_TILE}( +void xnn_f16_v${OP.lower()}c_ukernel__avx512fp16_u${BATCH_TILE}( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -64,9 +61,6 @@ void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__avx512fp16_u${BATCH_TILE}( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - $if ACTIVATION == "MINMAX": - 
const __m512h voutput_min = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m512h voutput_max = _mm512_castsi512_ph(_mm512_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); const __m512h vb = _mm512_castsi512_ph(_mm512_set1_epi16(*b)); $if OP == "PRELU": @@ -96,13 +90,6 @@ void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__avx512fp16_u${BATCH_TILE}( $for N in range(SIMD_TILE): vacc${ABC[N]} = _mm512_mul_ph(vacc${ABC[N]}, vacc${ABC[N]}); - $if ACTIVATION == "MINMAX": - $for N in range(SIMD_TILE): - vacc${ABC[N]} = _mm512_max_ph(voutput_min, vacc${ABC[N]}); - - $for N in range(SIMD_TILE): - vacc${ABC[N]} = _mm512_min_ph(voutput_max, vacc${ABC[N]}); - _mm512_storeu_ph(o, vacc${ABC[0]}); $for N in range(1, SIMD_TILE): _mm512_storeu_ph(o + ${N * 32}, vacc${ABC[N]}); @@ -123,10 +110,6 @@ void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__avx512fp16_u${BATCH_TILE}( $if OP == "SQRDIFF": vacc = _mm512_mul_ph(vacc, vacc); - $if ACTIVATION == "MINMAX": - vacc = _mm512_max_ph(voutput_min, vacc); - vacc = _mm512_min_ph(voutput_max, vacc); - _mm512_storeu_ph(o, vacc); o += 32; } @@ -149,9 +132,6 @@ void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__avx512fp16_u${BATCH_TILE}( $if OP == "SQRDIFF": vacc = _mm512_maskz_mul_ph(vmask, vacc, vacc); - $if ACTIVATION == "MINMAX": - vacc = _mm512_maskz_max_ph(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ph(vmask, voutput_max, vacc); _mm512_mask_storeu_epi16(o, vmask, _mm512_castph_si512(vacc)); } #endif // defined(__AVX512FP16__) diff --git a/src/f16-vbinary/vopc-f16c.c.in b/src/f16-vbinary/vopc-f16c.c.in index 67de03d21ec..bc87bdeb18a 100644 --- a/src/f16-vbinary/vopc-f16c.c.in +++ b/src/f16-vbinary/vopc-f16c.c.in @@ -7,7 +7,6 @@ $assert BATCH_TILE % 8 == 0 $assert BATCH_TILE >= 8 $ABC = "01234567456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $assert OP in ["ADD", "DIV", "RDIV", "MAX", "MIN", "MUL", "SUB", "RSUB", "SQRDIFF", "PRELU", "RPRELU"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include #include @@ -30,14 
+29,12 @@ $ "SQRDIFF": lambda x: "_mm256_sub_ps(%s, vb)" % x, $ "PRELU": lambda x: "_mm256_mul_ps(%s, vb)" % x, $ "RPRELU": lambda x: "_mm256_mul_ps(%s, vb)" % x, $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f16_default_params", "MINMAX": "union xnn_f16_minmax_params"}[ACTIVATION] -void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__f16c_u${BATCH_TILE}( +void xnn_f16_v${OP.lower()}c_ukernel__f16c_u${BATCH_TILE}( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -49,12 +46,6 @@ void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__f16c_u${BATCH_TILE}( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - $if ACTIVATION == "MINMAX": - const __m256 vy_min = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vy_max = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vy_min); - XNN_FORCE_REALIZATION(vy_max); - const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b)); $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(uint16_t); batch -= ${BATCH_TILE} * sizeof(uint16_t)) { @@ -76,13 +67,6 @@ void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__f16c_u${BATCH_TILE}( $for N in range(0, BATCH_TILE, 8): vy${ABC[N:N+8]} = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(vb, vy${ABC[N:N+8]}, vb), _MM_FROUND_TO_NEAREST_INT)); - $if ACTIVATION == "MINMAX": - $for N in range(0, BATCH_TILE, 8): - vy${ABC[N:N+8]} = _mm256_max_ps(vy${ABC[N:N+8]}, vy_min); - - $for N in range(0, BATCH_TILE, 8): - vy${ABC[N:N+8]} = _mm256_min_ps(vy${ABC[N:N+8]}, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy${ABC[0:8]}, _MM_FROUND_TO_NEAREST_INT)); 
$for N in range(8, BATCH_TILE, 8): _mm_storeu_si128((__m128i*) (o + ${N}), _mm256_cvtps_ph(vy${ABC[N:N+8]}, _MM_FROUND_TO_NEAREST_INT)); @@ -100,10 +84,6 @@ void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__f16c_u${BATCH_TILE}( $elif OP == "RPRELU": vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(vb, vy, vb), _MM_FROUND_TO_NEAREST_INT)); - $if ACTIVATION == "MINMAX": - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT)); o += 8; } @@ -118,10 +98,6 @@ void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__f16c_u${BATCH_TILE}( $elif OP == "RPRELU": vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_blendv_ps(vb, vy, vb), _MM_FROUND_TO_NEAREST_INT)); - $if ACTIVATION == "MINMAX": - vy = _mm256_max_ps(vy, vy_min); - vy = _mm256_min_ps(vy, vy_max); - __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_TO_NEAREST_INT); if (batch & (4 * sizeof(uint16_t))) { _mm_storel_epi64((__m128i*) o, vh); diff --git a/src/f16-vbinary/vopc-fp16arith.c.in b/src/f16-vbinary/vopc-fp16arith.c.in index 540fece22ae..7ace2da9d23 100644 --- a/src/f16-vbinary/vopc-fp16arith.c.in +++ b/src/f16-vbinary/vopc-fp16arith.c.in @@ -6,10 +6,7 @@ $assert BATCH_TILE >= 1 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $assert OP in ["ADD", "DIV", "RDIV", "MAX", "MIN", "MUL", "SUB", "RSUB", "SQRDIFF"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include -$if ACTIVATION == "MINMAX": - #include #include @@ -30,14 +27,12 @@ $ "SUB": lambda x: "vsubh_f16(%s, vb)" % x, $ "RSUB": lambda x: "vsubh_f16(vb, %s)" % x, $ "SQRDIFF": lambda x: "vsubh_f16(%s, vb)" % x, $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f16_default_params", "MINMAX": "union xnn_f16_minmax_params"}[ACTIVATION] -void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__fp16arith_u${BATCH_TILE}( +void xnn_f16_v${OP.lower()}c_ukernel__fp16arith_u${BATCH_TILE}( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* 
restrict input_b, xnn_float16* restrict output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float16_t) == 0); @@ -49,11 +44,6 @@ void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__fp16arith_u${BATCH_TILE}( const float16_t* b = (const float16_t*) input_b; float16_t* o = (float16_t*) output; - $if ACTIVATION == "MINMAX": - float16_t vy_min, vy_max; - memcpy(&vy_min, ¶ms->scalar.min, sizeof(vy_min)); - memcpy(&vy_max, ¶ms->scalar.max, sizeof(vy_max)); - const float16_t vb = *b; $if BATCH_TILE > 1: for (; batch >= ${BATCH_TILE} * sizeof(float16_t); batch -= ${BATCH_TILE} * sizeof(float16_t)) { @@ -68,13 +58,6 @@ void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__fp16arith_u${BATCH_TILE}( $for N in range(BATCH_TILE): vacc${ABC[N]} = vmulh_f16(vacc${ABC[N]}, vacc${ABC[N]}); - $if ACTIVATION == "MINMAX": - $for N in range(BATCH_TILE): - vacc${ABC[N]} = vmaxnmh_f16(vacc${ABC[N]}, vy_min); - - $for N in range(BATCH_TILE): - vacc${ABC[N]} = vminnmh_f16(vacc${ABC[N]}, vy_max); - $for N in range(BATCH_TILE): o[${N}] = vacc${ABC[N]}; o += ${BATCH_TILE}; @@ -86,9 +69,6 @@ void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__fp16arith_u${BATCH_TILE}( vacc = ${VOPH_F16("vacc")}; $if OP == "SQRDIFF": vacc = vmulh_f16(vacc, vacc); - $if ACTIVATION == "MINMAX": - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); @@ -97,9 +77,6 @@ void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__fp16arith_u${BATCH_TILE}( vacc = ${VOPH_F16("vacc")}; $if OP == "SQRDIFF": vacc = vmulh_f16(vacc, vacc); - $if ACTIVATION == "MINMAX": - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o = vacc; } $else: @@ -108,9 +85,6 @@ void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__fp16arith_u${BATCH_TILE}( vacc = ${VOPH_F16("vacc")}; $if OP == "SQRDIFF": vacc = vmulh_f16(vacc, vacc); - $if ACTIVATION == 
"MINMAX": - vacc = vmaxnmh_f16(vacc, vy_min); - vacc = vminnmh_f16(vacc, vy_max); *o++ = vacc; batch -= sizeof(float16_t); } while (batch != 0); diff --git a/src/f16-vbinary/vopc-neonfp16arith.c.in b/src/f16-vbinary/vopc-neonfp16arith.c.in index 609e24a14f5..5f7a45c12a1 100644 --- a/src/f16-vbinary/vopc-neonfp16arith.c.in +++ b/src/f16-vbinary/vopc-neonfp16arith.c.in @@ -7,7 +7,6 @@ $assert BATCH_TILE % 8 == 0 $assert BATCH_TILE >= 8 $ABC = "01234567456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $assert OP in ["ADD", "DIV", "RDIV", "MAX", "MIN", "MUL", "SUB", "RSUB", "SQRDIFF", "PRELU", "RPRELU"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include #include @@ -29,15 +28,13 @@ $ "SQRDIFF": lambda x: "vsubq_f16(%s, vb)" % x, $ "PRELU": lambda x: "vmulq_f16(%s, vb)" % x, $ "RPRELU": lambda x: "vmulq_f16(%s, vb)" % x, $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f16_default_params", "MINMAX": "union xnn_f16_minmax_params"}[ACTIVATION] $ISA = "aarch64_neonfp16arith" if OP in ["DIV", "RDIV"] else "neonfp16arith" -void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( +void xnn_f16_v${OP.lower()}c_ukernel__${ISA}_u${BATCH_TILE}( size_t batch, const xnn_float16* restrict input_a, const xnn_float16* restrict input_b, xnn_float16* restrict output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -49,10 +46,6 @@ void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( const uint16_t* b = (const uint16_t*) input_b; uint16_t* o = (uint16_t*) output; - $if ACTIVATION == "MINMAX": - const float16x8_t vy_min = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vy_max = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - const float16x8_t vb = 
vreinterpretq_f16_u16(vld1q_dup_u16(b)); $if OP == "RPRELU": const uint16x8_t vm = vcltq_s16(vreinterpretq_s16_f16(vb), vmovq_n_s16(0)); @@ -77,13 +70,6 @@ void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( $for N in range(0, BATCH_TILE, 8): vy${ABC[N:N+8]} = vbslq_f16(vm, vy${ABC[N:N+8]}, vb); - $if ACTIVATION == "MINMAX": - $for N in range(0, BATCH_TILE, 8): - vy${ABC[N:N+8]} = vmaxq_f16(vy${ABC[N:N+8]}, vy_min); - - $for N in range(0, BATCH_TILE, 8): - vy${ABC[N:N+8]} = vminq_f16(vy${ABC[N:N+8]}, vy_max); - $for N in range(0, BATCH_TILE, 8): vst1q_u16(o, vreinterpretq_u16_f16(vy${ABC[N:N+8]})); o += 8; } @@ -98,9 +84,6 @@ void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( vy01234567 = vbslq_f16(vm01234567, vy01234567, va01234567); $elif OP == "RPRELU": vy01234567 = vbslq_f16(vm, vy01234567, vb); - $if ACTIVATION == "MINMAX": - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); vst1q_u16(o, vreinterpretq_u16_f16(vy01234567)); o += 8; } if XNN_UNLIKELY(batch != 0) { @@ -114,9 +97,6 @@ void xnn_f16_v${OP.lower()}c${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( vy01234567 = vbslq_f16(vm01234567, vy01234567, va01234567); $elif OP == "RPRELU": vy01234567 = vbslq_f16(vm, vy01234567, vb); - $if ACTIVATION == "MINMAX": - vy01234567 = vmaxq_f16(vy01234567, vy_min); - vy01234567 = vminq_f16(vy01234567, vy_max); float16x4_t vy0123 = vget_low_f16(vy01234567); if (batch & (4 * sizeof(uint16_t))) { diff --git a/src/f32-vbinary/f32-vadd-minmax.h b/src/f32-vbinary/f32-vadd-minmax.h deleted file mode 100644 index 65ea4073234..00000000000 --- a/src/f32-vbinary/f32-vadd-minmax.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#ifndef XNN_UKERNEL_WITH_PARAMS -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) -#define XNN_DEFINED_UKERNEL_WITH_PARAMS -#endif - -#ifndef XNN_UKERNEL -#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ - XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) -#define XNN_DEFINED_UKERNEL -#endif - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vadd_minmax_ukernel__neon_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vadd_minmax_ukernel__neon_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - -#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vadd_minmax_ukernel__rvv_u4v, 4, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vadd_minmax_ukernel__rvv_u8v, 8, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_minmax_ukernel__sse_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_minmax_ukernel__sse_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vadd_minmax_ukernel__avx_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vadd_minmax_ukernel__avx_u16, 16, false, float, union xnn_f32_minmax_params, 
xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vadd_minmax_ukernel__avx512f_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vadd_minmax_ukernel__avx512f_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_minmax_ukernel__wasm_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_minmax_ukernel__wasm_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_minmax_ukernel__wasm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) 
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_minmax_ukernel__wasm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -#if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vadd_minmax_ukernel__hvx_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vadd_minmax_ukernel__hvx_u64, 64, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vadd_minmax_ukernel__hvx_u128, 128, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) - -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_minmax_ukernel__scalar_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_minmax_ukernel__scalar_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_minmax_ukernel__scalar_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_minmax_ukernel__scalar_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) - -#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_UKERNEL_WITH_PARAMS -#endif - -#ifdef XNN_DEFINED_UKERNEL -#undef XNN_DEFINED_UKERNEL -#undef XNN_UKERNEL -#endif diff --git a/src/f32-vbinary/f32-vadd.h b/src/f32-vbinary/f32-vadd.h index 8f75d7336a7..c0ab4527528 100644 --- a/src/f32-vbinary/f32-vadd.h +++ b/src/f32-vbinary/f32-vadd.h @@ -16,12 +16,44 @@ #endif +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vadd_ukernel__neon_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) 
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vadd_ukernel__neon_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vadd_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vadd_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_ukernel__sse_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vadd_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vadd_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vadd_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vadd_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_ukernel__wasmsimd_u8, 8, false, float, struct 
xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_ukernel__wasmsimd_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_ukernel__wasm_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_ukernel__wasm_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_ukernel__wasm_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_ukernel__wasm_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vadd_ukernel__hvx_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vadd_ukernel__hvx_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vadd_ukernel__hvx_u128, 128, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) + XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, 
((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vbinary/f32-vaddc-minmax.h b/src/f32-vbinary/f32-vaddc-minmax.h deleted file mode 100644 index bf2021b1c97..00000000000 --- a/src/f32-vbinary/f32-vaddc-minmax.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#ifndef XNN_UKERNEL_WITH_PARAMS -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) -#define XNN_DEFINED_UKERNEL_WITH_PARAMS -#endif - -#ifndef XNN_UKERNEL -#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ - XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) -#define XNN_DEFINED_UKERNEL -#endif - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vaddc_minmax_ukernel__neon_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vaddc_minmax_ukernel__neon_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - -#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vaddc_minmax_ukernel__rvv_u4v, 4, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vaddc_minmax_ukernel__rvv_u8v, 8, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_minmax_ukernel__sse_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) 
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_minmax_ukernel__sse_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vaddc_minmax_ukernel__avx_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vaddc_minmax_ukernel__avx_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vaddc_minmax_ukernel__avx512f_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vaddc_minmax_ukernel__avx512f_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_minmax_ukernel__wasm_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_minmax_ukernel__wasm_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_minmax_ukernel__wasm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_minmax_ukernel__wasm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -#if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vaddc_minmax_ukernel__hvx_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vaddc_minmax_ukernel__hvx_u64, 64, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vaddc_minmax_ukernel__hvx_u128, 128, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) - -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_minmax_ukernel__scalar_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_minmax_ukernel__scalar_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_minmax_ukernel__scalar_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_minmax_ukernel__scalar_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) - -#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_UKERNEL_WITH_PARAMS -#endif - -#ifdef XNN_DEFINED_UKERNEL 
-#undef XNN_DEFINED_UKERNEL -#undef XNN_UKERNEL -#endif diff --git a/src/f32-vbinary/f32-vaddc.h b/src/f32-vbinary/f32-vaddc.h index 0729ede7710..ca2080ac180 100644 --- a/src/f32-vbinary/f32-vaddc.h +++ b/src/f32-vbinary/f32-vaddc.h @@ -16,12 +16,44 @@ #endif +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vaddc_ukernel__neon_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vaddc_ukernel__neon_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vaddc_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vaddc_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_ukernel__sse_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vaddc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vaddc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vaddc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, 
xnn_f32_vaddc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_ukernel__wasmsimd_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_ukernel__wasm_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_ukernel__wasm_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_ukernel__wasm_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_ukernel__wasm_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vaddc_ukernel__hvx_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vaddc_ukernel__hvx_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vaddc_ukernel__hvx_u128, 128, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) 
NULL)) +#endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) + XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vbinary/f32-vdiv-minmax.h b/src/f32-vbinary/f32-vdiv-minmax.h deleted file mode 100644 index 1ecf23b49be..00000000000 --- a/src/f32-vbinary/f32-vdiv-minmax.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#ifndef XNN_UKERNEL_WITH_PARAMS -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) -#define XNN_DEFINED_UKERNEL_WITH_PARAMS -#endif - -#ifndef XNN_UKERNEL -#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ - XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) -#define XNN_DEFINED_UKERNEL -#endif - - -#if XNN_ARCH_ARM64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vdiv_minmax_ukernel__aarch64_neon_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vdiv_minmax_ukernel__aarch64_neon_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_ARM64 - -#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vdiv_minmax_ukernel__rvv_u4v, 4, true, float, union xnn_f32_minmax_params, 
xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vdiv_minmax_ukernel__rvv_u8v, 8, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_minmax_ukernel__sse_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_minmax_ukernel__sse_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vdiv_minmax_ukernel__avx_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vdiv_minmax_ukernel__avx_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vdiv_minmax_ukernel__avx512f_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vdiv_minmax_ukernel__avx512f_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, 
xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_minmax_ukernel__wasm_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_minmax_ukernel__wasm_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_minmax_ukernel__wasm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_minmax_ukernel__wasm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_minmax_ukernel__scalar_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_minmax_ukernel__scalar_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_minmax_ukernel__scalar_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_minmax_ukernel__scalar_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) - -#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_UKERNEL_WITH_PARAMS -#endif - -#ifdef XNN_DEFINED_UKERNEL -#undef XNN_DEFINED_UKERNEL -#undef XNN_UKERNEL -#endif diff --git a/src/f32-vbinary/f32-vdiv.h b/src/f32-vbinary/f32-vdiv.h index 654798b79aa..23937909a3f 100644 --- 
a/src/f32-vbinary/f32-vdiv.h +++ b/src/f32-vbinary/f32-vdiv.h @@ -16,12 +16,38 @@ #endif +#if XNN_ARCH_ARM64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vdiv_ukernel__aarch64_neon_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vdiv_ukernel__aarch64_neon_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_ARM64 + +#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vdiv_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vdiv_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_ukernel__sse_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vdiv_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vdiv_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vdiv_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vdiv_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + #if XNN_ARCH_WASMSIMD || 
XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_ukernel__wasmsimd_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_ukernel__wasm_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_ukernel__wasm_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_ukernel__wasm_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_ukernel__wasm_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vbinary/f32-vdivc-minmax.h b/src/f32-vbinary/f32-vdivc-minmax.h deleted file mode 100644 index 94678bff6c7..00000000000 --- a/src/f32-vbinary/f32-vdivc-minmax.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed 
under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#ifndef XNN_UKERNEL_WITH_PARAMS -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) -#define XNN_DEFINED_UKERNEL_WITH_PARAMS -#endif - -#ifndef XNN_UKERNEL -#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ - XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) -#define XNN_DEFINED_UKERNEL -#endif - - -#if XNN_ARCH_ARM64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vdivc_minmax_ukernel__aarch64_neon_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vdivc_minmax_ukernel__aarch64_neon_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_ARM64 - -#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vdivc_minmax_ukernel__rvv_u4v, 4, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vdivc_minmax_ukernel__rvv_u8v, 8, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_minmax_ukernel__sse_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_minmax_ukernel__sse_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vdivc_minmax_ukernel__avx_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, 
xnn_f32_vdivc_minmax_ukernel__avx_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vdivc_minmax_ukernel__avx512f_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vdivc_minmax_ukernel__avx512f_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_minmax_ukernel__wasm_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_minmax_ukernel__wasm_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_minmax_ukernel__wasm_u4, 4, 
false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_minmax_ukernel__wasm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_minmax_ukernel__scalar_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_minmax_ukernel__scalar_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_minmax_ukernel__scalar_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_minmax_ukernel__scalar_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) - -#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_UKERNEL_WITH_PARAMS -#endif - -#ifdef XNN_DEFINED_UKERNEL -#undef XNN_DEFINED_UKERNEL -#undef XNN_UKERNEL -#endif diff --git a/src/f32-vbinary/f32-vdivc.h b/src/f32-vbinary/f32-vdivc.h index 5a4383ad7bf..e776858bab1 100644 --- a/src/f32-vbinary/f32-vdivc.h +++ b/src/f32-vbinary/f32-vdivc.h @@ -16,12 +16,38 @@ #endif +#if XNN_ARCH_ARM64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vdivc_ukernel__aarch64_neon_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vdivc_ukernel__aarch64_neon_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_ARM64 + +#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vdivc_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, 
xnn_f32_vdivc_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_ukernel__sse_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vdivc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vdivc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vdivc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vdivc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_ukernel__wasmsimd_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_ukernel__wasm_u1, 1, false, float, struct xnn_f32_default_params, 
((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_ukernel__wasm_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_ukernel__wasm_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_ukernel__wasm_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vbinary/f32-vmul-minmax.h b/src/f32-vbinary/f32-vmul-minmax.h deleted file mode 100644 index 1df9282cc77..00000000000 --- a/src/f32-vbinary/f32-vmul-minmax.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#ifndef XNN_UKERNEL_WITH_PARAMS -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) -#define XNN_DEFINED_UKERNEL_WITH_PARAMS -#endif - -#ifndef XNN_UKERNEL -#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ - XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) -#define XNN_DEFINED_UKERNEL -#endif - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vmul_minmax_ukernel__neon_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vmul_minmax_ukernel__neon_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - -#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vmul_minmax_ukernel__rvv_u4v, 4, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vmul_minmax_ukernel__rvv_u8v, 8, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_minmax_ukernel__sse_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_minmax_ukernel__sse_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmul_minmax_ukernel__avx_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmul_minmax_ukernel__avx_u16, 16, false, float, union xnn_f32_minmax_params, 
xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmul_minmax_ukernel__avx512f_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmul_minmax_ukernel__avx512f_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_minmax_ukernel__wasm_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_minmax_ukernel__wasm_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_minmax_ukernel__wasm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) 
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_minmax_ukernel__wasm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -#if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vmul_minmax_ukernel__hvx_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vmul_minmax_ukernel__hvx_u64, 64, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vmul_minmax_ukernel__hvx_u128, 128, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) - -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_minmax_ukernel__scalar_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_minmax_ukernel__scalar_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_minmax_ukernel__scalar_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_minmax_ukernel__scalar_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) - -#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_UKERNEL_WITH_PARAMS -#endif - -#ifdef XNN_DEFINED_UKERNEL -#undef XNN_DEFINED_UKERNEL -#undef XNN_UKERNEL -#endif diff --git a/src/f32-vbinary/f32-vmul.h b/src/f32-vbinary/f32-vmul.h index f7e6ad56df2..437721a8d80 100644 --- a/src/f32-vbinary/f32-vmul.h +++ b/src/f32-vbinary/f32-vmul.h @@ -16,12 +16,44 @@ #endif +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vmul_ukernel__neon_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) 
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vmul_ukernel__neon_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vmul_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vmul_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_ukernel__sse_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmul_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmul_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmul_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmul_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_ukernel__wasmsimd_u8, 8, false, float, struct 
xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_ukernel__wasmsimd_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_ukernel__wasm_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_ukernel__wasm_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_ukernel__wasm_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_ukernel__wasm_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vmul_ukernel__hvx_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vmul_ukernel__hvx_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vmul_ukernel__hvx_u128, 128, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) + XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, 
((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vbinary/f32-vmulc-minmax.h b/src/f32-vbinary/f32-vmulc-minmax.h deleted file mode 100644 index aae986f594a..00000000000 --- a/src/f32-vbinary/f32-vmulc-minmax.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#ifndef XNN_UKERNEL_WITH_PARAMS -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) -#define XNN_DEFINED_UKERNEL_WITH_PARAMS -#endif - -#ifndef XNN_UKERNEL -#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ - XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) -#define XNN_DEFINED_UKERNEL -#endif - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vmulc_minmax_ukernel__neon_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vmulc_minmax_ukernel__neon_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - -#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vmulc_minmax_ukernel__rvv_u4v, 4, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vmulc_minmax_ukernel__rvv_u8v, 8, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_minmax_ukernel__sse_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) 
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_minmax_ukernel__sse_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmulc_minmax_ukernel__avx_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmulc_minmax_ukernel__avx_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmulc_minmax_ukernel__avx512f_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmulc_minmax_ukernel__avx512f_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_minmax_ukernel__wasm_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_minmax_ukernel__wasm_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_minmax_ukernel__wasm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_minmax_ukernel__wasm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -#if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vmulc_minmax_ukernel__hvx_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vmulc_minmax_ukernel__hvx_u64, 64, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vmulc_minmax_ukernel__hvx_u128, 128, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) - -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_minmax_ukernel__scalar_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_minmax_ukernel__scalar_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_minmax_ukernel__scalar_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_minmax_ukernel__scalar_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) - -#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_UKERNEL_WITH_PARAMS -#endif - -#ifdef XNN_DEFINED_UKERNEL 
-#undef XNN_DEFINED_UKERNEL -#undef XNN_UKERNEL -#endif diff --git a/src/f32-vbinary/f32-vmulc.h b/src/f32-vbinary/f32-vmulc.h index 67d023faeba..ad45df4c31c 100644 --- a/src/f32-vbinary/f32-vmulc.h +++ b/src/f32-vbinary/f32-vmulc.h @@ -16,23 +16,49 @@ #endif +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vmulc_ukernel__neon_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vmulc_ukernel__neon_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vmulc_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vmulc_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_ukernel__sse_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmulc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmulc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmulc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, 
xnn_f32_vmulc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_ukernel__wasmsimd_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_ukernel__wasm_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_ukernel__wasm_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_ukernel__wasm_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_ukernel__wasm_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vmulc_ukernel__hvx_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vmulc_ukernel__hvx_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vmulc_ukernel__hvx_u128, 128, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) 
NULL)) +#endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) + XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_ukernel__scalar_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vmulc_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vmulc_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) - - #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS #undef XNN_DEFINED_UKERNEL_WITH_PARAMS #undef XNN_UKERNEL_WITH_PARAMS diff --git a/src/f32-vbinary/f32-vrdivc-minmax.h b/src/f32-vbinary/f32-vrdivc-minmax.h deleted file mode 100644 index df2890e7b30..00000000000 --- a/src/f32-vbinary/f32-vrdivc-minmax.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#ifndef XNN_UKERNEL_WITH_PARAMS -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) -#define XNN_DEFINED_UKERNEL_WITH_PARAMS -#endif - -#ifndef XNN_UKERNEL -#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ - XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) -#define XNN_DEFINED_UKERNEL -#endif - - -#if XNN_ARCH_ARM64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrdivc_minmax_ukernel__aarch64_neon_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrdivc_minmax_ukernel__aarch64_neon_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_ARM64 - -#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrdivc_minmax_ukernel__rvv_u4v, 4, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrdivc_minmax_ukernel__rvv_u8v, 8, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_minmax_ukernel__sse_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_minmax_ukernel__sse_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrdivc_minmax_ukernel__avx_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrdivc_minmax_ukernel__avx_u16, 16, false, float, union xnn_f32_minmax_params, 
xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrdivc_minmax_ukernel__avx512f_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrdivc_minmax_ukernel__avx512f_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_minmax_ukernel__wasm_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_minmax_ukernel__wasm_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_minmax_ukernel__wasm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) 
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_minmax_ukernel__wasm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_minmax_ukernel__scalar_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_minmax_ukernel__scalar_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_minmax_ukernel__scalar_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_minmax_ukernel__scalar_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) - -#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_UKERNEL_WITH_PARAMS -#endif - -#ifdef XNN_DEFINED_UKERNEL -#undef XNN_DEFINED_UKERNEL -#undef XNN_UKERNEL -#endif diff --git a/src/f32-vbinary/f32-vrdivc.h b/src/f32-vbinary/f32-vrdivc.h index 20ab7ac8198..586ea49ede9 100644 --- a/src/f32-vbinary/f32-vrdivc.h +++ b/src/f32-vbinary/f32-vrdivc.h @@ -16,12 +16,38 @@ #endif +#if XNN_ARCH_ARM64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrdivc_ukernel__aarch64_neon_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrdivc_ukernel__aarch64_neon_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_ARM64 + +#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrdivc_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrdivc_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, 
((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_ukernel__sse_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrdivc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrdivc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrdivc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrdivc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_ukernel__wasmsimd_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_ukernel__wasm_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, 
xnn_f32_vrdivc_ukernel__wasm_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_ukernel__wasm_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_ukernel__wasm_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vbinary/f32-vrsubc-minmax.h b/src/f32-vbinary/f32-vrsubc-minmax.h deleted file mode 100644 index d411997c5d7..00000000000 --- a/src/f32-vbinary/f32-vrsubc-minmax.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#ifndef XNN_UKERNEL_WITH_PARAMS -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) -#define XNN_DEFINED_UKERNEL_WITH_PARAMS -#endif - -#ifndef XNN_UKERNEL -#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ - XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) -#define XNN_DEFINED_UKERNEL -#endif - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrsubc_minmax_ukernel__neon_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrsubc_minmax_ukernel__neon_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - -#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrsubc_minmax_ukernel__rvv_u4v, 4, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrsubc_minmax_ukernel__rvv_u8v, 8, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_minmax_ukernel__sse_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_minmax_ukernel__sse_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrsubc_minmax_ukernel__avx_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrsubc_minmax_ukernel__avx_u16, 16, false, float, union 
xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrsubc_minmax_ukernel__avx512f_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrsubc_minmax_ukernel__avx512f_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_minmax_ukernel__wasm_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_minmax_ukernel__wasm_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_minmax_ukernel__wasm_u4, 4, false, float, union xnn_f32_minmax_params, 
xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_minmax_ukernel__wasm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -#if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vrsubc_minmax_ukernel__hvx_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vrsubc_minmax_ukernel__hvx_u64, 64, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vrsubc_minmax_ukernel__hvx_u128, 128, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) - -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_minmax_ukernel__scalar_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_minmax_ukernel__scalar_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_minmax_ukernel__scalar_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_minmax_ukernel__scalar_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) - -#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_UKERNEL_WITH_PARAMS -#endif - -#ifdef XNN_DEFINED_UKERNEL -#undef XNN_DEFINED_UKERNEL -#undef XNN_UKERNEL -#endif diff --git a/src/f32-vbinary/f32-vrsubc.h b/src/f32-vbinary/f32-vrsubc.h index 25a2321cc9f..2fcda551f07 100644 --- a/src/f32-vbinary/f32-vrsubc.h +++ b/src/f32-vbinary/f32-vrsubc.h @@ -16,12 +16,44 @@ #endif +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrsubc_ukernel__neon_u4, 4, false, float, struct 
xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrsubc_ukernel__neon_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrsubc_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrsubc_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_ukernel__sse_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrsubc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrsubc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrsubc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrsubc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) 
XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_ukernel__wasmsimd_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_ukernel__wasm_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_ukernel__wasm_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_ukernel__wasm_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_ukernel__wasm_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vrsubc_ukernel__hvx_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vrsubc_ukernel__hvx_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vrsubc_ukernel__hvx_u128, 128, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) + XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) 
XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vbinary/f32-vsub-minmax.h b/src/f32-vbinary/f32-vsub-minmax.h deleted file mode 100644 index 2a08725a3b9..00000000000 --- a/src/f32-vbinary/f32-vsub-minmax.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#ifndef XNN_UKERNEL_WITH_PARAMS -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) -#define XNN_DEFINED_UKERNEL_WITH_PARAMS -#endif - -#ifndef XNN_UKERNEL -#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ - XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) -#define XNN_DEFINED_UKERNEL -#endif - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsub_minmax_ukernel__neon_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsub_minmax_ukernel__neon_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - -#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsub_minmax_ukernel__rvv_u4v, 4, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsub_minmax_ukernel__rvv_u8v, 8, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, 
xnn_f32_vsub_minmax_ukernel__sse_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_minmax_ukernel__sse_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsub_minmax_ukernel__avx_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsub_minmax_ukernel__avx_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsub_minmax_ukernel__avx512f_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsub_minmax_ukernel__avx512f_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_WASMSIMD || 
XNN_ARCH_WASMRELAXEDSIMD - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_minmax_ukernel__wasm_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_minmax_ukernel__wasm_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_minmax_ukernel__wasm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_minmax_ukernel__wasm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -#if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vsub_minmax_ukernel__hvx_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vsub_minmax_ukernel__hvx_u64, 64, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vsub_minmax_ukernel__hvx_u128, 128, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) - -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_minmax_ukernel__scalar_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_minmax_ukernel__scalar_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_minmax_ukernel__scalar_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_minmax_ukernel__scalar_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) - -#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef 
XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_UKERNEL_WITH_PARAMS -#endif - -#ifdef XNN_DEFINED_UKERNEL -#undef XNN_DEFINED_UKERNEL -#undef XNN_UKERNEL -#endif diff --git a/src/f32-vbinary/f32-vsub.h b/src/f32-vbinary/f32-vsub.h index d964c873800..5208f2d904d 100644 --- a/src/f32-vbinary/f32-vsub.h +++ b/src/f32-vbinary/f32-vsub.h @@ -16,12 +16,44 @@ #endif +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsub_ukernel__neon_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsub_ukernel__neon_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsub_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsub_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_ukernel__sse_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsub_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsub_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsub_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, 
((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsub_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_ukernel__wasmsimd_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_ukernel__wasm_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_ukernel__wasm_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_ukernel__wasm_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_ukernel__wasm_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vsub_ukernel__hvx_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vsub_ukernel__hvx_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vsub_ukernel__hvx_u128, 128, false, 
float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) + XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vbinary/f32-vsubc-minmax.h b/src/f32-vbinary/f32-vsubc-minmax.h deleted file mode 100644 index bf26ac265cc..00000000000 --- a/src/f32-vbinary/f32-vsubc-minmax.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#ifndef XNN_UKERNEL_WITH_PARAMS -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) -#define XNN_DEFINED_UKERNEL_WITH_PARAMS -#endif - -#ifndef XNN_UKERNEL -#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ - XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) -#define XNN_DEFINED_UKERNEL -#endif - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsubc_minmax_ukernel__neon_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsubc_minmax_ukernel__neon_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - -#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, 
xnn_f32_vsubc_minmax_ukernel__rvv_u4v, 4, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsubc_minmax_ukernel__rvv_u8v, 8, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_minmax_ukernel__sse_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_minmax_ukernel__sse_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsubc_minmax_ukernel__avx_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsubc_minmax_ukernel__avx_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsubc_minmax_ukernel__avx512f_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsubc_minmax_ukernel__avx512f_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_u4, 4, false, 
float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_minmax_ukernel__wasm_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_minmax_ukernel__wasm_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_minmax_ukernel__wasm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_minmax_ukernel__wasm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - -#if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vsubc_minmax_ukernel__hvx_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vsubc_minmax_ukernel__hvx_u64, 64, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vsubc_minmax_ukernel__hvx_u128, 128, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -#endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) - -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_minmax_ukernel__scalar_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_minmax_ukernel__scalar_u2, 2, false, float, union 
xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_minmax_ukernel__scalar_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_minmax_ukernel__scalar_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) - -#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_DEFINED_UKERNEL_WITH_PARAMS -#undef XNN_UKERNEL_WITH_PARAMS -#endif - -#ifdef XNN_DEFINED_UKERNEL -#undef XNN_DEFINED_UKERNEL -#undef XNN_UKERNEL -#endif diff --git a/src/f32-vbinary/f32-vsubc.h b/src/f32-vbinary/f32-vsubc.h index 84f286601b8..477f39a5edb 100644 --- a/src/f32-vbinary/f32-vsubc.h +++ b/src/f32-vbinary/f32-vsubc.h @@ -16,12 +16,44 @@ #endif +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsubc_ukernel__neon_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsubc_ukernel__neon_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsubc_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsubc_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_ukernel__sse_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, 
xnn_f32_vsubc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsubc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsubc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsubc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_ukernel__wasmsimd_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_ukernel__wasm_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_ukernel__wasm_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_ukernel__wasm_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_ukernel__wasm_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ENABLE_HVX 
&& (XNN_ARCH_HEXAGON) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vsubc_ukernel__hvx_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vsubc_ukernel__hvx_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vsubc_ukernel__hvx_u128, 128, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) + XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-avx-u16.c b/src/f32-vbinary/gen/f32-vadd-avx-u16.c similarity index 76% rename from src/f32-vbinary/gen/f32-vadd-minmax-avx-u16.c rename to src/f32-vbinary/gen/f32-vadd-avx-u16.c index f70b84a5a24..31b64b0cda1 100644 --- a/src/f32-vbinary/gen/f32-vadd-minmax-avx-u16.c +++ b/src/f32-vbinary/gen/f32-vadd-avx-u16.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vadd_minmax_ukernel__avx_u16( +void xnn_f32_vadd_ukernel__avx_u16( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -30,11 +30,6 @@ void xnn_f32_vadd_minmax_ukernel__avx_u16( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - 
const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m256 va0 = _mm256_loadu_ps(input_a); const __m256 va1 = _mm256_loadu_ps(input_a + 8); @@ -45,12 +40,6 @@ void xnn_f32_vadd_minmax_ukernel__avx_u16( input_b += 16; - vacc0 = _mm256_max_ps(voutput_min, vacc0); - vacc1 = _mm256_max_ps(voutput_min, vacc1); - - vacc0 = _mm256_min_ps(voutput_max, vacc0); - vacc1 = _mm256_min_ps(voutput_max, vacc1); - _mm256_storeu_ps(output, vacc0); _mm256_storeu_ps(output + 8, vacc1); output += 16; @@ -61,8 +50,6 @@ void xnn_f32_vadd_minmax_ukernel__avx_u16( __m256 vacc = _mm256_add_ps(va, _mm256_loadu_ps(input_b)); input_b += 8; - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); _mm256_storeu_ps(output, vacc); output += 8; } @@ -75,8 +62,6 @@ void xnn_f32_vadd_minmax_ukernel__avx_u16( const __m256 vb = _mm256_maskload_ps(input_b, vmask); __m256 vacc = _mm256_add_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); __m128 vacc_lo = _mm256_castps256_ps128(vacc); if (batch & (4 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-avx-u8.c b/src/f32-vbinary/gen/f32-vadd-avx-u8.c similarity index 78% rename from src/f32-vbinary/gen/f32-vadd-minmax-avx-u8.c rename to src/f32-vbinary/gen/f32-vadd-avx-u8.c index c8bb186b4f5..61a9c1a784c 100644 --- a/src/f32-vbinary/gen/f32-vadd-minmax-avx-u8.c +++ b/src/f32-vbinary/gen/f32-vadd-avx-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vadd_minmax_ukernel__avx_u8( +void xnn_f32_vadd_ukernel__avx_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); 
@@ -30,19 +30,12 @@ void xnn_f32_vadd_minmax_ukernel__avx_u8( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m256 va = _mm256_loadu_ps(input_a); input_a += 8; __m256 vacc = _mm256_add_ps(va, _mm256_loadu_ps(input_b)); input_b += 8; - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); _mm256_storeu_ps(output, vacc); output += 8; } @@ -55,8 +48,6 @@ void xnn_f32_vadd_minmax_ukernel__avx_u8( const __m256 vb = _mm256_maskload_ps(input_b, vmask); __m256 vacc = _mm256_add_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); __m128 vacc_lo = _mm256_castps256_ps128(vacc); if (batch & (4 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-avx512f-u16.c b/src/f32-vbinary/gen/f32-vadd-avx512f-u16.c similarity index 75% rename from src/f32-vbinary/gen/f32-vadd-minmax-avx512f-u16.c rename to src/f32-vbinary/gen/f32-vadd-avx512f-u16.c index 4b0777828f9..34c316b2b7e 100644 --- a/src/f32-vbinary/gen/f32-vadd-minmax-avx512f-u16.c +++ b/src/f32-vbinary/gen/f32-vadd-avx512f-u16.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vadd_minmax_ukernel__avx512f_u16( +void xnn_f32_vadd_ukernel__avx512f_u16( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,9 +29,6 @@ void xnn_f32_vadd_minmax_ukernel__avx512f_u16( assert(input_b != NULL); assert(output != NULL); - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const 
__m512 voutput_max = _mm512_set1_ps(params->scalar.max); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m512 va = _mm512_loadu_ps(input_a); @@ -41,9 +38,6 @@ void xnn_f32_vadd_minmax_ukernel__avx512f_u16( input_b += 16; - vacc = _mm512_max_ps(voutput_min, vacc); - vacc = _mm512_min_ps(voutput_max, vacc); - _mm512_storeu_ps(output, vacc); output += 16; } @@ -57,8 +51,6 @@ void xnn_f32_vadd_minmax_ukernel__avx512f_u16( const __m512 va = _mm512_maskz_loadu_ps(vmask, input_a); __m512 vacc = _mm512_maskz_add_ps(vmask, va, _mm512_maskz_loadu_ps(vmask, input_b)); - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-avx512f-u32.c b/src/f32-vbinary/gen/f32-vadd-avx512f-u32.c similarity index 74% rename from src/f32-vbinary/gen/f32-vadd-minmax-avx512f-u32.c rename to src/f32-vbinary/gen/f32-vadd-avx512f-u32.c index 675b782ee52..6084a04c150 100644 --- a/src/f32-vbinary/gen/f32-vadd-minmax-avx512f-u32.c +++ b/src/f32-vbinary/gen/f32-vadd-avx512f-u32.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vadd_minmax_ukernel__avx512f_u32( +void xnn_f32_vadd_ukernel__avx512f_u32( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,9 +29,6 @@ void xnn_f32_vadd_minmax_ukernel__avx512f_u32( assert(input_b != NULL); assert(output != NULL); - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { const __m512 va0 = _mm512_loadu_ps(input_a); @@ -43,12 +40,6 @@ void xnn_f32_vadd_minmax_ukernel__avx512f_u32( input_b += 32; - 
vacc0 = _mm512_max_ps(voutput_min, vacc0); - vacc1 = _mm512_max_ps(voutput_min, vacc1); - - vacc0 = _mm512_min_ps(voutput_max, vacc0); - vacc1 = _mm512_min_ps(voutput_max, vacc1); - _mm512_storeu_ps(output, vacc0); _mm512_storeu_ps(output + 16, vacc1); output += 32; @@ -61,9 +52,6 @@ void xnn_f32_vadd_minmax_ukernel__avx512f_u32( input_b += 16; - vacc = _mm512_max_ps(voutput_min, vacc); - vacc = _mm512_min_ps(voutput_max, vacc); - _mm512_storeu_ps(output, vacc); output += 16; } @@ -77,8 +65,6 @@ void xnn_f32_vadd_minmax_ukernel__avx512f_u32( const __m512 va = _mm512_maskz_loadu_ps(vmask, input_a); __m512 vacc = _mm512_maskz_add_ps(vmask, va, _mm512_maskz_loadu_ps(vmask, input_b)); - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-hvx-u128.c b/src/f32-vbinary/gen/f32-vadd-hvx-u128.c similarity index 69% rename from src/f32-vbinary/gen/f32-vadd-minmax-hvx-u128.c rename to src/f32-vbinary/gen/f32-vadd-hvx-u128.c index d74aa547b94..97f7677338b 100644 --- a/src/f32-vbinary/gen/f32-vadd-minmax-hvx-u128.c +++ b/src/f32-vbinary/gen/f32-vadd-hvx-u128.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vadd_minmax_ukernel__hvx_u128( +void xnn_f32_vadd_ukernel__hvx_u128( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,9 +23,6 @@ void xnn_f32_vadd_minmax_ukernel__hvx_u128( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); - for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) 
{ HVX_Vector va0 = xnn_loadu_f32(input_a); HVX_Vector va1 = xnn_loadu_f32(input_a + 32); @@ -44,16 +41,6 @@ void xnn_f32_vadd_minmax_ukernel__hvx_u128( HVX_Vector vacc3 = xnn_add_f32(va3, vb3); - vacc0 = xnn_max_f32(vacc0, voutput_min); - vacc1 = xnn_max_f32(vacc1, voutput_min); - vacc2 = xnn_max_f32(vacc2, voutput_min); - vacc3 = xnn_max_f32(vacc3, voutput_min); - - vacc0 = xnn_min_f32(vacc0, voutput_max); - vacc1 = xnn_min_f32(vacc1, voutput_max); - vacc2 = xnn_min_f32(vacc2, voutput_max); - vacc3 = xnn_min_f32(vacc3, voutput_max); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); xnn_storeu_f32(output + 64, vacc2); @@ -67,8 +54,6 @@ void xnn_f32_vadd_minmax_ukernel__hvx_u128( input_b += 32; HVX_Vector vacc = xnn_add_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output += 32; @@ -78,9 +63,7 @@ void xnn_f32_vadd_minmax_ukernel__hvx_u128( HVX_Vector vb = xnn_loadu_f32(input_b); HVX_Vector vacc = xnn_add_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); - + Q6_V_vstu_variable(output, batch, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-hvx-u32.c b/src/f32-vbinary/gen/f32-vadd-hvx-u32.c similarity index 68% rename from src/f32-vbinary/gen/f32-vadd-minmax-hvx-u32.c rename to src/f32-vbinary/gen/f32-vadd-hvx-u32.c index d693a434b2a..be9b262cc1b 100644 --- a/src/f32-vbinary/gen/f32-vadd-minmax-hvx-u32.c +++ b/src/f32-vbinary/gen/f32-vadd-hvx-u32.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vadd_minmax_ukernel__hvx_u32( +void xnn_f32_vadd_ukernel__hvx_u32( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,9 
+23,6 @@ void xnn_f32_vadd_minmax_ukernel__hvx_u32( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { HVX_Vector va = xnn_loadu_f32(input_a); HVX_Vector vb = xnn_loadu_f32(input_b); @@ -33,8 +30,6 @@ void xnn_f32_vadd_minmax_ukernel__hvx_u32( input_b += 32; HVX_Vector vacc = xnn_add_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output += 32; @@ -44,9 +39,7 @@ void xnn_f32_vadd_minmax_ukernel__hvx_u32( HVX_Vector vb = xnn_loadu_f32(input_b); HVX_Vector vacc = xnn_add_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); - + Q6_V_vstu_variable(output, batch, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-hvx-u64.c b/src/f32-vbinary/gen/f32-vadd-hvx-u64.c similarity index 69% rename from src/f32-vbinary/gen/f32-vadd-minmax-hvx-u64.c rename to src/f32-vbinary/gen/f32-vadd-hvx-u64.c index ee19bb1ebb6..b0ea11c2800 100644 --- a/src/f32-vbinary/gen/f32-vadd-minmax-hvx-u64.c +++ b/src/f32-vbinary/gen/f32-vadd-hvx-u64.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vadd_minmax_ukernel__hvx_u64( +void xnn_f32_vadd_ukernel__hvx_u64( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,9 +23,6 @@ void xnn_f32_vadd_minmax_ukernel__hvx_u64( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); - for (; batch >= 64 * 
sizeof(float); batch -= 64 * sizeof(float)) { HVX_Vector va0 = xnn_loadu_f32(input_a); HVX_Vector va1 = xnn_loadu_f32(input_a + 32); @@ -38,12 +35,6 @@ void xnn_f32_vadd_minmax_ukernel__hvx_u64( HVX_Vector vacc1 = xnn_add_f32(va1, vb1); - vacc0 = xnn_max_f32(vacc0, voutput_min); - vacc1 = xnn_max_f32(vacc1, voutput_min); - - vacc0 = xnn_min_f32(vacc0, voutput_max); - vacc1 = xnn_min_f32(vacc1, voutput_max); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); output += 64; @@ -55,8 +46,6 @@ void xnn_f32_vadd_minmax_ukernel__hvx_u64( input_b += 32; HVX_Vector vacc = xnn_add_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output += 32; @@ -66,9 +55,7 @@ void xnn_f32_vadd_minmax_ukernel__hvx_u64( HVX_Vector vb = xnn_loadu_f32(input_b); HVX_Vector vacc = xnn_add_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); - + Q6_V_vstu_variable(output, batch, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-wasm-u1.c b/src/f32-vbinary/gen/f32-vadd-minmax-wasm-u1.c deleted file mode 100644 index 1d8e0223f72..00000000000 --- a/src/f32-vbinary/gen/f32-vadd-minmax-wasm-u1.c +++ /dev/null @@ -1,41 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vadd_minmax_ukernel__wasm_u1( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - - for (; batch >= sizeof(float); batch -= sizeof(float)) { - const float va = *input_a++; - const float vb = *input_b++; - float vacc = va + vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - } -} diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-wasm-u2.c b/src/f32-vbinary/gen/f32-vadd-minmax-wasm-u2.c deleted file mode 100644 index 3bf1336ca61..00000000000 --- a/src/f32-vbinary/gen/f32-vadd-minmax-wasm-u2.c +++ /dev/null @@ -1,65 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vadd_minmax_ukernel__wasm_u2( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - input_a += 2; - - const float vb0 = input_b[0]; - const float vb1 = input_b[1]; - input_b += 2; - - float vacc0 = va0 + vb0; - float vacc1 = va1 + vb1; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output += 2; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch == sizeof(float)); - const float va = *input_a; - const float vb = *input_b; - float vacc = va + vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output = vacc; - } -} diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-wasm-u4.c b/src/f32-vbinary/gen/f32-vadd-minmax-wasm-u4.c deleted file mode 100644 index 0c7c6468294..00000000000 --- a/src/f32-vbinary/gen/f32-vadd-minmax-wasm-u4.c +++ /dev/null @@ -1,79 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vadd_minmax_ukernel__wasm_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - const float va2 = input_a[2]; - const float va3 = input_a[3]; - input_a += 4; - - const float vb0 = input_b[0]; - const float vb1 = input_b[1]; - const float vb2 = input_b[2]; - const float vb3 = input_b[3]; - input_b += 4; - - float vacc0 = va0 + vb0; - float vacc1 = va1 + vb1; - float vacc2 = va2 + vb2; - float vacc3 = va3 + vb3; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - vacc2 = __builtin_wasm_max_f32(vacc2, voutput_min); - vacc3 = __builtin_wasm_max_f32(vacc3, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - vacc2 = __builtin_wasm_min_f32(vacc2, voutput_max); - vacc3 = __builtin_wasm_min_f32(vacc3, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - do { - const float va = *input_a++; - const float vb = *input_b++; - float vacc = va + vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - batch -= sizeof(float); - } while (batch != 0); - } -} diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-wasm-u8.c b/src/f32-vbinary/gen/f32-vadd-minmax-wasm-u8.c deleted file mode 100644 index 
8dfb9869088..00000000000 --- a/src/f32-vbinary/gen/f32-vadd-minmax-wasm-u8.c +++ /dev/null @@ -1,103 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vadd_minmax_ukernel__wasm_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - const float va2 = input_a[2]; - const float va3 = input_a[3]; - const float va4 = input_a[4]; - const float va5 = input_a[5]; - const float va6 = input_a[6]; - const float va7 = input_a[7]; - input_a += 8; - - const float vb0 = input_b[0]; - const float vb1 = input_b[1]; - const float vb2 = input_b[2]; - const float vb3 = input_b[3]; - const float vb4 = input_b[4]; - const float vb5 = input_b[5]; - const float vb6 = input_b[6]; - const float vb7 = input_b[7]; - input_b += 8; - - float vacc0 = va0 + vb0; - float vacc1 = va1 + vb1; - float vacc2 = va2 + vb2; - float vacc3 = va3 + vb3; - float vacc4 = va4 + vb4; - float vacc5 = va5 + vb5; - float vacc6 = va6 + vb6; - float vacc7 = va7 + vb7; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - vacc2 = __builtin_wasm_max_f32(vacc2, voutput_min); - vacc3 = __builtin_wasm_max_f32(vacc3, voutput_min); - vacc4 = 
__builtin_wasm_max_f32(vacc4, voutput_min); - vacc5 = __builtin_wasm_max_f32(vacc5, voutput_min); - vacc6 = __builtin_wasm_max_f32(vacc6, voutput_min); - vacc7 = __builtin_wasm_max_f32(vacc7, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - vacc2 = __builtin_wasm_min_f32(vacc2, voutput_max); - vacc3 = __builtin_wasm_min_f32(vacc3, voutput_max); - vacc4 = __builtin_wasm_min_f32(vacc4, voutput_max); - vacc5 = __builtin_wasm_min_f32(vacc5, voutput_max); - vacc6 = __builtin_wasm_min_f32(vacc6, voutput_max); - vacc7 = __builtin_wasm_min_f32(vacc7, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output[4] = vacc4; - output[5] = vacc5; - output[6] = vacc6; - output[7] = vacc7; - output += 8; - } - if XNN_UNLIKELY(batch != 0) { - do { - const float va = *input_a++; - const float vb = *input_b++; - float vacc = va + vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - batch -= sizeof(float); - } while (batch != 0); - } -} diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u16.c b/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u16.c deleted file mode 100644 index 9bc0e965141..00000000000 --- a/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u16.c +++ /dev/null @@ -1,103 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - const v128_t va2 = wasm_v128_load(input_a + 8); - const v128_t va3 = wasm_v128_load(input_a + 12); - input_a += 16; - - const v128_t vb0 = wasm_v128_load(input_b); - const v128_t vb1 = wasm_v128_load(input_b + 4); - const v128_t vb2 = wasm_v128_load(input_b + 8); - const v128_t vb3 = wasm_v128_load(input_b + 12); - input_b += 16; - - v128_t vacc0 = wasm_f32x4_add(va0, vb0); - v128_t vacc1 = wasm_f32x4_add(va1, vb1); - v128_t vacc2 = wasm_f32x4_add(va2, vb2); - v128_t vacc3 = wasm_f32x4_add(va3, vb3); - - vacc0 = wasm_f32x4_max(vacc0, voutput_min); - vacc1 = wasm_f32x4_max(vacc1, voutput_min); - vacc2 = wasm_f32x4_max(vacc2, voutput_min); - vacc3 = wasm_f32x4_max(vacc3, voutput_min); - - vacc0 = wasm_f32x4_min(vacc0, voutput_max); - vacc1 = wasm_f32x4_min(vacc1, voutput_max); - vacc2 = wasm_f32x4_min(vacc2, voutput_max); - vacc3 = wasm_f32x4_min(vacc3, voutput_max); - - wasm_v128_store(output, vacc0); - wasm_v128_store(output + 4, vacc1); - wasm_v128_store(output + 8, vacc2); - wasm_v128_store(output + 12, vacc3); - output += 16; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = 
wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_add(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_add(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u4.c b/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u4.c deleted file mode 100644 index 168e2bf70c6..00000000000 --- a/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u4.c +++ /dev/null @@ -1,69 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_add(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_add(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u8.c b/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u8.c deleted file mode 100644 index 55d703b0084..00000000000 --- a/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-u8.c +++ /dev/null @@ -1,91 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - input_a += 8; - - const v128_t vb0 = wasm_v128_load(input_b); - const v128_t vb1 = wasm_v128_load(input_b + 4); - input_b += 8; - - v128_t vacc0 = wasm_f32x4_add(va0, vb0); - v128_t vacc1 = wasm_f32x4_add(va1, vb1); - - vacc0 = wasm_f32x4_max(vacc0, voutput_min); - vacc1 = wasm_f32x4_max(vacc1, voutput_min); - - vacc0 = wasm_f32x4_min(vacc0, voutput_max); - vacc1 = wasm_f32x4_min(vacc1, voutput_max); - - wasm_v128_store(output, vacc0); - wasm_v128_store(output + 4, vacc1); - output += 8; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_add(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const 
v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_add(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u16.c b/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u16.c deleted file mode 100644 index 94fb702d0ca..00000000000 --- a/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u16.c +++ /dev/null @@ -1,103 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - const v128_t va2 = wasm_v128_load(input_a + 8); - const v128_t va3 = wasm_v128_load(input_a + 12); - input_a += 16; - - const v128_t vb0 = wasm_v128_load(input_b); - const v128_t vb1 = wasm_v128_load(input_b + 4); - const v128_t vb2 = wasm_v128_load(input_b + 8); - const v128_t vb3 = wasm_v128_load(input_b + 12); - input_b += 16; - - v128_t vacc0 = wasm_f32x4_add(va0, vb0); - v128_t vacc1 = wasm_f32x4_add(va1, vb1); - v128_t vacc2 = wasm_f32x4_add(va2, vb2); - v128_t vacc3 = wasm_f32x4_add(va3, vb3); - - vacc0 = wasm_f32x4_pmax(voutput_min, vacc0); - vacc1 = wasm_f32x4_pmax(voutput_min, vacc1); - vacc2 = wasm_f32x4_pmax(voutput_min, vacc2); - vacc3 = wasm_f32x4_pmax(voutput_min, vacc3); - - vacc0 = wasm_f32x4_pmin(voutput_max, vacc0); - vacc1 = wasm_f32x4_pmin(voutput_max, vacc1); - vacc2 = wasm_f32x4_pmin(voutput_max, vacc2); - vacc3 = wasm_f32x4_pmin(voutput_max, vacc3); - - wasm_v128_store(output, vacc0); - wasm_v128_store(output + 4, vacc1); - wasm_v128_store(output + 8, vacc2); - wasm_v128_store(output + 12, vacc3); - output += 16; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = 
wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_add(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_add(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u4.c b/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u4.c deleted file mode 100644 index 0bdcb4e2b2b..00000000000 --- a/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u4.c +++ /dev/null @@ -1,69 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_add(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_add(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u8.c b/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u8.c deleted file mode 100644 index 82be241a56f..00000000000 --- a/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-u8.c +++ /dev/null @@ -1,91 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - input_a += 8; - - const v128_t vb0 = wasm_v128_load(input_b); - const v128_t vb1 = wasm_v128_load(input_b + 4); - input_b += 8; - - v128_t vacc0 = wasm_f32x4_add(va0, vb0); - v128_t vacc1 = wasm_f32x4_add(va1, vb1); - - vacc0 = wasm_f32x4_pmax(voutput_min, vacc0); - vacc1 = wasm_f32x4_pmax(voutput_min, vacc1); - - vacc0 = wasm_f32x4_pmin(voutput_max, vacc0); - vacc1 = wasm_f32x4_pmin(voutput_max, vacc1); - - wasm_v128_store(output, vacc0); - wasm_v128_store(output + 4, vacc1); - output += 8; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_add(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - 
const v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_add(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-neon-u4.c b/src/f32-vbinary/gen/f32-vadd-neon-u4.c similarity index 75% rename from src/f32-vbinary/gen/f32-vadd-minmax-neon-u4.c rename to src/f32-vbinary/gen/f32-vadd-neon-u4.c index 5b16eb468e7..b96d5e75e78 100644 --- a/src/f32-vbinary/gen/f32-vadd-minmax-neon-u4.c +++ b/src/f32-vbinary/gen/f32-vadd-neon-u4.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vadd_minmax_ukernel__neon_u4( +void xnn_f32_vadd_ukernel__neon_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,18 +28,12 @@ void xnn_f32_vadd_minmax_ukernel__neon_u4( assert(input_b != NULL); assert(output != NULL); - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; const float32x4_t vb = vld1q_f32(input_b); input_b += 4; float32x4_t vacc = vaddq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -48,9 +42,6 @@ void xnn_f32_vadd_minmax_ukernel__neon_u4( float32x4_t vacc = vaddq_f32(va, vb); - vacc = vmaxq_f32(vacc, 
voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-neon-u8.c b/src/f32-vbinary/gen/f32-vadd-neon-u8.c similarity index 74% rename from src/f32-vbinary/gen/f32-vadd-minmax-neon-u8.c rename to src/f32-vbinary/gen/f32-vadd-neon-u8.c index 4f8eff92a99..e053b6514af 100644 --- a/src/f32-vbinary/gen/f32-vadd-minmax-neon-u8.c +++ b/src/f32-vbinary/gen/f32-vadd-neon-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vadd_minmax_ukernel__neon_u8( +void xnn_f32_vadd_ukernel__neon_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,9 +28,6 @@ void xnn_f32_vadd_minmax_ukernel__neon_u8( assert(input_b != NULL); assert(output != NULL); - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb0 = vld1q_f32(input_b); input_b += 4; @@ -41,12 +38,6 @@ void xnn_f32_vadd_minmax_ukernel__neon_u8( float32x4_t vacc1 = vaddq_f32(va1, vb1); - vacc0 = vmaxq_f32(vacc0, voutput_min); - vacc1 = vmaxq_f32(vacc1, voutput_min); - - vacc0 = vminq_f32(vacc0, voutput_max); - vacc1 = vminq_f32(vacc1, voutput_max); - vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } @@ -56,9 +47,6 @@ void xnn_f32_vadd_minmax_ukernel__neon_u8( float32x4_t vacc = vaddq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { 
@@ -67,9 +55,6 @@ void xnn_f32_vadd_minmax_ukernel__neon_u8( float32x4_t vacc = vaddq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-rvv-u4v.c b/src/f32-vbinary/gen/f32-vadd-rvv-u4v.c similarity index 75% rename from src/f32-vbinary/gen/f32-vadd-minmax-rvv-u4v.c rename to src/f32-vbinary/gen/f32-vadd-rvv-u4v.c index 6c4e0bb8091..1305a128a45 100644 --- a/src/f32-vbinary/gen/f32-vadd-minmax-rvv-u4v.c +++ b/src/f32-vbinary/gen/f32-vadd-rvv-u4v.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vadd_minmax_ukernel__rvv_u4v( +void xnn_f32_vadd_ukernel__rvv_u4v( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vadd_minmax_ukernel__rvv_u4v( assert(input_b != NULL); assert(output != NULL); - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; size_t n = batch >> 2; do { @@ -40,8 +38,6 @@ void xnn_f32_vadd_minmax_ukernel__rvv_u4v( vfloat32m4_t vb = __riscv_vle32_v_f32m4(input_b, vl); input_b += vl; vfloat32m4_t vacc = __riscv_vfadd_vv_f32m4(va, vb, vl); - vacc = __riscv_vfmax_vf_f32m4(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m4(vacc, output_max, vl); __riscv_vse32_v_f32m4(output, vacc, vl); output += vl; } while (n > 0); diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-rvv-u8v.c b/src/f32-vbinary/gen/f32-vadd-rvv-u8v.c similarity index 75% rename from src/f32-vbinary/gen/f32-vadd-minmax-rvv-u8v.c rename to src/f32-vbinary/gen/f32-vadd-rvv-u8v.c index 62a2065b431..56b55805702 100644 --- a/src/f32-vbinary/gen/f32-vadd-minmax-rvv-u8v.c +++ 
b/src/f32-vbinary/gen/f32-vadd-rvv-u8v.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vadd_minmax_ukernel__rvv_u8v( +void xnn_f32_vadd_ukernel__rvv_u8v( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vadd_minmax_ukernel__rvv_u8v( assert(input_b != NULL); assert(output != NULL); - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; size_t n = batch >> 2; do { @@ -40,8 +38,6 @@ void xnn_f32_vadd_minmax_ukernel__rvv_u8v( vfloat32m8_t vb = __riscv_vle32_v_f32m8(input_b, vl); input_b += vl; vfloat32m8_t vacc = __riscv_vfadd_vv_f32m8(va, vb, vl); - vacc = __riscv_vfmax_vf_f32m8(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m8(vacc, output_max, vl); __riscv_vse32_v_f32m8(output, vacc, vl); output += vl; } while (n > 0); diff --git a/src/f32-vbinary/gen/f32-vadd-scalar-u1.c b/src/f32-vbinary/gen/f32-vadd-scalar-u1.c index 86fa44dab45..0d583cfaf12 100644 --- a/src/f32-vbinary/gen/f32-vadd-scalar-u1.c +++ b/src/f32-vbinary/gen/f32-vadd-scalar-u1.c @@ -27,7 +27,6 @@ void xnn_f32_vadd_ukernel__scalar_u1( assert(input_b != NULL); assert(output != NULL); - for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; const float vb = *input_b++; diff --git a/src/f32-vbinary/gen/f32-vadd-scalar-u2.c b/src/f32-vbinary/gen/f32-vadd-scalar-u2.c index a36f35a93fb..bac689cd702 100644 --- a/src/f32-vbinary/gen/f32-vadd-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vadd-scalar-u2.c @@ -27,7 +27,6 @@ void xnn_f32_vadd_ukernel__scalar_u2( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -41,7 +40,6 @@ void 
xnn_f32_vadd_ukernel__scalar_u2( float vacc1 = va1 + vb1; - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vadd-scalar-u4.c b/src/f32-vbinary/gen/f32-vadd-scalar-u4.c index 44bfc664f0a..571c1c33280 100644 --- a/src/f32-vbinary/gen/f32-vadd-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vadd-scalar-u4.c @@ -27,7 +27,6 @@ void xnn_f32_vadd_ukernel__scalar_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -47,7 +46,6 @@ void xnn_f32_vadd_ukernel__scalar_u4( float vacc3 = va3 + vb3; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vadd-scalar-u8.c b/src/f32-vbinary/gen/f32-vadd-scalar-u8.c index 2972b0cdf32..389a802be5e 100644 --- a/src/f32-vbinary/gen/f32-vadd-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vadd-scalar-u8.c @@ -27,7 +27,6 @@ void xnn_f32_vadd_ukernel__scalar_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -59,7 +58,6 @@ void xnn_f32_vadd_ukernel__scalar_u8( float vacc7 = va7 + vb7; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-sse-u4.c b/src/f32-vbinary/gen/f32-vadd-sse-u4.c similarity index 72% rename from src/f32-vbinary/gen/f32-vadd-minmax-sse-u4.c rename to src/f32-vbinary/gen/f32-vadd-sse-u4.c index ba7961056d9..4fbcbb373a5 100644 --- a/src/f32-vbinary/gen/f32-vadd-minmax-sse-u4.c +++ b/src/f32-vbinary/gen/f32-vadd-sse-u4.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vadd_minmax_ukernel__sse_u4( +void xnn_f32_vadd_ukernel__sse_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params 
params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,11 +29,6 @@ void xnn_f32_vadd_minmax_ukernel__sse_u4( assert(input_b != NULL); assert(output != NULL); - const __m128 voutput_min = _mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const __m128 va = _mm_loadu_ps(input_a); input_a += 4; @@ -42,8 +37,6 @@ void xnn_f32_vadd_minmax_ukernel__sse_u4( input_b += 4; __m128 vacc = _mm_add_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -53,8 +46,6 @@ void xnn_f32_vadd_minmax_ukernel__sse_u4( const __m128 vb = _mm_loadu_ps(input_b); __m128 vacc = _mm_add_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-sse-u8.c b/src/f32-vbinary/gen/f32-vadd-sse-u8.c similarity index 72% rename from src/f32-vbinary/gen/f32-vadd-minmax-sse-u8.c rename to src/f32-vbinary/gen/f32-vadd-sse-u8.c index cce4d1cc997..35d9e48e390 100644 --- a/src/f32-vbinary/gen/f32-vadd-minmax-sse-u8.c +++ b/src/f32-vbinary/gen/f32-vadd-sse-u8.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vadd_minmax_ukernel__sse_u8( +void xnn_f32_vadd_ukernel__sse_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,11 +29,6 @@ void xnn_f32_vadd_minmax_ukernel__sse_u8( assert(input_b != NULL); assert(output != NULL); - const __m128 voutput_min = 
_mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m128 va0 = _mm_loadu_ps(input_a); const __m128 va1 = _mm_loadu_ps(input_a + 4); @@ -47,12 +42,6 @@ void xnn_f32_vadd_minmax_ukernel__sse_u8( __m128 vacc1 = _mm_add_ps(va1, vb1); - vacc0 = _mm_max_ps(vacc0, voutput_min); - vacc1 = _mm_max_ps(vacc1, voutput_min); - - vacc0 = _mm_min_ps(vacc0, voutput_max); - vacc1 = _mm_min_ps(vacc1, voutput_max); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; @@ -65,8 +54,6 @@ void xnn_f32_vadd_minmax_ukernel__sse_u8( input_b += 4; __m128 vacc = _mm_add_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -76,8 +63,6 @@ void xnn_f32_vadd_minmax_ukernel__sse_u8( const __m128 vb = _mm_loadu_ps(input_b); __m128 vacc = _mm_add_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-scalar-u1.c b/src/f32-vbinary/gen/f32-vadd-wasm-u1.c similarity index 72% rename from src/f32-vbinary/gen/f32-vadd-minmax-scalar-u1.c rename to src/f32-vbinary/gen/f32-vadd-wasm-u1.c index 22130ec49af..2f75f193aba 100644 --- a/src/f32-vbinary/gen/f32-vadd-minmax-scalar-u1.c +++ b/src/f32-vbinary/gen/f32-vadd-wasm-u1.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vadd_minmax_ukernel__scalar_u1( +void xnn_f32_vadd_ukernel__wasm_u1( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,15 +27,10 @@ void 
xnn_f32_vadd_minmax_ukernel__scalar_u1( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; const float vb = *input_b++; float vacc = va + vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; } } diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-scalar-u2.c b/src/f32-vbinary/gen/f32-vadd-wasm-u2.c similarity index 70% rename from src/f32-vbinary/gen/f32-vadd-minmax-scalar-u2.c rename to src/f32-vbinary/gen/f32-vadd-wasm-u2.c index 76f740cdd7b..5415ccda7b0 100644 --- a/src/f32-vbinary/gen/f32-vadd-minmax-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vadd-wasm-u2.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vadd_minmax_ukernel__scalar_u2( +void xnn_f32_vadd_ukernel__wasm_u2( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,9 +27,6 @@ void xnn_f32_vadd_minmax_ukernel__scalar_u2( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -43,12 +40,6 @@ void xnn_f32_vadd_minmax_ukernel__scalar_u2( float vacc1 = va1 + vb1; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - output[0] = vacc0; output[1] = vacc1; output += 2; @@ -58,8 +49,6 @@ void xnn_f32_vadd_minmax_ukernel__scalar_u2( const float va = *input_a; const float vb = *input_b; float vacc = va 
+ vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output = vacc; } } diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-scalar-u4.c b/src/f32-vbinary/gen/f32-vadd-wasm-u4.c similarity index 68% rename from src/f32-vbinary/gen/f32-vadd-minmax-scalar-u4.c rename to src/f32-vbinary/gen/f32-vadd-wasm-u4.c index bb24549d3d0..f121a993b1f 100644 --- a/src/f32-vbinary/gen/f32-vadd-minmax-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vadd-wasm-u4.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vadd_minmax_ukernel__scalar_u4( +void xnn_f32_vadd_ukernel__wasm_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,9 +27,6 @@ void xnn_f32_vadd_minmax_ukernel__scalar_u4( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -49,16 +46,6 @@ void xnn_f32_vadd_minmax_ukernel__scalar_u4( float vacc3 = va3 + vb3; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - vacc2 = math_max_f32(vacc2, voutput_min); - vacc3 = math_max_f32(vacc3, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - vacc2 = math_min_f32(vacc2, voutput_max); - vacc3 = math_min_f32(vacc3, voutput_max); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; @@ -70,8 +57,6 @@ void xnn_f32_vadd_minmax_ukernel__scalar_u4( const float va = *input_a++; const float vb = *input_b++; float vacc = va + vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } 
while (batch != 0); diff --git a/src/f32-vbinary/gen/f32-vadd-minmax-scalar-u8.c b/src/f32-vbinary/gen/f32-vadd-wasm-u8.c similarity index 64% rename from src/f32-vbinary/gen/f32-vadd-minmax-scalar-u8.c rename to src/f32-vbinary/gen/f32-vadd-wasm-u8.c index b9322bbb6c1..27cecda27a3 100644 --- a/src/f32-vbinary/gen/f32-vadd-minmax-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vadd-wasm-u8.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vadd_minmax_ukernel__scalar_u8( +void xnn_f32_vadd_ukernel__wasm_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,9 +27,6 @@ void xnn_f32_vadd_minmax_ukernel__scalar_u8( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -61,24 +58,6 @@ void xnn_f32_vadd_minmax_ukernel__scalar_u8( float vacc7 = va7 + vb7; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - vacc2 = math_max_f32(vacc2, voutput_min); - vacc3 = math_max_f32(vacc3, voutput_min); - vacc4 = math_max_f32(vacc4, voutput_min); - vacc5 = math_max_f32(vacc5, voutput_min); - vacc6 = math_max_f32(vacc6, voutput_min); - vacc7 = math_max_f32(vacc7, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - vacc2 = math_min_f32(vacc2, voutput_max); - vacc3 = math_min_f32(vacc3, voutput_max); - vacc4 = math_min_f32(vacc4, voutput_max); - vacc5 = math_min_f32(vacc5, voutput_max); - vacc6 = math_min_f32(vacc6, voutput_max); - vacc7 = math_min_f32(vacc7, voutput_max); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; @@ -94,8 
+73,6 @@ void xnn_f32_vadd_minmax_ukernel__scalar_u8( const float va = *input_a++; const float vb = *input_b++; float vacc = va + vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } while (batch != 0); diff --git a/src/f32-vbinary/gen/f32-vadd-wasmsimd-u16.c b/src/f32-vbinary/gen/f32-vadd-wasmsimd-u16.c index c8ae5ad6fd9..be691158716 100644 --- a/src/f32-vbinary/gen/f32-vadd-wasmsimd-u16.c +++ b/src/f32-vbinary/gen/f32-vadd-wasmsimd-u16.c @@ -28,7 +28,6 @@ void xnn_f32_vadd_ukernel__wasmsimd_u16( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); @@ -63,7 +62,6 @@ void xnn_f32_vadd_ukernel__wasmsimd_u16( v128_t vacc = wasm_f32x4_add(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -73,7 +71,6 @@ void xnn_f32_vadd_ukernel__wasmsimd_u16( v128_t vacc = wasm_f32x4_add(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vadd-wasmsimd-u4.c b/src/f32-vbinary/gen/f32-vadd-wasmsimd-u4.c index 77dc51b3d1f..3b27ab1d53b 100644 --- a/src/f32-vbinary/gen/f32-vadd-wasmsimd-u4.c +++ b/src/f32-vbinary/gen/f32-vadd-wasmsimd-u4.c @@ -28,7 +28,6 @@ void xnn_f32_vadd_ukernel__wasmsimd_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; @@ -38,7 +37,6 @@ void xnn_f32_vadd_ukernel__wasmsimd_u4( v128_t vacc = wasm_f32x4_add(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -48,7 +46,6 @@ void xnn_f32_vadd_ukernel__wasmsimd_u4( v128_t vacc = wasm_f32x4_add(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git 
a/src/f32-vbinary/gen/f32-vadd-wasmsimd-u8.c b/src/f32-vbinary/gen/f32-vadd-wasmsimd-u8.c index 3a633190dbb..cf3e75122d3 100644 --- a/src/f32-vbinary/gen/f32-vadd-wasmsimd-u8.c +++ b/src/f32-vbinary/gen/f32-vadd-wasmsimd-u8.c @@ -28,7 +28,6 @@ void xnn_f32_vadd_ukernel__wasmsimd_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); @@ -55,7 +54,6 @@ void xnn_f32_vadd_ukernel__wasmsimd_u8( v128_t vacc = wasm_f32x4_add(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -65,7 +63,6 @@ void xnn_f32_vadd_ukernel__wasmsimd_u8( v128_t vacc = wasm_f32x4_add(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-avx-u8.c b/src/f32-vbinary/gen/f32-vaddc-avx-u16.c similarity index 77% rename from src/f32-vbinary/gen/f32-vaddc-minmax-avx-u8.c rename to src/f32-vbinary/gen/f32-vaddc-avx-u16.c index 1db5cae8075..8b8f4dcebd4 100644 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-avx-u8.c +++ b/src/f32-vbinary/gen/f32-vaddc-avx-u16.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vaddc_minmax_ukernel__avx_u8( +void xnn_f32_vaddc_ukernel__avx_u16( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -30,19 +30,26 @@ void xnn_f32_vaddc_minmax_ukernel__avx_u8( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const 
__m256 vb = _mm256_broadcast_ss(input_b); + for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { + const __m256 va0 = _mm256_loadu_ps(input_a); + const __m256 va1 = _mm256_loadu_ps(input_a + 8); + input_a += 16; + + __m256 vacc0 = _mm256_add_ps(va0, vb); + __m256 vacc1 = _mm256_add_ps(va1, vb); + + + _mm256_storeu_ps(output, vacc0); + _mm256_storeu_ps(output + 8, vacc1); + output += 16; + } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m256 va = _mm256_loadu_ps(input_a); input_a += 8; __m256 vacc = _mm256_add_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); _mm256_storeu_ps(output, vacc); output += 8; } @@ -54,8 +61,6 @@ void xnn_f32_vaddc_minmax_ukernel__avx_u8( __m256 va = _mm256_maskload_ps(input_a, vmask); __m256 vacc = _mm256_add_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); __m128 vacc_lo = _mm256_castps256_ps128(vacc); if (batch & (4 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/f32-vaddc-avx-u8.c b/src/f32-vbinary/gen/f32-vaddc-avx-u8.c new file mode 100644 index 00000000000..8bfb253ba57 --- /dev/null +++ b/src/f32-vbinary/gen/f32-vaddc-avx-u8.c @@ -0,0 +1,67 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-vbinary/vopc-avx.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/vbinary.h" + + +void xnn_f32_vaddc_ukernel__avx_u8( + size_t batch, + const float* input_a, + const float* input_b, + float* output, + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + + const __m256 vb = _mm256_broadcast_ss(input_b); + + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + const __m256 va = _mm256_loadu_ps(input_a); + input_a += 8; + + __m256 vacc = _mm256_add_ps(va, vb); + _mm256_storeu_ps(output, vacc); + output += 8; + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 7 * sizeof(float)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); + + __m256 va = _mm256_maskload_ps(input_a, vmask); + + __m256 vacc = _mm256_add_ps(va, vb); + + __m128 vacc_lo = _mm256_castps256_ps128(vacc); + if (batch & (4 * sizeof(float))) { + _mm_storeu_ps(output, vacc_lo); + vacc_lo = _mm256_extractf128_ps(vacc, 1); + output += 4; + } + if (batch & (2 * sizeof(float))) { + _mm_storel_pi((__m64*) output, vacc_lo); + vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo); + output += 2; + } + if (batch & (1 * sizeof(float))) { + _mm_store_ss(output, vacc_lo); + } + } +} diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-avx512f-u16.c b/src/f32-vbinary/gen/f32-vaddc-avx512f-u16.c similarity index 75% rename from src/f32-vbinary/gen/f32-vaddc-minmax-avx512f-u16.c rename to src/f32-vbinary/gen/f32-vaddc-avx512f-u16.c index 830e689182f..284dd676c9a 100644 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-avx512f-u16.c +++ b/src/f32-vbinary/gen/f32-vaddc-avx512f-u16.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE 
file in the root directory of this source tree. + #include #include @@ -16,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vaddc_minmax_ukernel__avx512f_u16( +void xnn_f32_vaddc_ukernel__avx512f_u16( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,8 +30,6 @@ void xnn_f32_vaddc_minmax_ukernel__avx512f_u16( assert(input_b != NULL); assert(output != NULL); - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); const __m512 vb = _mm512_set1_ps(*input_b); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { @@ -40,10 +39,6 @@ void xnn_f32_vaddc_minmax_ukernel__avx512f_u16( __m512 vacc0 = _mm512_add_ps(va0, vb); - vacc0 = _mm512_max_ps(voutput_min, vacc0); - - vacc0 = _mm512_min_ps(voutput_max, vacc0); - _mm512_storeu_ps(output, vacc0); output += 16; } @@ -57,8 +52,6 @@ void xnn_f32_vaddc_minmax_ukernel__avx512f_u16( __m512 va = _mm512_maskz_loadu_ps(vmask, input_a); __m512 vacc = _mm512_add_ps(va, vb); - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-avx512f-u32.c b/src/f32-vbinary/gen/f32-vaddc-avx512f-u32.c similarity index 73% rename from src/f32-vbinary/gen/f32-vaddc-minmax-avx512f-u32.c rename to src/f32-vbinary/gen/f32-vaddc-avx512f-u32.c index 3f950dab165..f5f629e5626 100644 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-avx512f-u32.c +++ b/src/f32-vbinary/gen/f32-vaddc-avx512f-u32.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
+ #include #include @@ -16,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vaddc_minmax_ukernel__avx512f_u32( +void xnn_f32_vaddc_ukernel__avx512f_u32( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,8 +30,6 @@ void xnn_f32_vaddc_minmax_ukernel__avx512f_u32( assert(input_b != NULL); assert(output != NULL); - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); const __m512 vb = _mm512_set1_ps(*input_b); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { @@ -42,12 +41,6 @@ void xnn_f32_vaddc_minmax_ukernel__avx512f_u32( __m512 vacc1 = _mm512_add_ps(va1, vb); - vacc0 = _mm512_max_ps(voutput_min, vacc0); - vacc1 = _mm512_max_ps(voutput_min, vacc1); - - vacc0 = _mm512_min_ps(voutput_max, vacc0); - vacc1 = _mm512_min_ps(voutput_max, vacc1); - _mm512_storeu_ps(output, vacc0); _mm512_storeu_ps(output + 16, vacc1); output += 32; @@ -58,9 +51,6 @@ void xnn_f32_vaddc_minmax_ukernel__avx512f_u32( __m512 vacc = _mm512_add_ps(va, vb); - vacc = _mm512_max_ps(voutput_min, vacc); - vacc = _mm512_min_ps(voutput_max, vacc); - _mm512_storeu_ps(output, vacc); output += 16; } @@ -74,8 +64,6 @@ void xnn_f32_vaddc_minmax_ukernel__avx512f_u32( __m512 va = _mm512_maskz_loadu_ps(vmask, input_a); __m512 vacc = _mm512_add_ps(va, vb); - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-hvx-u128.c b/src/f32-vbinary/gen/f32-vaddc-hvx-u128.c similarity index 65% rename from src/f32-vbinary/gen/f32-vaddc-minmax-hvx-u128.c rename to src/f32-vbinary/gen/f32-vaddc-hvx-u128.c index 46adc0f8adf..10269222c49 100644 
--- a/src/f32-vbinary/gen/f32-vaddc-minmax-hvx-u128.c +++ b/src/f32-vbinary/gen/f32-vaddc-hvx-u128.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vaddc_minmax_ukernel__hvx_u128( +void xnn_f32_vaddc_ukernel__hvx_u128( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,8 +23,6 @@ void xnn_f32_vaddc_minmax_ukernel__hvx_u128( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); HVX_Vector vb = xnn_set1_f32(*input_b); for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) { @@ -40,16 +38,6 @@ void xnn_f32_vaddc_minmax_ukernel__hvx_u128( HVX_Vector vacc3 = xnn_add_f32(va3, vb); - vacc0 = xnn_max_f32(vacc0, voutput_min); - vacc1 = xnn_max_f32(vacc1, voutput_min); - vacc2 = xnn_max_f32(vacc2, voutput_min); - vacc3 = xnn_max_f32(vacc3, voutput_min); - - vacc0 = xnn_min_f32(vacc0, voutput_max); - vacc1 = xnn_min_f32(vacc1, voutput_max); - vacc2 = xnn_min_f32(vacc2, voutput_max); - vacc3 = xnn_min_f32(vacc3, voutput_max); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); xnn_storeu_f32(output + 64, vacc2); @@ -61,8 +49,6 @@ void xnn_f32_vaddc_minmax_ukernel__hvx_u128( input_a += 32; HVX_Vector vacc = xnn_add_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output+= 32; @@ -71,8 +57,6 @@ void xnn_f32_vaddc_minmax_ukernel__hvx_u128( HVX_Vector va = xnn_loadu_f32(input_a); HVX_Vector vacc = xnn_add_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); Q6_V_vstu_variable(output, batch, vacc); } diff 
--git a/src/f32-vbinary/gen/f32-vaddc-minmax-hvx-u32.c b/src/f32-vbinary/gen/f32-vaddc-hvx-u32.c similarity index 67% rename from src/f32-vbinary/gen/f32-vaddc-minmax-hvx-u32.c rename to src/f32-vbinary/gen/f32-vaddc-hvx-u32.c index b09b9cd03fa..4191e7f25fb 100644 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-hvx-u32.c +++ b/src/f32-vbinary/gen/f32-vaddc-hvx-u32.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vaddc_minmax_ukernel__hvx_u32( +void xnn_f32_vaddc_ukernel__hvx_u32( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,8 +23,6 @@ void xnn_f32_vaddc_minmax_ukernel__hvx_u32( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); HVX_Vector vb = xnn_set1_f32(*input_b); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { @@ -32,8 +30,6 @@ void xnn_f32_vaddc_minmax_ukernel__hvx_u32( input_a += 32; HVX_Vector vacc = xnn_add_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output+= 32; @@ -42,8 +38,6 @@ void xnn_f32_vaddc_minmax_ukernel__hvx_u32( HVX_Vector va = xnn_loadu_f32(input_a); HVX_Vector vacc = xnn_add_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); Q6_V_vstu_variable(output, batch, vacc); } diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-hvx-u64.c b/src/f32-vbinary/gen/f32-vaddc-hvx-u64.c similarity index 67% rename from src/f32-vbinary/gen/f32-vaddc-minmax-hvx-u64.c rename to src/f32-vbinary/gen/f32-vaddc-hvx-u64.c index 73ed3aee476..08470187827 100644 --- 
a/src/f32-vbinary/gen/f32-vaddc-minmax-hvx-u64.c +++ b/src/f32-vbinary/gen/f32-vaddc-hvx-u64.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vaddc_minmax_ukernel__hvx_u64( +void xnn_f32_vaddc_ukernel__hvx_u64( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,8 +23,6 @@ void xnn_f32_vaddc_minmax_ukernel__hvx_u64( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); HVX_Vector vb = xnn_set1_f32(*input_b); for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { @@ -36,12 +34,6 @@ void xnn_f32_vaddc_minmax_ukernel__hvx_u64( HVX_Vector vacc1 = xnn_add_f32(va1, vb); - vacc0 = xnn_max_f32(vacc0, voutput_min); - vacc1 = xnn_max_f32(vacc1, voutput_min); - - vacc0 = xnn_min_f32(vacc0, voutput_max); - vacc1 = xnn_min_f32(vacc1, voutput_max); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); output += 64; @@ -51,8 +43,6 @@ void xnn_f32_vaddc_minmax_ukernel__hvx_u64( input_a += 32; HVX_Vector vacc = xnn_add_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output+= 32; @@ -61,8 +51,6 @@ void xnn_f32_vaddc_minmax_ukernel__hvx_u64( HVX_Vector va = xnn_loadu_f32(input_a); HVX_Vector vacc = xnn_add_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); Q6_V_vstu_variable(output, batch, vacc); } diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-avx-u16.c b/src/f32-vbinary/gen/f32-vaddc-minmax-avx-u16.c deleted file mode 100644 index 52789fd29d8..00000000000 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-avx-u16.c 
+++ /dev/null @@ -1,94 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-avx.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vaddc_minmax_ukernel__avx_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const __m256 vb = _mm256_broadcast_ss(input_b); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const __m256 va0 = _mm256_loadu_ps(input_a); - const __m256 va1 = _mm256_loadu_ps(input_a + 8); - input_a += 16; - - __m256 vacc0 = _mm256_add_ps(va0, vb); - __m256 vacc1 = _mm256_add_ps(va1, vb); - - - vacc0 = _mm256_max_ps(voutput_min, vacc0); - vacc1 = _mm256_max_ps(voutput_min, vacc1); - - vacc0 = _mm256_min_ps(voutput_max, vacc0); - vacc1 = _mm256_min_ps(voutput_max, vacc1); - - _mm256_storeu_ps(output, vacc0); - _mm256_storeu_ps(output + 8, vacc1); - output += 16; - } - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const __m256 va = _mm256_loadu_ps(input_a); - input_a += 8; - - __m256 vacc = _mm256_add_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); - _mm256_storeu_ps(output, vacc); - output += 8; - } - if XNN_UNLIKELY(batch != 0) { - 
assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); - - __m256 va = _mm256_maskload_ps(input_a, vmask); - - __m256 vacc = _mm256_add_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); - - __m128 vacc_lo = _mm256_castps256_ps128(vacc); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vacc_lo); - vacc_lo = _mm256_extractf128_ps(vacc, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vacc_lo); - vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vacc_lo); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u1.c b/src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u1.c deleted file mode 100644 index 0838dfaf4b3..00000000000 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u1.c +++ /dev/null @@ -1,41 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vaddc_minmax_ukernel__wasm_u1( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= sizeof(float); batch -= sizeof(float)) { - const float va = *input_a++; - float vacc = va + vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - } -} diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u2.c b/src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u2.c deleted file mode 100644 index 73e6f79d3ea..00000000000 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u2.c +++ /dev/null @@ -1,61 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vaddc_minmax_ukernel__wasm_u2( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - input_a += 2; - - float vacc0 = va0 + vb; - float vacc1 = va1 + vb; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output += 2; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch == sizeof(float)); - const float va = *input_a; - float vacc = va + vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output = vacc; - } -} diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u4.c b/src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u4.c deleted file mode 100644 index 6bcf0f844dd..00000000000 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u4.c +++ /dev/null @@ -1,73 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vaddc_minmax_ukernel__wasm_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - const float va2 = input_a[2]; - const float va3 = input_a[3]; - input_a += 4; - - float vacc0 = va0 + vb; - float vacc1 = va1 + vb; - float vacc2 = va2 + vb; - float vacc3 = va3 + vb; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - vacc2 = __builtin_wasm_max_f32(vacc2, voutput_min); - vacc3 = __builtin_wasm_max_f32(vacc3, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - vacc2 = __builtin_wasm_min_f32(vacc2, voutput_max); - vacc3 = __builtin_wasm_min_f32(vacc3, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - do { - const float va = *input_a++; - float vacc = va + vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - batch -= sizeof(float); - } while (batch != 0); - } -} diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u8.c b/src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u8.c deleted file mode 100644 index e13ac2a5f9e..00000000000 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-wasm-u8.c +++ /dev/null @@ -1,93 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vaddc_minmax_ukernel__wasm_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - const float va2 = input_a[2]; - const float va3 = input_a[3]; - const float va4 = input_a[4]; - const float va5 = input_a[5]; - const float va6 = input_a[6]; - const float va7 = input_a[7]; - input_a += 8; - - float vacc0 = va0 + vb; - float vacc1 = va1 + vb; - float vacc2 = va2 + vb; - float vacc3 = va3 + vb; - float vacc4 = va4 + vb; - float vacc5 = va5 + vb; - float vacc6 = va6 + vb; - float vacc7 = va7 + vb; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - vacc2 = __builtin_wasm_max_f32(vacc2, voutput_min); - vacc3 = __builtin_wasm_max_f32(vacc3, voutput_min); - vacc4 = __builtin_wasm_max_f32(vacc4, voutput_min); - vacc5 = __builtin_wasm_max_f32(vacc5, voutput_min); - vacc6 = __builtin_wasm_max_f32(vacc6, voutput_min); - vacc7 = __builtin_wasm_max_f32(vacc7, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - vacc2 = __builtin_wasm_min_f32(vacc2, voutput_max); - vacc3 = 
__builtin_wasm_min_f32(vacc3, voutput_max); - vacc4 = __builtin_wasm_min_f32(vacc4, voutput_max); - vacc5 = __builtin_wasm_min_f32(vacc5, voutput_max); - vacc6 = __builtin_wasm_min_f32(vacc6, voutput_max); - vacc7 = __builtin_wasm_min_f32(vacc7, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output[4] = vacc4; - output[5] = vacc5; - output[6] = vacc6; - output[7] = vacc7; - output += 8; - } - if XNN_UNLIKELY(batch != 0) { - do { - const float va = *input_a++; - float vacc = va + vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - batch -= sizeof(float); - } while (batch != 0); - } -} diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u16.c b/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u16.c deleted file mode 100644 index aea1773f1d3..00000000000 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u16.c +++ /dev/null @@ -1,95 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - const v128_t va2 = wasm_v128_load(input_a + 8); - const v128_t va3 = wasm_v128_load(input_a + 12); - input_a += 16; - - v128_t vy0 = wasm_f32x4_add(va0, vb); - v128_t vy1 = wasm_f32x4_add(va1, vb); - v128_t vy2 = wasm_f32x4_add(va2, vb); - v128_t vy3 = wasm_f32x4_add(va3, vb); - - - vy0 = wasm_f32x4_max(vy0, voutput_min); - vy1 = wasm_f32x4_max(vy1, voutput_min); - vy2 = wasm_f32x4_max(vy2, voutput_min); - vy3 = wasm_f32x4_max(vy3, voutput_min); - - vy0 = wasm_f32x4_min(vy0, voutput_max); - vy1 = wasm_f32x4_min(vy1, voutput_max); - vy2 = wasm_f32x4_min(vy2, voutput_max); - vy3 = wasm_f32x4_min(vy3, voutput_max); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - wasm_v128_store(output + 8, vy2); - wasm_v128_store(output + 12, vy3); - output += 16; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_add(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - wasm_v128_store(output, vy); - output += 4; - } - if 
XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_add(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u4.c b/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u4.c deleted file mode 100644 index aa3e2acaa17..00000000000 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u4.c +++ /dev/null @@ -1,66 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_add(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - wasm_v128_store(output, vy); - 
output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_add(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u8.c b/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u8.c deleted file mode 100644 index 3455c2a3572..00000000000 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-u8.c +++ /dev/null @@ -1,85 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - input_a += 8; - - v128_t vy0 = wasm_f32x4_add(va0, vb); - v128_t vy1 = wasm_f32x4_add(va1, vb); - - - vy0 = wasm_f32x4_max(vy0, voutput_min); - vy1 = wasm_f32x4_max(vy1, voutput_min); - - vy0 = wasm_f32x4_min(vy0, voutput_max); - vy1 = wasm_f32x4_min(vy1, voutput_max); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - output += 8; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_add(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_add(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git 
a/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u16.c b/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u16.c deleted file mode 100644 index 2b6b2acf824..00000000000 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u16.c +++ /dev/null @@ -1,95 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - const v128_t va2 = wasm_v128_load(input_a + 8); - const v128_t va3 = wasm_v128_load(input_a + 12); - input_a += 16; - - v128_t vy0 = wasm_f32x4_add(va0, vb); - v128_t vy1 = wasm_f32x4_add(va1, vb); - v128_t vy2 = wasm_f32x4_add(va2, vb); - v128_t vy3 = wasm_f32x4_add(va3, vb); - - - vy0 = wasm_f32x4_pmax(voutput_min, vy0); - vy1 = wasm_f32x4_pmax(voutput_min, vy1); - vy2 = wasm_f32x4_pmax(voutput_min, vy2); - vy3 = wasm_f32x4_pmax(voutput_min, vy3); - - vy0 = wasm_f32x4_pmin(voutput_max, vy0); - vy1 = wasm_f32x4_pmin(voutput_max, vy1); - vy2 = 
wasm_f32x4_pmin(voutput_max, vy2); - vy3 = wasm_f32x4_pmin(voutput_max, vy3); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - wasm_v128_store(output + 8, vy2); - wasm_v128_store(output + 12, vy3); - output += 16; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_add(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_add(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u4.c b/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u4.c deleted file mode 100644 index 92895d77ccc..00000000000 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u4.c +++ /dev/null @@ -1,66 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_add(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_add(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u8.c b/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u8.c deleted file mode 100644 index d851bdd3d60..00000000000 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-u8.c +++ /dev/null @@ -1,85 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - input_a += 8; - - v128_t vy0 = wasm_f32x4_add(va0, vb); - v128_t vy1 = wasm_f32x4_add(va1, vb); - - - vy0 = wasm_f32x4_pmax(voutput_min, vy0); - vy1 = wasm_f32x4_pmax(voutput_min, vy1); - - vy0 = wasm_f32x4_pmin(voutput_max, vy0); - vy1 = wasm_f32x4_pmin(voutput_max, vy1); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - output += 8; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_add(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_add(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = 
wasm_f32x4_pmin(voutput_max, vy); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-neon-u4.c b/src/f32-vbinary/gen/f32-vaddc-neon-u4.c similarity index 74% rename from src/f32-vbinary/gen/f32-vaddc-minmax-neon-u4.c rename to src/f32-vbinary/gen/f32-vaddc-neon-u4.c index 41ec67e5d10..193c28b6464 100644 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-neon-u4.c +++ b/src/f32-vbinary/gen/f32-vaddc-neon-u4.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vaddc_minmax_ukernel__neon_u4( +void xnn_f32_vaddc_ukernel__neon_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vaddc_minmax_ukernel__neon_u4( assert(input_b != NULL); assert(output != NULL); - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { @@ -37,9 +35,6 @@ void xnn_f32_vaddc_minmax_ukernel__neon_u4( float32x4_t vacc = vaddq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -47,9 +42,6 @@ void xnn_f32_vaddc_minmax_ukernel__neon_u4( float32x4_t vacc = vaddq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git 
a/src/f32-vbinary/gen/f32-vaddc-minmax-neon-u8.c b/src/f32-vbinary/gen/f32-vaddc-neon-u8.c similarity index 72% rename from src/f32-vbinary/gen/f32-vaddc-minmax-neon-u8.c rename to src/f32-vbinary/gen/f32-vaddc-neon-u8.c index a0431371c83..6396ba24232 100644 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-neon-u8.c +++ b/src/f32-vbinary/gen/f32-vaddc-neon-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vaddc_minmax_ukernel__neon_u8( +void xnn_f32_vaddc_ukernel__neon_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vaddc_minmax_ukernel__neon_u8( assert(input_b != NULL); assert(output != NULL); - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -40,12 +38,6 @@ void xnn_f32_vaddc_minmax_ukernel__neon_u8( float32x4_t vacc1 = vaddq_f32(va1, vb); - vacc0 = vmaxq_f32(vacc0, voutput_min); - vacc1 = vmaxq_f32(vacc1, voutput_min); - - vacc0 = vminq_f32(vacc0, voutput_max); - vacc1 = vminq_f32(vacc1, voutput_max); - vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } @@ -54,9 +46,6 @@ void xnn_f32_vaddc_minmax_ukernel__neon_u8( float32x4_t vacc = vaddq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -64,9 +53,6 @@ void xnn_f32_vaddc_minmax_ukernel__neon_u8( float32x4_t vacc = vaddq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * 
sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-rvv-u4v.c b/src/f32-vbinary/gen/f32-vaddc-rvv-u4v.c similarity index 74% rename from src/f32-vbinary/gen/f32-vaddc-minmax-rvv-u4v.c rename to src/f32-vbinary/gen/f32-vaddc-rvv-u4v.c index 54ce560915e..7433b7a3222 100644 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-rvv-u4v.c +++ b/src/f32-vbinary/gen/f32-vaddc-rvv-u4v.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vaddc_minmax_ukernel__rvv_u4v( +void xnn_f32_vaddc_ukernel__rvv_u4v( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vaddc_minmax_ukernel__rvv_u4v( assert(input_b != NULL); assert(output != NULL); - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; const float b = *input_b; size_t n = batch >> 2; @@ -39,8 +37,6 @@ void xnn_f32_vaddc_minmax_ukernel__rvv_u4v( vfloat32m4_t va = __riscv_vle32_v_f32m4(input_a, vl); input_a += vl; vfloat32m4_t vacc = __riscv_vfadd_vf_f32m4(va, b, vl); - vacc = __riscv_vfmax_vf_f32m4(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m4(vacc, output_max, vl); __riscv_vse32_v_f32m4(output, vacc, vl); output += vl; } while (n > 0); diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-rvv-u8v.c b/src/f32-vbinary/gen/f32-vaddc-rvv-u8v.c similarity index 74% rename from src/f32-vbinary/gen/f32-vaddc-minmax-rvv-u8v.c rename to src/f32-vbinary/gen/f32-vaddc-rvv-u8v.c index 834ab61741b..7bbe9408e93 100644 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-rvv-u8v.c +++ b/src/f32-vbinary/gen/f32-vaddc-rvv-u8v.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vaddc_minmax_ukernel__rvv_u8v( +void xnn_f32_vaddc_ukernel__rvv_u8v( size_t batch, const float* input_a, 
const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vaddc_minmax_ukernel__rvv_u8v( assert(input_b != NULL); assert(output != NULL); - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; const float b = *input_b; size_t n = batch >> 2; @@ -39,8 +37,6 @@ void xnn_f32_vaddc_minmax_ukernel__rvv_u8v( vfloat32m8_t va = __riscv_vle32_v_f32m8(input_a, vl); input_a += vl; vfloat32m8_t vacc = __riscv_vfadd_vf_f32m8(va, b, vl); - vacc = __riscv_vfmax_vf_f32m8(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m8(vacc, output_max, vl); __riscv_vse32_v_f32m8(output, vacc, vl); output += vl; } while (n > 0); diff --git a/src/f32-vbinary/gen/f32-vaddc-scalar-u2.c b/src/f32-vbinary/gen/f32-vaddc-scalar-u2.c index ff0e5fd8a10..c8a5de373c0 100644 --- a/src/f32-vbinary/gen/f32-vaddc-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vaddc-scalar-u2.c @@ -38,7 +38,6 @@ void xnn_f32_vaddc_ukernel__scalar_u2( float vacc1 = va1 + vb; - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vaddc-scalar-u4.c b/src/f32-vbinary/gen/f32-vaddc-scalar-u4.c index a313c855a54..6f80c8d3ff3 100644 --- a/src/f32-vbinary/gen/f32-vaddc-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vaddc-scalar-u4.c @@ -42,7 +42,6 @@ void xnn_f32_vaddc_ukernel__scalar_u4( float vacc3 = va3 + vb; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vaddc-scalar-u8.c b/src/f32-vbinary/gen/f32-vaddc-scalar-u8.c index 110e16d19f3..b89c8921b24 100644 --- a/src/f32-vbinary/gen/f32-vaddc-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vaddc-scalar-u8.c @@ -50,7 +50,6 @@ void xnn_f32_vaddc_ukernel__scalar_u8( float vacc7 = va7 + vb; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git 
a/src/f32-vbinary/gen/f32-vaddc-minmax-sse-u4.c b/src/f32-vbinary/gen/f32-vaddc-sse-u4.c similarity index 71% rename from src/f32-vbinary/gen/f32-vaddc-minmax-sse-u4.c rename to src/f32-vbinary/gen/f32-vaddc-sse-u4.c index 32c92325ddb..5e724f40f2d 100644 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-sse-u4.c +++ b/src/f32-vbinary/gen/f32-vaddc-sse-u4.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vaddc_minmax_ukernel__sse_u4( +void xnn_f32_vaddc_ukernel__sse_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,10 +29,6 @@ void xnn_f32_vaddc_minmax_ukernel__sse_u4( assert(input_b != NULL); assert(output != NULL); - const __m128 voutput_min = _mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const __m128 vb = _mm_load1_ps(input_b); for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { @@ -40,8 +36,6 @@ void xnn_f32_vaddc_minmax_ukernel__sse_u4( input_a += 4; __m128 vacc = _mm_add_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -50,8 +44,6 @@ void xnn_f32_vaddc_minmax_ukernel__sse_u4( const __m128 va = _mm_loadu_ps(input_a); __m128 vacc = _mm_add_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); vacc = _mm_movehl_ps(vacc, vacc); diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-sse-u8.c b/src/f32-vbinary/gen/f32-vaddc-sse-u8.c similarity index 70% rename from src/f32-vbinary/gen/f32-vaddc-minmax-sse-u8.c rename to src/f32-vbinary/gen/f32-vaddc-sse-u8.c index 
ff5bf57a0b0..57db4ad6fc8 100644 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-sse-u8.c +++ b/src/f32-vbinary/gen/f32-vaddc-sse-u8.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vaddc_minmax_ukernel__sse_u8( +void xnn_f32_vaddc_ukernel__sse_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,10 +29,6 @@ void xnn_f32_vaddc_minmax_ukernel__sse_u8( assert(input_b != NULL); assert(output != NULL); - const __m128 voutput_min = _mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const __m128 vb = _mm_load1_ps(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -44,12 +40,6 @@ void xnn_f32_vaddc_minmax_ukernel__sse_u8( __m128 vacc1 = _mm_add_ps(va1, vb); - vacc0 = _mm_max_ps(vacc0, voutput_min); - vacc1 = _mm_max_ps(vacc1, voutput_min); - - vacc0 = _mm_min_ps(vacc0, voutput_max); - vacc1 = _mm_min_ps(vacc1, voutput_max); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; @@ -59,8 +49,6 @@ void xnn_f32_vaddc_minmax_ukernel__sse_u8( input_a += 4; __m128 vacc = _mm_add_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -69,8 +57,6 @@ void xnn_f32_vaddc_minmax_ukernel__sse_u8( const __m128 va = _mm_loadu_ps(input_a); __m128 vacc = _mm_add_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); vacc = _mm_movehl_ps(vacc, vacc); diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u1.c b/src/f32-vbinary/gen/f32-vaddc-wasm-u1.c 
similarity index 72% rename from src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u1.c rename to src/f32-vbinary/gen/f32-vaddc-wasm-u1.c index 8d1207e0a15..eed65258275 100644 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u1.c +++ b/src/f32-vbinary/gen/f32-vaddc-wasm-u1.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vaddc_minmax_ukernel__scalar_u1( +void xnn_f32_vaddc_ukernel__wasm_u1( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,15 +27,11 @@ void xnn_f32_vaddc_minmax_ukernel__scalar_u1( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; float vacc = va + vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; } } diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u2.c b/src/f32-vbinary/gen/f32-vaddc-wasm-u2.c similarity index 68% rename from src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u2.c rename to src/f32-vbinary/gen/f32-vaddc-wasm-u2.c index 7d427f18b39..4f784e9525b 100644 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vaddc-wasm-u2.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vaddc_minmax_ukernel__scalar_u2( +void xnn_f32_vaddc_ukernel__wasm_u2( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,8 +27,6 @@ void xnn_f32_vaddc_minmax_ukernel__scalar_u2( 
assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { @@ -40,12 +38,6 @@ void xnn_f32_vaddc_minmax_ukernel__scalar_u2( float vacc1 = va1 + vb; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - output[0] = vacc0; output[1] = vacc1; output += 2; @@ -54,8 +46,6 @@ void xnn_f32_vaddc_minmax_ukernel__scalar_u2( assert(batch == sizeof(float)); const float va = *input_a; float vacc = va + vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output = vacc; } } diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u4.c b/src/f32-vbinary/gen/f32-vaddc-wasm-u4.c similarity index 65% rename from src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u4.c rename to src/f32-vbinary/gen/f32-vaddc-wasm-u4.c index 8887ad9472f..1ff8da7e26f 100644 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vaddc-wasm-u4.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vaddc_minmax_ukernel__scalar_u4( +void xnn_f32_vaddc_ukernel__wasm_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,8 +27,6 @@ void xnn_f32_vaddc_minmax_ukernel__scalar_u4( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { @@ -44,16 +42,6 @@ void xnn_f32_vaddc_minmax_ukernel__scalar_u4( float vacc3 = va3 + vb; - vacc0 = 
math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - vacc2 = math_max_f32(vacc2, voutput_min); - vacc3 = math_max_f32(vacc3, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - vacc2 = math_min_f32(vacc2, voutput_max); - vacc3 = math_min_f32(vacc3, voutput_max); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; @@ -64,8 +52,6 @@ void xnn_f32_vaddc_minmax_ukernel__scalar_u4( do { const float va = *input_a++; float vacc = va + vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } while (batch != 0); diff --git a/src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u8.c b/src/f32-vbinary/gen/f32-vaddc-wasm-u8.c similarity index 60% rename from src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u8.c rename to src/f32-vbinary/gen/f32-vaddc-wasm-u8.c index 24cc9169ca3..04b324a2f24 100644 --- a/src/f32-vbinary/gen/f32-vaddc-minmax-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vaddc-wasm-u8.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vaddc_minmax_ukernel__scalar_u8( +void xnn_f32_vaddc_ukernel__wasm_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,8 +27,6 @@ void xnn_f32_vaddc_minmax_ukernel__scalar_u8( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -52,24 +50,6 @@ void xnn_f32_vaddc_minmax_ukernel__scalar_u8( float vacc7 = va7 + vb; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - vacc2 = math_max_f32(vacc2, voutput_min); - vacc3 = 
math_max_f32(vacc3, voutput_min); - vacc4 = math_max_f32(vacc4, voutput_min); - vacc5 = math_max_f32(vacc5, voutput_min); - vacc6 = math_max_f32(vacc6, voutput_min); - vacc7 = math_max_f32(vacc7, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - vacc2 = math_min_f32(vacc2, voutput_max); - vacc3 = math_min_f32(vacc3, voutput_max); - vacc4 = math_min_f32(vacc4, voutput_max); - vacc5 = math_min_f32(vacc5, voutput_max); - vacc6 = math_min_f32(vacc6, voutput_max); - vacc7 = math_min_f32(vacc7, voutput_max); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; @@ -84,8 +64,6 @@ void xnn_f32_vaddc_minmax_ukernel__scalar_u8( do { const float va = *input_a++; float vacc = va + vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } while (batch != 0); diff --git a/src/f32-vbinary/gen/f32-vaddc-wasmsimd-u16.c b/src/f32-vbinary/gen/f32-vaddc-wasmsimd-u16.c index 78b6b3e6c88..99794ed4aea 100644 --- a/src/f32-vbinary/gen/f32-vaddc-wasmsimd-u16.c +++ b/src/f32-vbinary/gen/f32-vaddc-wasmsimd-u16.c @@ -43,7 +43,6 @@ void xnn_f32_vaddc_ukernel__wasmsimd_u16( v128_t vy3 = wasm_f32x4_add(va3, vb); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); @@ -56,7 +55,6 @@ void xnn_f32_vaddc_ukernel__wasmsimd_u16( v128_t vy = wasm_f32x4_add(va, vb); - wasm_v128_store(output, vy); output += 4; } @@ -65,7 +63,6 @@ void xnn_f32_vaddc_ukernel__wasmsimd_u16( v128_t vy = wasm_f32x4_add(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vaddc-wasmsimd-u4.c b/src/f32-vbinary/gen/f32-vaddc-wasmsimd-u4.c index 54a20840a27..1d5fb904018 100644 --- a/src/f32-vbinary/gen/f32-vaddc-wasmsimd-u4.c +++ b/src/f32-vbinary/gen/f32-vaddc-wasmsimd-u4.c @@ -36,7 +36,6 @@ void xnn_f32_vaddc_ukernel__wasmsimd_u4( v128_t vy = 
wasm_f32x4_add(va, vb); - wasm_v128_store(output, vy); output += 4; } @@ -45,7 +44,6 @@ void xnn_f32_vaddc_ukernel__wasmsimd_u4( v128_t vy = wasm_f32x4_add(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vaddc-wasmsimd-u8.c b/src/f32-vbinary/gen/f32-vaddc-wasmsimd-u8.c index 375e61bee4e..0ce546ad3e3 100644 --- a/src/f32-vbinary/gen/f32-vaddc-wasmsimd-u8.c +++ b/src/f32-vbinary/gen/f32-vaddc-wasmsimd-u8.c @@ -39,7 +39,6 @@ void xnn_f32_vaddc_ukernel__wasmsimd_u8( v128_t vy1 = wasm_f32x4_add(va1, vb); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); output += 8; @@ -50,7 +49,6 @@ void xnn_f32_vaddc_ukernel__wasmsimd_u8( v128_t vy = wasm_f32x4_add(va, vb); - wasm_v128_store(output, vy); output += 4; } @@ -59,7 +57,6 @@ void xnn_f32_vaddc_ukernel__wasmsimd_u8( v128_t vy = wasm_f32x4_add(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-aarch64-neon-u4.c b/src/f32-vbinary/gen/f32-vdiv-aarch64-neon-u4.c similarity index 74% rename from src/f32-vbinary/gen/f32-vdiv-minmax-aarch64-neon-u4.c rename to src/f32-vbinary/gen/f32-vdiv-aarch64-neon-u4.c index 5b8472a5326..a6ba9a7c08e 100644 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-aarch64-neon-u4.c +++ b/src/f32-vbinary/gen/f32-vdiv-aarch64-neon-u4.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdiv_minmax_ukernel__aarch64_neon_u4( +void xnn_f32_vdiv_ukernel__aarch64_neon_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,18 +28,12 @@ void xnn_f32_vdiv_minmax_ukernel__aarch64_neon_u4( assert(input_b 
!= NULL); assert(output != NULL); - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; const float32x4_t vb = vld1q_f32(input_b); input_b += 4; float32x4_t vacc = vdivq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -48,9 +42,6 @@ void xnn_f32_vdiv_minmax_ukernel__aarch64_neon_u4( float32x4_t vacc = vdivq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-aarch64-neon-u8.c b/src/f32-vbinary/gen/f32-vdiv-aarch64-neon-u8.c similarity index 74% rename from src/f32-vbinary/gen/f32-vdiv-minmax-aarch64-neon-u8.c rename to src/f32-vbinary/gen/f32-vdiv-aarch64-neon-u8.c index ffe567c1972..b3fadc24318 100644 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-aarch64-neon-u8.c +++ b/src/f32-vbinary/gen/f32-vdiv-aarch64-neon-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdiv_minmax_ukernel__aarch64_neon_u8( +void xnn_f32_vdiv_ukernel__aarch64_neon_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,9 +28,6 @@ void xnn_f32_vdiv_minmax_ukernel__aarch64_neon_u8( assert(input_b != NULL); assert(output != NULL); - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); - for (; batch >= 8 * sizeof(float); batch 
-= 8 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb0 = vld1q_f32(input_b); input_b += 4; @@ -41,12 +38,6 @@ void xnn_f32_vdiv_minmax_ukernel__aarch64_neon_u8( float32x4_t vacc1 = vdivq_f32(va1, vb1); - vacc0 = vmaxq_f32(vacc0, voutput_min); - vacc1 = vmaxq_f32(vacc1, voutput_min); - - vacc0 = vminq_f32(vacc0, voutput_max); - vacc1 = vminq_f32(vacc1, voutput_max); - vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } @@ -56,9 +47,6 @@ void xnn_f32_vdiv_minmax_ukernel__aarch64_neon_u8( float32x4_t vacc = vdivq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -67,9 +55,6 @@ void xnn_f32_vdiv_minmax_ukernel__aarch64_neon_u8( float32x4_t vacc = vdivq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-avx-u16.c b/src/f32-vbinary/gen/f32-vdiv-avx-u16.c similarity index 76% rename from src/f32-vbinary/gen/f32-vdiv-minmax-avx-u16.c rename to src/f32-vbinary/gen/f32-vdiv-avx-u16.c index 45a21f24928..37cdbba6849 100644 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-avx-u16.c +++ b/src/f32-vbinary/gen/f32-vdiv-avx-u16.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdiv_minmax_ukernel__avx_u16( +void xnn_f32_vdiv_ukernel__avx_u16( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -30,11 +30,6 @@ void xnn_f32_vdiv_minmax_ukernel__avx_u16( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - const __m256 
voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m256 va0 = _mm256_loadu_ps(input_a); const __m256 va1 = _mm256_loadu_ps(input_a + 8); @@ -45,12 +40,6 @@ void xnn_f32_vdiv_minmax_ukernel__avx_u16( input_b += 16; - vacc0 = _mm256_max_ps(voutput_min, vacc0); - vacc1 = _mm256_max_ps(voutput_min, vacc1); - - vacc0 = _mm256_min_ps(voutput_max, vacc0); - vacc1 = _mm256_min_ps(voutput_max, vacc1); - _mm256_storeu_ps(output, vacc0); _mm256_storeu_ps(output + 8, vacc1); output += 16; @@ -61,8 +50,6 @@ void xnn_f32_vdiv_minmax_ukernel__avx_u16( __m256 vacc = _mm256_div_ps(va, _mm256_loadu_ps(input_b)); input_b += 8; - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); _mm256_storeu_ps(output, vacc); output += 8; } @@ -75,8 +62,6 @@ void xnn_f32_vdiv_minmax_ukernel__avx_u16( const __m256 vb = _mm256_maskload_ps(input_b, vmask); __m256 vacc = _mm256_div_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); __m128 vacc_lo = _mm256_castps256_ps128(vacc); if (batch & (4 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-avx-u8.c b/src/f32-vbinary/gen/f32-vdiv-avx-u8.c similarity index 78% rename from src/f32-vbinary/gen/f32-vdiv-minmax-avx-u8.c rename to src/f32-vbinary/gen/f32-vdiv-avx-u8.c index 0a796487b1a..403b07dd0fe 100644 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-avx-u8.c +++ b/src/f32-vbinary/gen/f32-vdiv-avx-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdiv_minmax_ukernel__avx_u8( +void xnn_f32_vdiv_ukernel__avx_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { 
assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -30,19 +30,12 @@ void xnn_f32_vdiv_minmax_ukernel__avx_u8( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m256 va = _mm256_loadu_ps(input_a); input_a += 8; __m256 vacc = _mm256_div_ps(va, _mm256_loadu_ps(input_b)); input_b += 8; - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); _mm256_storeu_ps(output, vacc); output += 8; } @@ -55,8 +48,6 @@ void xnn_f32_vdiv_minmax_ukernel__avx_u8( const __m256 vb = _mm256_maskload_ps(input_b, vmask); __m256 vacc = _mm256_div_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); __m128 vacc_lo = _mm256_castps256_ps128(vacc); if (batch & (4 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-avx512f-u16.c b/src/f32-vbinary/gen/f32-vdiv-avx512f-u16.c similarity index 75% rename from src/f32-vbinary/gen/f32-vdiv-minmax-avx512f-u16.c rename to src/f32-vbinary/gen/f32-vdiv-avx512f-u16.c index 885f1d4b09a..cb166d48837 100644 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-avx512f-u16.c +++ b/src/f32-vbinary/gen/f32-vdiv-avx512f-u16.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdiv_minmax_ukernel__avx512f_u16( +void xnn_f32_vdiv_ukernel__avx512f_u16( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,9 +29,6 @@ void xnn_f32_vdiv_minmax_ukernel__avx512f_u16( assert(input_b != NULL); assert(output != NULL); - const __m512 
voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m512 va = _mm512_loadu_ps(input_a); @@ -41,9 +38,6 @@ void xnn_f32_vdiv_minmax_ukernel__avx512f_u16( input_b += 16; - vacc = _mm512_max_ps(voutput_min, vacc); - vacc = _mm512_min_ps(voutput_max, vacc); - _mm512_storeu_ps(output, vacc); output += 16; } @@ -57,8 +51,6 @@ void xnn_f32_vdiv_minmax_ukernel__avx512f_u16( const __m512 va = _mm512_maskz_loadu_ps(vmask, input_a); __m512 vacc = _mm512_maskz_div_ps(vmask, va, _mm512_maskz_loadu_ps(vmask, input_b)); - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-avx512f-u32.c b/src/f32-vbinary/gen/f32-vdiv-avx512f-u32.c similarity index 74% rename from src/f32-vbinary/gen/f32-vdiv-minmax-avx512f-u32.c rename to src/f32-vbinary/gen/f32-vdiv-avx512f-u32.c index d68a043721a..aa1fff31c8e 100644 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-avx512f-u32.c +++ b/src/f32-vbinary/gen/f32-vdiv-avx512f-u32.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdiv_minmax_ukernel__avx512f_u32( +void xnn_f32_vdiv_ukernel__avx512f_u32( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,9 +29,6 @@ void xnn_f32_vdiv_minmax_ukernel__avx512f_u32( assert(input_b != NULL); assert(output != NULL); - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { const __m512 va0 = _mm512_loadu_ps(input_a); @@ -43,12 +40,6 @@ void 
xnn_f32_vdiv_minmax_ukernel__avx512f_u32( input_b += 32; - vacc0 = _mm512_max_ps(voutput_min, vacc0); - vacc1 = _mm512_max_ps(voutput_min, vacc1); - - vacc0 = _mm512_min_ps(voutput_max, vacc0); - vacc1 = _mm512_min_ps(voutput_max, vacc1); - _mm512_storeu_ps(output, vacc0); _mm512_storeu_ps(output + 16, vacc1); output += 32; @@ -61,9 +52,6 @@ void xnn_f32_vdiv_minmax_ukernel__avx512f_u32( input_b += 16; - vacc = _mm512_max_ps(voutput_min, vacc); - vacc = _mm512_min_ps(voutput_max, vacc); - _mm512_storeu_ps(output, vacc); output += 16; } @@ -77,8 +65,6 @@ void xnn_f32_vdiv_minmax_ukernel__avx512f_u32( const __m512 va = _mm512_maskz_loadu_ps(vmask, input_a); __m512 vacc = _mm512_maskz_div_ps(vmask, va, _mm512_maskz_loadu_ps(vmask, input_b)); - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u1.c b/src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u1.c deleted file mode 100644 index bd18d9c3205..00000000000 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u1.c +++ /dev/null @@ -1,41 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdiv_minmax_ukernel__wasm_u1( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - - for (; batch >= sizeof(float); batch -= sizeof(float)) { - const float va = *input_a++; - const float vb = *input_b++; - float vacc = va / vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - } -} diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u2.c b/src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u2.c deleted file mode 100644 index 559b42c8c6d..00000000000 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u2.c +++ /dev/null @@ -1,65 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdiv_minmax_ukernel__wasm_u2( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - input_a += 2; - - const float vb0 = input_b[0]; - const float vb1 = input_b[1]; - input_b += 2; - - float vacc0 = va0 / vb0; - float vacc1 = va1 / vb1; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output += 2; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch == sizeof(float)); - const float va = *input_a; - const float vb = *input_b; - float vacc = va / vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output = vacc; - } -} diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u4.c b/src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u4.c deleted file mode 100644 index ee1bec230d0..00000000000 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u4.c +++ /dev/null @@ -1,79 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdiv_minmax_ukernel__wasm_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - const float va2 = input_a[2]; - const float va3 = input_a[3]; - input_a += 4; - - const float vb0 = input_b[0]; - const float vb1 = input_b[1]; - const float vb2 = input_b[2]; - const float vb3 = input_b[3]; - input_b += 4; - - float vacc0 = va0 / vb0; - float vacc1 = va1 / vb1; - float vacc2 = va2 / vb2; - float vacc3 = va3 / vb3; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - vacc2 = __builtin_wasm_max_f32(vacc2, voutput_min); - vacc3 = __builtin_wasm_max_f32(vacc3, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - vacc2 = __builtin_wasm_min_f32(vacc2, voutput_max); - vacc3 = __builtin_wasm_min_f32(vacc3, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - do { - const float va = *input_a++; - const float vb = *input_b++; - float vacc = va / vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - batch -= sizeof(float); - } while (batch != 0); - } -} diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u8.c b/src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u8.c deleted file mode 100644 index 
e29d793c5df..00000000000 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-wasm-u8.c +++ /dev/null @@ -1,103 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdiv_minmax_ukernel__wasm_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - const float va2 = input_a[2]; - const float va3 = input_a[3]; - const float va4 = input_a[4]; - const float va5 = input_a[5]; - const float va6 = input_a[6]; - const float va7 = input_a[7]; - input_a += 8; - - const float vb0 = input_b[0]; - const float vb1 = input_b[1]; - const float vb2 = input_b[2]; - const float vb3 = input_b[3]; - const float vb4 = input_b[4]; - const float vb5 = input_b[5]; - const float vb6 = input_b[6]; - const float vb7 = input_b[7]; - input_b += 8; - - float vacc0 = va0 / vb0; - float vacc1 = va1 / vb1; - float vacc2 = va2 / vb2; - float vacc3 = va3 / vb3; - float vacc4 = va4 / vb4; - float vacc5 = va5 / vb5; - float vacc6 = va6 / vb6; - float vacc7 = va7 / vb7; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - vacc2 = __builtin_wasm_max_f32(vacc2, voutput_min); - vacc3 = __builtin_wasm_max_f32(vacc3, voutput_min); - vacc4 = 
__builtin_wasm_max_f32(vacc4, voutput_min); - vacc5 = __builtin_wasm_max_f32(vacc5, voutput_min); - vacc6 = __builtin_wasm_max_f32(vacc6, voutput_min); - vacc7 = __builtin_wasm_max_f32(vacc7, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - vacc2 = __builtin_wasm_min_f32(vacc2, voutput_max); - vacc3 = __builtin_wasm_min_f32(vacc3, voutput_max); - vacc4 = __builtin_wasm_min_f32(vacc4, voutput_max); - vacc5 = __builtin_wasm_min_f32(vacc5, voutput_max); - vacc6 = __builtin_wasm_min_f32(vacc6, voutput_max); - vacc7 = __builtin_wasm_min_f32(vacc7, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output[4] = vacc4; - output[5] = vacc5; - output[6] = vacc6; - output[7] = vacc7; - output += 8; - } - if XNN_UNLIKELY(batch != 0) { - do { - const float va = *input_a++; - const float vb = *input_b++; - float vacc = va / vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - batch -= sizeof(float); - } while (batch != 0); - } -} diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u16.c b/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u16.c deleted file mode 100644 index 17aa75efbb1..00000000000 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u16.c +++ /dev/null @@ -1,103 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - const v128_t va2 = wasm_v128_load(input_a + 8); - const v128_t va3 = wasm_v128_load(input_a + 12); - input_a += 16; - - const v128_t vb0 = wasm_v128_load(input_b); - const v128_t vb1 = wasm_v128_load(input_b + 4); - const v128_t vb2 = wasm_v128_load(input_b + 8); - const v128_t vb3 = wasm_v128_load(input_b + 12); - input_b += 16; - - v128_t vacc0 = wasm_f32x4_div(va0, vb0); - v128_t vacc1 = wasm_f32x4_div(va1, vb1); - v128_t vacc2 = wasm_f32x4_div(va2, vb2); - v128_t vacc3 = wasm_f32x4_div(va3, vb3); - - vacc0 = wasm_f32x4_max(vacc0, voutput_min); - vacc1 = wasm_f32x4_max(vacc1, voutput_min); - vacc2 = wasm_f32x4_max(vacc2, voutput_min); - vacc3 = wasm_f32x4_max(vacc3, voutput_min); - - vacc0 = wasm_f32x4_min(vacc0, voutput_max); - vacc1 = wasm_f32x4_min(vacc1, voutput_max); - vacc2 = wasm_f32x4_min(vacc2, voutput_max); - vacc3 = wasm_f32x4_min(vacc3, voutput_max); - - wasm_v128_store(output, vacc0); - wasm_v128_store(output + 4, vacc1); - wasm_v128_store(output + 8, vacc2); - wasm_v128_store(output + 12, vacc3); - output += 16; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = 
wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_div(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_div(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u4.c b/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u4.c deleted file mode 100644 index 6e7eebbc661..00000000000 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u4.c +++ /dev/null @@ -1,69 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_div(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_div(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u8.c b/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u8.c deleted file mode 100644 index f3fc28a6602..00000000000 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-u8.c +++ /dev/null @@ -1,91 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - input_a += 8; - - const v128_t vb0 = wasm_v128_load(input_b); - const v128_t vb1 = wasm_v128_load(input_b + 4); - input_b += 8; - - v128_t vacc0 = wasm_f32x4_div(va0, vb0); - v128_t vacc1 = wasm_f32x4_div(va1, vb1); - - vacc0 = wasm_f32x4_max(vacc0, voutput_min); - vacc1 = wasm_f32x4_max(vacc1, voutput_min); - - vacc0 = wasm_f32x4_min(vacc0, voutput_max); - vacc1 = wasm_f32x4_min(vacc1, voutput_max); - - wasm_v128_store(output, vacc0); - wasm_v128_store(output + 4, vacc1); - output += 8; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_div(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const 
v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_div(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u16.c b/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u16.c deleted file mode 100644 index f8e4783b4a2..00000000000 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u16.c +++ /dev/null @@ -1,103 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - const v128_t va2 = wasm_v128_load(input_a + 8); - const v128_t va3 = wasm_v128_load(input_a + 12); - input_a += 16; - - const v128_t vb0 = wasm_v128_load(input_b); - const v128_t vb1 = wasm_v128_load(input_b + 4); - const v128_t vb2 = wasm_v128_load(input_b + 8); - const v128_t vb3 = wasm_v128_load(input_b + 12); - input_b += 16; - - v128_t vacc0 = wasm_f32x4_div(va0, vb0); - v128_t vacc1 = wasm_f32x4_div(va1, vb1); - v128_t vacc2 = wasm_f32x4_div(va2, vb2); - v128_t vacc3 = wasm_f32x4_div(va3, vb3); - - vacc0 = wasm_f32x4_pmax(voutput_min, vacc0); - vacc1 = wasm_f32x4_pmax(voutput_min, vacc1); - vacc2 = wasm_f32x4_pmax(voutput_min, vacc2); - vacc3 = wasm_f32x4_pmax(voutput_min, vacc3); - - vacc0 = wasm_f32x4_pmin(voutput_max, vacc0); - vacc1 = wasm_f32x4_pmin(voutput_max, vacc1); - vacc2 = wasm_f32x4_pmin(voutput_max, vacc2); - vacc3 = wasm_f32x4_pmin(voutput_max, vacc3); - - wasm_v128_store(output, vacc0); - wasm_v128_store(output + 4, vacc1); - wasm_v128_store(output + 8, vacc2); - wasm_v128_store(output + 12, vacc3); - output += 16; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = 
wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_div(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_div(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u4.c b/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u4.c deleted file mode 100644 index a04a67ff689..00000000000 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u4.c +++ /dev/null @@ -1,69 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_div(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_div(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u8.c b/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u8.c deleted file mode 100644 index 45037193e4a..00000000000 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-u8.c +++ /dev/null @@ -1,91 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - input_a += 8; - - const v128_t vb0 = wasm_v128_load(input_b); - const v128_t vb1 = wasm_v128_load(input_b + 4); - input_b += 8; - - v128_t vacc0 = wasm_f32x4_div(va0, vb0); - v128_t vacc1 = wasm_f32x4_div(va1, vb1); - - vacc0 = wasm_f32x4_pmax(voutput_min, vacc0); - vacc1 = wasm_f32x4_pmax(voutput_min, vacc1); - - vacc0 = wasm_f32x4_pmin(voutput_max, vacc0); - vacc1 = wasm_f32x4_pmin(voutput_max, vacc1); - - wasm_v128_store(output, vacc0); - wasm_v128_store(output + 4, vacc1); - output += 8; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_div(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - 
const v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_div(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-rvv-u4v.c b/src/f32-vbinary/gen/f32-vdiv-rvv-u4v.c similarity index 75% rename from src/f32-vbinary/gen/f32-vdiv-minmax-rvv-u4v.c rename to src/f32-vbinary/gen/f32-vdiv-rvv-u4v.c index d089759f4ea..eb36a499c90 100644 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-rvv-u4v.c +++ b/src/f32-vbinary/gen/f32-vdiv-rvv-u4v.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdiv_minmax_ukernel__rvv_u4v( +void xnn_f32_vdiv_ukernel__rvv_u4v( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vdiv_minmax_ukernel__rvv_u4v( assert(input_b != NULL); assert(output != NULL); - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; size_t n = batch >> 2; do { @@ -40,8 +38,6 @@ void xnn_f32_vdiv_minmax_ukernel__rvv_u4v( vfloat32m4_t vb = __riscv_vle32_v_f32m4(input_b, vl); input_b += vl; vfloat32m4_t vacc = __riscv_vfdiv_vv_f32m4(va, vb, vl); - vacc = __riscv_vfmax_vf_f32m4(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m4(vacc, output_max, vl); __riscv_vse32_v_f32m4(output, vacc, vl); output += vl; } while (n > 0); diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-rvv-u8v.c b/src/f32-vbinary/gen/f32-vdiv-rvv-u8v.c similarity index 75% rename from 
src/f32-vbinary/gen/f32-vdiv-minmax-rvv-u8v.c rename to src/f32-vbinary/gen/f32-vdiv-rvv-u8v.c index ae68d1b7b71..3cf6edceef4 100644 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-rvv-u8v.c +++ b/src/f32-vbinary/gen/f32-vdiv-rvv-u8v.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdiv_minmax_ukernel__rvv_u8v( +void xnn_f32_vdiv_ukernel__rvv_u8v( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vdiv_minmax_ukernel__rvv_u8v( assert(input_b != NULL); assert(output != NULL); - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; size_t n = batch >> 2; do { @@ -40,8 +38,6 @@ void xnn_f32_vdiv_minmax_ukernel__rvv_u8v( vfloat32m8_t vb = __riscv_vle32_v_f32m8(input_b, vl); input_b += vl; vfloat32m8_t vacc = __riscv_vfdiv_vv_f32m8(va, vb, vl); - vacc = __riscv_vfmax_vf_f32m8(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m8(vacc, output_max, vl); __riscv_vse32_v_f32m8(output, vacc, vl); output += vl; } while (n > 0); diff --git a/src/f32-vbinary/gen/f32-vdiv-scalar-u1.c b/src/f32-vbinary/gen/f32-vdiv-scalar-u1.c index 4713672b842..7a2f1e45be4 100644 --- a/src/f32-vbinary/gen/f32-vdiv-scalar-u1.c +++ b/src/f32-vbinary/gen/f32-vdiv-scalar-u1.c @@ -27,7 +27,6 @@ void xnn_f32_vdiv_ukernel__scalar_u1( assert(input_b != NULL); assert(output != NULL); - for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; const float vb = *input_b++; diff --git a/src/f32-vbinary/gen/f32-vdiv-scalar-u2.c b/src/f32-vbinary/gen/f32-vdiv-scalar-u2.c index 1b17662b5c5..0a56950c96f 100644 --- a/src/f32-vbinary/gen/f32-vdiv-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vdiv-scalar-u2.c @@ -27,7 +27,6 @@ void xnn_f32_vdiv_ukernel__scalar_u2( assert(input_b != 
NULL); assert(output != NULL); - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -41,7 +40,6 @@ void xnn_f32_vdiv_ukernel__scalar_u2( float vacc1 = va1 / vb1; - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vdiv-scalar-u4.c b/src/f32-vbinary/gen/f32-vdiv-scalar-u4.c index ca54c95c6e8..d5490aa1ebe 100644 --- a/src/f32-vbinary/gen/f32-vdiv-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vdiv-scalar-u4.c @@ -27,7 +27,6 @@ void xnn_f32_vdiv_ukernel__scalar_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -47,7 +46,6 @@ void xnn_f32_vdiv_ukernel__scalar_u4( float vacc3 = va3 / vb3; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vdiv-scalar-u8.c b/src/f32-vbinary/gen/f32-vdiv-scalar-u8.c index 24607cb5801..80efe4bcd20 100644 --- a/src/f32-vbinary/gen/f32-vdiv-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vdiv-scalar-u8.c @@ -27,7 +27,6 @@ void xnn_f32_vdiv_ukernel__scalar_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -59,7 +58,6 @@ void xnn_f32_vdiv_ukernel__scalar_u8( float vacc7 = va7 / vb7; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-sse-u4.c b/src/f32-vbinary/gen/f32-vdiv-sse-u4.c similarity index 72% rename from src/f32-vbinary/gen/f32-vdiv-minmax-sse-u4.c rename to src/f32-vbinary/gen/f32-vdiv-sse-u4.c index 33c77d60454..187571ab8d5 100644 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-sse-u4.c +++ b/src/f32-vbinary/gen/f32-vdiv-sse-u4.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdiv_minmax_ukernel__sse_u4( +void xnn_f32_vdiv_ukernel__sse_u4( size_t batch, const 
float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,11 +29,6 @@ void xnn_f32_vdiv_minmax_ukernel__sse_u4( assert(input_b != NULL); assert(output != NULL); - const __m128 voutput_min = _mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const __m128 va = _mm_loadu_ps(input_a); input_a += 4; @@ -42,8 +37,6 @@ void xnn_f32_vdiv_minmax_ukernel__sse_u4( input_b += 4; __m128 vacc = _mm_div_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -53,8 +46,6 @@ void xnn_f32_vdiv_minmax_ukernel__sse_u4( const __m128 vb = _mm_loadu_ps(input_b); __m128 vacc = _mm_div_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-sse-u8.c b/src/f32-vbinary/gen/f32-vdiv-sse-u8.c similarity index 72% rename from src/f32-vbinary/gen/f32-vdiv-minmax-sse-u8.c rename to src/f32-vbinary/gen/f32-vdiv-sse-u8.c index efe4486d90f..b519bd2e80c 100644 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-sse-u8.c +++ b/src/f32-vbinary/gen/f32-vdiv-sse-u8.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdiv_minmax_ukernel__sse_u8( +void xnn_f32_vdiv_ukernel__sse_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); 
assert(batch % sizeof(float) == 0); @@ -29,11 +29,6 @@ void xnn_f32_vdiv_minmax_ukernel__sse_u8( assert(input_b != NULL); assert(output != NULL); - const __m128 voutput_min = _mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m128 va0 = _mm_loadu_ps(input_a); const __m128 va1 = _mm_loadu_ps(input_a + 4); @@ -47,12 +42,6 @@ void xnn_f32_vdiv_minmax_ukernel__sse_u8( __m128 vacc1 = _mm_div_ps(va1, vb1); - vacc0 = _mm_max_ps(vacc0, voutput_min); - vacc1 = _mm_max_ps(vacc1, voutput_min); - - vacc0 = _mm_min_ps(vacc0, voutput_max); - vacc1 = _mm_min_ps(vacc1, voutput_max); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; @@ -65,8 +54,6 @@ void xnn_f32_vdiv_minmax_ukernel__sse_u8( input_b += 4; __m128 vacc = _mm_div_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -76,8 +63,6 @@ void xnn_f32_vdiv_minmax_ukernel__sse_u8( const __m128 vb = _mm_loadu_ps(input_b); __m128 vacc = _mm_div_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u1.c b/src/f32-vbinary/gen/f32-vdiv-wasm-u1.c similarity index 72% rename from src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u1.c rename to src/f32-vbinary/gen/f32-vdiv-wasm-u1.c index 2c2b72feeb8..a9aeef7accf 100644 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u1.c +++ b/src/f32-vbinary/gen/f32-vdiv-wasm-u1.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdiv_minmax_ukernel__scalar_u1( +void xnn_f32_vdiv_ukernel__wasm_u1( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict 
XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,15 +27,10 @@ void xnn_f32_vdiv_minmax_ukernel__scalar_u1( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; const float vb = *input_b++; float vacc = va / vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; } } diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u2.c b/src/f32-vbinary/gen/f32-vdiv-wasm-u2.c similarity index 70% rename from src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u2.c rename to src/f32-vbinary/gen/f32-vdiv-wasm-u2.c index b28ccc33907..75458dbe437 100644 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vdiv-wasm-u2.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdiv_minmax_ukernel__scalar_u2( +void xnn_f32_vdiv_ukernel__wasm_u2( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,9 +27,6 @@ void xnn_f32_vdiv_minmax_ukernel__scalar_u2( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -43,12 +40,6 @@ void xnn_f32_vdiv_minmax_ukernel__scalar_u2( float vacc1 = va1 / vb1; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - 
output[0] = vacc0; output[1] = vacc1; output += 2; @@ -58,8 +49,6 @@ void xnn_f32_vdiv_minmax_ukernel__scalar_u2( const float va = *input_a; const float vb = *input_b; float vacc = va / vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output = vacc; } } diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u4.c b/src/f32-vbinary/gen/f32-vdiv-wasm-u4.c similarity index 68% rename from src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u4.c rename to src/f32-vbinary/gen/f32-vdiv-wasm-u4.c index 2aebd44c129..53d33165205 100644 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vdiv-wasm-u4.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdiv_minmax_ukernel__scalar_u4( +void xnn_f32_vdiv_ukernel__wasm_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,9 +27,6 @@ void xnn_f32_vdiv_minmax_ukernel__scalar_u4( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -49,16 +46,6 @@ void xnn_f32_vdiv_minmax_ukernel__scalar_u4( float vacc3 = va3 / vb3; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - vacc2 = math_max_f32(vacc2, voutput_min); - vacc3 = math_max_f32(vacc3, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - vacc2 = math_min_f32(vacc2, voutput_max); - vacc3 = math_min_f32(vacc3, voutput_max); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; @@ -70,8 +57,6 @@ void xnn_f32_vdiv_minmax_ukernel__scalar_u4( const float va = 
*input_a++; const float vb = *input_b++; float vacc = va / vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } while (batch != 0); diff --git a/src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u8.c b/src/f32-vbinary/gen/f32-vdiv-wasm-u8.c similarity index 64% rename from src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u8.c rename to src/f32-vbinary/gen/f32-vdiv-wasm-u8.c index 3b24d815a88..54fcc95733f 100644 --- a/src/f32-vbinary/gen/f32-vdiv-minmax-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vdiv-wasm-u8.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdiv_minmax_ukernel__scalar_u8( +void xnn_f32_vdiv_ukernel__wasm_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,9 +27,6 @@ void xnn_f32_vdiv_minmax_ukernel__scalar_u8( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -61,24 +58,6 @@ void xnn_f32_vdiv_minmax_ukernel__scalar_u8( float vacc7 = va7 / vb7; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - vacc2 = math_max_f32(vacc2, voutput_min); - vacc3 = math_max_f32(vacc3, voutput_min); - vacc4 = math_max_f32(vacc4, voutput_min); - vacc5 = math_max_f32(vacc5, voutput_min); - vacc6 = math_max_f32(vacc6, voutput_min); - vacc7 = math_max_f32(vacc7, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - vacc2 = math_min_f32(vacc2, voutput_max); - vacc3 = math_min_f32(vacc3, voutput_max); - vacc4 = math_min_f32(vacc4, voutput_max); - vacc5 
= math_min_f32(vacc5, voutput_max); - vacc6 = math_min_f32(vacc6, voutput_max); - vacc7 = math_min_f32(vacc7, voutput_max); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; @@ -94,8 +73,6 @@ void xnn_f32_vdiv_minmax_ukernel__scalar_u8( const float va = *input_a++; const float vb = *input_b++; float vacc = va / vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } while (batch != 0); diff --git a/src/f32-vbinary/gen/f32-vdiv-wasmsimd-u16.c b/src/f32-vbinary/gen/f32-vdiv-wasmsimd-u16.c index f46f146c387..030cadec775 100644 --- a/src/f32-vbinary/gen/f32-vdiv-wasmsimd-u16.c +++ b/src/f32-vbinary/gen/f32-vdiv-wasmsimd-u16.c @@ -28,7 +28,6 @@ void xnn_f32_vdiv_ukernel__wasmsimd_u16( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); @@ -63,7 +62,6 @@ void xnn_f32_vdiv_ukernel__wasmsimd_u16( v128_t vacc = wasm_f32x4_div(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -73,7 +71,6 @@ void xnn_f32_vdiv_ukernel__wasmsimd_u16( v128_t vacc = wasm_f32x4_div(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vdiv-wasmsimd-u4.c b/src/f32-vbinary/gen/f32-vdiv-wasmsimd-u4.c index 014f7cce91d..ca66ce95179 100644 --- a/src/f32-vbinary/gen/f32-vdiv-wasmsimd-u4.c +++ b/src/f32-vbinary/gen/f32-vdiv-wasmsimd-u4.c @@ -28,7 +28,6 @@ void xnn_f32_vdiv_ukernel__wasmsimd_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; @@ -38,7 +37,6 @@ void xnn_f32_vdiv_ukernel__wasmsimd_u4( v128_t vacc = wasm_f32x4_div(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -48,7 +46,6 @@ void 
xnn_f32_vdiv_ukernel__wasmsimd_u4( v128_t vacc = wasm_f32x4_div(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vdiv-wasmsimd-u8.c b/src/f32-vbinary/gen/f32-vdiv-wasmsimd-u8.c index 69183d65c0a..73c807426d3 100644 --- a/src/f32-vbinary/gen/f32-vdiv-wasmsimd-u8.c +++ b/src/f32-vbinary/gen/f32-vdiv-wasmsimd-u8.c @@ -28,7 +28,6 @@ void xnn_f32_vdiv_ukernel__wasmsimd_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); @@ -55,7 +54,6 @@ void xnn_f32_vdiv_ukernel__wasmsimd_u8( v128_t vacc = wasm_f32x4_div(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -65,7 +63,6 @@ void xnn_f32_vdiv_ukernel__wasmsimd_u8( v128_t vacc = wasm_f32x4_div(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-aarch64-neon-u4.c b/src/f32-vbinary/gen/f32-vdivc-aarch64-neon-u4.c similarity index 73% rename from src/f32-vbinary/gen/f32-vdivc-minmax-aarch64-neon-u4.c rename to src/f32-vbinary/gen/f32-vdivc-aarch64-neon-u4.c index 643eb2179f4..f276e822bf8 100644 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-aarch64-neon-u4.c +++ b/src/f32-vbinary/gen/f32-vdivc-aarch64-neon-u4.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdivc_minmax_ukernel__aarch64_neon_u4( +void xnn_f32_vdivc_ukernel__aarch64_neon_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void 
xnn_f32_vdivc_minmax_ukernel__aarch64_neon_u4( assert(input_b != NULL); assert(output != NULL); - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { @@ -37,9 +35,6 @@ void xnn_f32_vdivc_minmax_ukernel__aarch64_neon_u4( float32x4_t vacc = vdivq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -47,9 +42,6 @@ void xnn_f32_vdivc_minmax_ukernel__aarch64_neon_u4( float32x4_t vacc = vdivq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-aarch64-neon-u8.c b/src/f32-vbinary/gen/f32-vdivc-aarch64-neon-u8.c similarity index 72% rename from src/f32-vbinary/gen/f32-vdivc-minmax-aarch64-neon-u8.c rename to src/f32-vbinary/gen/f32-vdivc-aarch64-neon-u8.c index 0151aba4984..2290e9f77ce 100644 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-aarch64-neon-u8.c +++ b/src/f32-vbinary/gen/f32-vdivc-aarch64-neon-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdivc_minmax_ukernel__aarch64_neon_u8( +void xnn_f32_vdivc_ukernel__aarch64_neon_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vdivc_minmax_ukernel__aarch64_neon_u8( assert(input_b != NULL); assert(output != NULL); - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = 
vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -40,12 +38,6 @@ void xnn_f32_vdivc_minmax_ukernel__aarch64_neon_u8( float32x4_t vacc1 = vdivq_f32(va1, vb); - vacc0 = vmaxq_f32(vacc0, voutput_min); - vacc1 = vmaxq_f32(vacc1, voutput_min); - - vacc0 = vminq_f32(vacc0, voutput_max); - vacc1 = vminq_f32(vacc1, voutput_max); - vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } @@ -54,9 +46,6 @@ void xnn_f32_vdivc_minmax_ukernel__aarch64_neon_u8( float32x4_t vacc = vdivq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -64,9 +53,6 @@ void xnn_f32_vdivc_minmax_ukernel__aarch64_neon_u8( float32x4_t vacc = vdivq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-avx-u8.c b/src/f32-vbinary/gen/f32-vdivc-avx-u16.c similarity index 77% rename from src/f32-vbinary/gen/f32-vdivc-minmax-avx-u8.c rename to src/f32-vbinary/gen/f32-vdivc-avx-u16.c index 11d43934828..1bc85147adb 100644 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-avx-u8.c +++ b/src/f32-vbinary/gen/f32-vdivc-avx-u16.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdivc_minmax_ukernel__avx_u8( +void xnn_f32_vdivc_ukernel__avx_u16( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -30,19 +30,26 @@ void xnn_f32_vdivc_minmax_ukernel__avx_u8( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - 
const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const __m256 vb = _mm256_broadcast_ss(input_b); + for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { + const __m256 va0 = _mm256_loadu_ps(input_a); + const __m256 va1 = _mm256_loadu_ps(input_a + 8); + input_a += 16; + + __m256 vacc0 = _mm256_div_ps(va0, vb); + __m256 vacc1 = _mm256_div_ps(va1, vb); + + + _mm256_storeu_ps(output, vacc0); + _mm256_storeu_ps(output + 8, vacc1); + output += 16; + } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m256 va = _mm256_loadu_ps(input_a); input_a += 8; __m256 vacc = _mm256_div_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); _mm256_storeu_ps(output, vacc); output += 8; } @@ -54,8 +61,6 @@ void xnn_f32_vdivc_minmax_ukernel__avx_u8( __m256 va = _mm256_maskload_ps(input_a, vmask); __m256 vacc = _mm256_div_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); __m128 vacc_lo = _mm256_castps256_ps128(vacc); if (batch & (4 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/f32-vdivc-avx-u8.c b/src/f32-vbinary/gen/f32-vdivc-avx-u8.c new file mode 100644 index 00000000000..ece5adfd53e --- /dev/null +++ b/src/f32-vbinary/gen/f32-vdivc-avx-u8.c @@ -0,0 +1,67 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-vbinary/vopc-avx.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/vbinary.h" + + +void xnn_f32_vdivc_ukernel__avx_u8( + size_t batch, + const float* input_a, + const float* input_b, + float* output, + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + + const __m256 vb = _mm256_broadcast_ss(input_b); + + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + const __m256 va = _mm256_loadu_ps(input_a); + input_a += 8; + + __m256 vacc = _mm256_div_ps(va, vb); + _mm256_storeu_ps(output, vacc); + output += 8; + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 7 * sizeof(float)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); + + __m256 va = _mm256_maskload_ps(input_a, vmask); + + __m256 vacc = _mm256_div_ps(va, vb); + + __m128 vacc_lo = _mm256_castps256_ps128(vacc); + if (batch & (4 * sizeof(float))) { + _mm_storeu_ps(output, vacc_lo); + vacc_lo = _mm256_extractf128_ps(vacc, 1); + output += 4; + } + if (batch & (2 * sizeof(float))) { + _mm_storel_pi((__m64*) output, vacc_lo); + vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo); + output += 2; + } + if (batch & (1 * sizeof(float))) { + _mm_store_ss(output, vacc_lo); + } + } +} diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-avx512f-u16.c b/src/f32-vbinary/gen/f32-vdivc-avx512f-u16.c similarity index 75% rename from src/f32-vbinary/gen/f32-vdivc-minmax-avx512f-u16.c rename to src/f32-vbinary/gen/f32-vdivc-avx512f-u16.c index 5020e7f5382..be6ca5aa213 100644 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-avx512f-u16.c +++ b/src/f32-vbinary/gen/f32-vdivc-avx512f-u16.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE 
file in the root directory of this source tree. + #include #include @@ -16,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdivc_minmax_ukernel__avx512f_u16( +void xnn_f32_vdivc_ukernel__avx512f_u16( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,8 +30,6 @@ void xnn_f32_vdivc_minmax_ukernel__avx512f_u16( assert(input_b != NULL); assert(output != NULL); - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); const __m512 vb = _mm512_set1_ps(*input_b); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { @@ -40,10 +39,6 @@ void xnn_f32_vdivc_minmax_ukernel__avx512f_u16( __m512 vacc0 = _mm512_div_ps(va0, vb); - vacc0 = _mm512_max_ps(voutput_min, vacc0); - - vacc0 = _mm512_min_ps(voutput_max, vacc0); - _mm512_storeu_ps(output, vacc0); output += 16; } @@ -57,8 +52,6 @@ void xnn_f32_vdivc_minmax_ukernel__avx512f_u16( __m512 va = _mm512_maskz_loadu_ps(vmask, input_a); __m512 vacc = _mm512_div_ps(va, vb); - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-avx512f-u32.c b/src/f32-vbinary/gen/f32-vdivc-avx512f-u32.c similarity index 73% rename from src/f32-vbinary/gen/f32-vdivc-minmax-avx512f-u32.c rename to src/f32-vbinary/gen/f32-vdivc-avx512f-u32.c index 98fa2dc4a7e..b140f6f5d58 100644 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-avx512f-u32.c +++ b/src/f32-vbinary/gen/f32-vdivc-avx512f-u32.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
+ #include #include @@ -16,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdivc_minmax_ukernel__avx512f_u32( +void xnn_f32_vdivc_ukernel__avx512f_u32( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,8 +30,6 @@ void xnn_f32_vdivc_minmax_ukernel__avx512f_u32( assert(input_b != NULL); assert(output != NULL); - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); const __m512 vb = _mm512_set1_ps(*input_b); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { @@ -42,12 +41,6 @@ void xnn_f32_vdivc_minmax_ukernel__avx512f_u32( __m512 vacc1 = _mm512_div_ps(va1, vb); - vacc0 = _mm512_max_ps(voutput_min, vacc0); - vacc1 = _mm512_max_ps(voutput_min, vacc1); - - vacc0 = _mm512_min_ps(voutput_max, vacc0); - vacc1 = _mm512_min_ps(voutput_max, vacc1); - _mm512_storeu_ps(output, vacc0); _mm512_storeu_ps(output + 16, vacc1); output += 32; @@ -58,9 +51,6 @@ void xnn_f32_vdivc_minmax_ukernel__avx512f_u32( __m512 vacc = _mm512_div_ps(va, vb); - vacc = _mm512_max_ps(voutput_min, vacc); - vacc = _mm512_min_ps(voutput_max, vacc); - _mm512_storeu_ps(output, vacc); output += 16; } @@ -74,8 +64,6 @@ void xnn_f32_vdivc_minmax_ukernel__avx512f_u32( __m512 va = _mm512_maskz_loadu_ps(vmask, input_a); __m512 vacc = _mm512_div_ps(va, vb); - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-avx-u16.c b/src/f32-vbinary/gen/f32-vdivc-minmax-avx-u16.c deleted file mode 100644 index d53a000643f..00000000000 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-avx-u16.c +++ /dev/null @@ -1,94 +0,0 @@ -// Auto-generated file. 
Do not edit! -// Template: src/f32-vbinary/vopc-avx.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdivc_minmax_ukernel__avx_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const __m256 vb = _mm256_broadcast_ss(input_b); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const __m256 va0 = _mm256_loadu_ps(input_a); - const __m256 va1 = _mm256_loadu_ps(input_a + 8); - input_a += 16; - - __m256 vacc0 = _mm256_div_ps(va0, vb); - __m256 vacc1 = _mm256_div_ps(va1, vb); - - - vacc0 = _mm256_max_ps(voutput_min, vacc0); - vacc1 = _mm256_max_ps(voutput_min, vacc1); - - vacc0 = _mm256_min_ps(voutput_max, vacc0); - vacc1 = _mm256_min_ps(voutput_max, vacc1); - - _mm256_storeu_ps(output, vacc0); - _mm256_storeu_ps(output + 8, vacc1); - output += 16; - } - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const __m256 va = _mm256_loadu_ps(input_a); - input_a += 8; - - __m256 vacc = _mm256_div_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); - _mm256_storeu_ps(output, vacc); - output += 8; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * 
sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); - - __m256 va = _mm256_maskload_ps(input_a, vmask); - - __m256 vacc = _mm256_div_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); - - __m128 vacc_lo = _mm256_castps256_ps128(vacc); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vacc_lo); - vacc_lo = _mm256_extractf128_ps(vacc, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vacc_lo); - vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vacc_lo); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u1.c b/src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u1.c deleted file mode 100644 index f277b5790b4..00000000000 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u1.c +++ /dev/null @@ -1,41 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdivc_minmax_ukernel__wasm_u1( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= sizeof(float); batch -= sizeof(float)) { - const float va = *input_a++; - float vacc = va / vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - } -} diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u2.c b/src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u2.c deleted file mode 100644 index 041ae9ff3a8..00000000000 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u2.c +++ /dev/null @@ -1,61 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdivc_minmax_ukernel__wasm_u2( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - input_a += 2; - - float vacc0 = va0 / vb; - float vacc1 = va1 / vb; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output += 2; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch == sizeof(float)); - const float va = *input_a; - float vacc = va / vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output = vacc; - } -} diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u4.c b/src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u4.c deleted file mode 100644 index c748d220736..00000000000 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u4.c +++ /dev/null @@ -1,73 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdivc_minmax_ukernel__wasm_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - const float va2 = input_a[2]; - const float va3 = input_a[3]; - input_a += 4; - - float vacc0 = va0 / vb; - float vacc1 = va1 / vb; - float vacc2 = va2 / vb; - float vacc3 = va3 / vb; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - vacc2 = __builtin_wasm_max_f32(vacc2, voutput_min); - vacc3 = __builtin_wasm_max_f32(vacc3, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - vacc2 = __builtin_wasm_min_f32(vacc2, voutput_max); - vacc3 = __builtin_wasm_min_f32(vacc3, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - do { - const float va = *input_a++; - float vacc = va / vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - batch -= sizeof(float); - } while (batch != 0); - } -} diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u8.c b/src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u8.c deleted file mode 100644 index 3dfaad5c097..00000000000 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-wasm-u8.c +++ /dev/null @@ -1,93 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdivc_minmax_ukernel__wasm_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - const float va2 = input_a[2]; - const float va3 = input_a[3]; - const float va4 = input_a[4]; - const float va5 = input_a[5]; - const float va6 = input_a[6]; - const float va7 = input_a[7]; - input_a += 8; - - float vacc0 = va0 / vb; - float vacc1 = va1 / vb; - float vacc2 = va2 / vb; - float vacc3 = va3 / vb; - float vacc4 = va4 / vb; - float vacc5 = va5 / vb; - float vacc6 = va6 / vb; - float vacc7 = va7 / vb; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - vacc2 = __builtin_wasm_max_f32(vacc2, voutput_min); - vacc3 = __builtin_wasm_max_f32(vacc3, voutput_min); - vacc4 = __builtin_wasm_max_f32(vacc4, voutput_min); - vacc5 = __builtin_wasm_max_f32(vacc5, voutput_min); - vacc6 = __builtin_wasm_max_f32(vacc6, voutput_min); - vacc7 = __builtin_wasm_max_f32(vacc7, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - vacc2 = __builtin_wasm_min_f32(vacc2, voutput_max); - vacc3 = 
__builtin_wasm_min_f32(vacc3, voutput_max); - vacc4 = __builtin_wasm_min_f32(vacc4, voutput_max); - vacc5 = __builtin_wasm_min_f32(vacc5, voutput_max); - vacc6 = __builtin_wasm_min_f32(vacc6, voutput_max); - vacc7 = __builtin_wasm_min_f32(vacc7, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output[4] = vacc4; - output[5] = vacc5; - output[6] = vacc6; - output[7] = vacc7; - output += 8; - } - if XNN_UNLIKELY(batch != 0) { - do { - const float va = *input_a++; - float vacc = va / vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - batch -= sizeof(float); - } while (batch != 0); - } -} diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u16.c b/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u16.c deleted file mode 100644 index 7398748f4e1..00000000000 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u16.c +++ /dev/null @@ -1,95 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - const v128_t va2 = wasm_v128_load(input_a + 8); - const v128_t va3 = wasm_v128_load(input_a + 12); - input_a += 16; - - v128_t vy0 = wasm_f32x4_div(va0, vb); - v128_t vy1 = wasm_f32x4_div(va1, vb); - v128_t vy2 = wasm_f32x4_div(va2, vb); - v128_t vy3 = wasm_f32x4_div(va3, vb); - - - vy0 = wasm_f32x4_max(vy0, voutput_min); - vy1 = wasm_f32x4_max(vy1, voutput_min); - vy2 = wasm_f32x4_max(vy2, voutput_min); - vy3 = wasm_f32x4_max(vy3, voutput_min); - - vy0 = wasm_f32x4_min(vy0, voutput_max); - vy1 = wasm_f32x4_min(vy1, voutput_max); - vy2 = wasm_f32x4_min(vy2, voutput_max); - vy3 = wasm_f32x4_min(vy3, voutput_max); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - wasm_v128_store(output + 8, vy2); - wasm_v128_store(output + 12, vy3); - output += 16; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_div(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - wasm_v128_store(output, vy); - output += 4; - } - if 
XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_div(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u4.c b/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u4.c deleted file mode 100644 index f6e48466372..00000000000 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u4.c +++ /dev/null @@ -1,66 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_div(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - wasm_v128_store(output, vy); - 
output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_div(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u8.c b/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u8.c deleted file mode 100644 index b2138292a51..00000000000 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-u8.c +++ /dev/null @@ -1,85 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - input_a += 8; - - v128_t vy0 = wasm_f32x4_div(va0, vb); - v128_t vy1 = wasm_f32x4_div(va1, vb); - - - vy0 = wasm_f32x4_max(vy0, voutput_min); - vy1 = wasm_f32x4_max(vy1, voutput_min); - - vy0 = wasm_f32x4_min(vy0, voutput_max); - vy1 = wasm_f32x4_min(vy1, voutput_max); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - output += 8; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_div(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_div(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git 
a/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u16.c b/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u16.c deleted file mode 100644 index 38e4ced3ec0..00000000000 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u16.c +++ /dev/null @@ -1,95 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - const v128_t va2 = wasm_v128_load(input_a + 8); - const v128_t va3 = wasm_v128_load(input_a + 12); - input_a += 16; - - v128_t vy0 = wasm_f32x4_div(va0, vb); - v128_t vy1 = wasm_f32x4_div(va1, vb); - v128_t vy2 = wasm_f32x4_div(va2, vb); - v128_t vy3 = wasm_f32x4_div(va3, vb); - - - vy0 = wasm_f32x4_pmax(voutput_min, vy0); - vy1 = wasm_f32x4_pmax(voutput_min, vy1); - vy2 = wasm_f32x4_pmax(voutput_min, vy2); - vy3 = wasm_f32x4_pmax(voutput_min, vy3); - - vy0 = wasm_f32x4_pmin(voutput_max, vy0); - vy1 = wasm_f32x4_pmin(voutput_max, vy1); - vy2 = 
wasm_f32x4_pmin(voutput_max, vy2); - vy3 = wasm_f32x4_pmin(voutput_max, vy3); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - wasm_v128_store(output + 8, vy2); - wasm_v128_store(output + 12, vy3); - output += 16; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_div(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_div(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u4.c b/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u4.c deleted file mode 100644 index e1a472d4c95..00000000000 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u4.c +++ /dev/null @@ -1,66 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_div(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_div(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u8.c b/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u8.c deleted file mode 100644 index 03e1bf1a6aa..00000000000 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-u8.c +++ /dev/null @@ -1,85 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - input_a += 8; - - v128_t vy0 = wasm_f32x4_div(va0, vb); - v128_t vy1 = wasm_f32x4_div(va1, vb); - - - vy0 = wasm_f32x4_pmax(voutput_min, vy0); - vy1 = wasm_f32x4_pmax(voutput_min, vy1); - - vy0 = wasm_f32x4_pmin(voutput_max, vy0); - vy1 = wasm_f32x4_pmin(voutput_max, vy1); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - output += 8; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_div(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_div(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = 
wasm_f32x4_pmin(voutput_max, vy); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-rvv-u4v.c b/src/f32-vbinary/gen/f32-vdivc-rvv-u4v.c similarity index 74% rename from src/f32-vbinary/gen/f32-vdivc-minmax-rvv-u4v.c rename to src/f32-vbinary/gen/f32-vdivc-rvv-u4v.c index db76b5dddf6..6c7325c8d20 100644 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-rvv-u4v.c +++ b/src/f32-vbinary/gen/f32-vdivc-rvv-u4v.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdivc_minmax_ukernel__rvv_u4v( +void xnn_f32_vdivc_ukernel__rvv_u4v( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vdivc_minmax_ukernel__rvv_u4v( assert(input_b != NULL); assert(output != NULL); - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; const float b = *input_b; size_t n = batch >> 2; @@ -39,8 +37,6 @@ void xnn_f32_vdivc_minmax_ukernel__rvv_u4v( vfloat32m4_t va = __riscv_vle32_v_f32m4(input_a, vl); input_a += vl; vfloat32m4_t vacc = __riscv_vfdiv_vf_f32m4(va, b, vl); - vacc = __riscv_vfmax_vf_f32m4(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m4(vacc, output_max, vl); __riscv_vse32_v_f32m4(output, vacc, vl); output += vl; } while (n > 0); diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-rvv-u8v.c b/src/f32-vbinary/gen/f32-vdivc-rvv-u8v.c similarity index 74% rename from src/f32-vbinary/gen/f32-vdivc-minmax-rvv-u8v.c rename to src/f32-vbinary/gen/f32-vdivc-rvv-u8v.c index 925bdb06f8e..3195dea1fc4 100644 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-rvv-u8v.c +++ 
b/src/f32-vbinary/gen/f32-vdivc-rvv-u8v.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdivc_minmax_ukernel__rvv_u8v( +void xnn_f32_vdivc_ukernel__rvv_u8v( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vdivc_minmax_ukernel__rvv_u8v( assert(input_b != NULL); assert(output != NULL); - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; const float b = *input_b; size_t n = batch >> 2; @@ -39,8 +37,6 @@ void xnn_f32_vdivc_minmax_ukernel__rvv_u8v( vfloat32m8_t va = __riscv_vle32_v_f32m8(input_a, vl); input_a += vl; vfloat32m8_t vacc = __riscv_vfdiv_vf_f32m8(va, b, vl); - vacc = __riscv_vfmax_vf_f32m8(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m8(vacc, output_max, vl); __riscv_vse32_v_f32m8(output, vacc, vl); output += vl; } while (n > 0); diff --git a/src/f32-vbinary/gen/f32-vdivc-scalar-u2.c b/src/f32-vbinary/gen/f32-vdivc-scalar-u2.c index d5ee27d241b..71ee42bcf1f 100644 --- a/src/f32-vbinary/gen/f32-vdivc-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vdivc-scalar-u2.c @@ -38,7 +38,6 @@ void xnn_f32_vdivc_ukernel__scalar_u2( float vacc1 = va1 / vb; - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vdivc-scalar-u4.c b/src/f32-vbinary/gen/f32-vdivc-scalar-u4.c index 3cb43b9cf1f..fbde2de09a1 100644 --- a/src/f32-vbinary/gen/f32-vdivc-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vdivc-scalar-u4.c @@ -42,7 +42,6 @@ void xnn_f32_vdivc_ukernel__scalar_u4( float vacc3 = va3 / vb; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vdivc-scalar-u8.c b/src/f32-vbinary/gen/f32-vdivc-scalar-u8.c index 9216e2e32a1..8aedd122941 100644 --- 
a/src/f32-vbinary/gen/f32-vdivc-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vdivc-scalar-u8.c @@ -50,7 +50,6 @@ void xnn_f32_vdivc_ukernel__scalar_u8( float vacc7 = va7 / vb; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-sse-u4.c b/src/f32-vbinary/gen/f32-vdivc-sse-u4.c similarity index 71% rename from src/f32-vbinary/gen/f32-vdivc-minmax-sse-u4.c rename to src/f32-vbinary/gen/f32-vdivc-sse-u4.c index cd210f0264a..9c1acc2fb73 100644 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-sse-u4.c +++ b/src/f32-vbinary/gen/f32-vdivc-sse-u4.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdivc_minmax_ukernel__sse_u4( +void xnn_f32_vdivc_ukernel__sse_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,10 +29,6 @@ void xnn_f32_vdivc_minmax_ukernel__sse_u4( assert(input_b != NULL); assert(output != NULL); - const __m128 voutput_min = _mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const __m128 vb = _mm_load1_ps(input_b); for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { @@ -40,8 +36,6 @@ void xnn_f32_vdivc_minmax_ukernel__sse_u4( input_a += 4; __m128 vacc = _mm_div_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -50,8 +44,6 @@ void xnn_f32_vdivc_minmax_ukernel__sse_u4( const __m128 va = _mm_loadu_ps(input_a); __m128 vacc = _mm_div_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); vacc = _mm_movehl_ps(vacc, 
vacc); diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-sse-u8.c b/src/f32-vbinary/gen/f32-vdivc-sse-u8.c similarity index 70% rename from src/f32-vbinary/gen/f32-vdivc-minmax-sse-u8.c rename to src/f32-vbinary/gen/f32-vdivc-sse-u8.c index 4180ba2be4b..eae1428611e 100644 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-sse-u8.c +++ b/src/f32-vbinary/gen/f32-vdivc-sse-u8.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdivc_minmax_ukernel__sse_u8( +void xnn_f32_vdivc_ukernel__sse_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,10 +29,6 @@ void xnn_f32_vdivc_minmax_ukernel__sse_u8( assert(input_b != NULL); assert(output != NULL); - const __m128 voutput_min = _mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const __m128 vb = _mm_load1_ps(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -44,12 +40,6 @@ void xnn_f32_vdivc_minmax_ukernel__sse_u8( __m128 vacc1 = _mm_div_ps(va1, vb); - vacc0 = _mm_max_ps(vacc0, voutput_min); - vacc1 = _mm_max_ps(vacc1, voutput_min); - - vacc0 = _mm_min_ps(vacc0, voutput_max); - vacc1 = _mm_min_ps(vacc1, voutput_max); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; @@ -59,8 +49,6 @@ void xnn_f32_vdivc_minmax_ukernel__sse_u8( input_a += 4; __m128 vacc = _mm_div_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -69,8 +57,6 @@ void xnn_f32_vdivc_minmax_ukernel__sse_u8( const __m128 va = _mm_loadu_ps(input_a); __m128 vacc = _mm_div_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = 
_mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); vacc = _mm_movehl_ps(vacc, vacc); diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u1.c b/src/f32-vbinary/gen/f32-vdivc-wasm-u1.c similarity index 72% rename from src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u1.c rename to src/f32-vbinary/gen/f32-vdivc-wasm-u1.c index 53bf741b047..f94baa04459 100644 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u1.c +++ b/src/f32-vbinary/gen/f32-vdivc-wasm-u1.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdivc_minmax_ukernel__scalar_u1( +void xnn_f32_vdivc_ukernel__wasm_u1( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,15 +27,11 @@ void xnn_f32_vdivc_minmax_ukernel__scalar_u1( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; float vacc = va / vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; } } diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u2.c b/src/f32-vbinary/gen/f32-vdivc-wasm-u2.c similarity index 68% rename from src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u2.c rename to src/f32-vbinary/gen/f32-vdivc-wasm-u2.c index e0d2fe4bf97..f618fb07c85 100644 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vdivc-wasm-u2.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdivc_minmax_ukernel__scalar_u2( +void xnn_f32_vdivc_ukernel__wasm_u2( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params 
params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,8 +27,6 @@ void xnn_f32_vdivc_minmax_ukernel__scalar_u2( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { @@ -40,12 +38,6 @@ void xnn_f32_vdivc_minmax_ukernel__scalar_u2( float vacc1 = va1 / vb; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - output[0] = vacc0; output[1] = vacc1; output += 2; @@ -54,8 +46,6 @@ void xnn_f32_vdivc_minmax_ukernel__scalar_u2( assert(batch == sizeof(float)); const float va = *input_a; float vacc = va / vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output = vacc; } } diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u4.c b/src/f32-vbinary/gen/f32-vdivc-wasm-u4.c similarity index 65% rename from src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u4.c rename to src/f32-vbinary/gen/f32-vdivc-wasm-u4.c index c4dd77edea1..73ec183e795 100644 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vdivc-wasm-u4.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdivc_minmax_ukernel__scalar_u4( +void xnn_f32_vdivc_ukernel__wasm_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,8 +27,6 @@ void xnn_f32_vdivc_minmax_ukernel__scalar_u4( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - 
const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { @@ -44,16 +42,6 @@ void xnn_f32_vdivc_minmax_ukernel__scalar_u4( float vacc3 = va3 / vb; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - vacc2 = math_max_f32(vacc2, voutput_min); - vacc3 = math_max_f32(vacc3, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - vacc2 = math_min_f32(vacc2, voutput_max); - vacc3 = math_min_f32(vacc3, voutput_max); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; @@ -64,8 +52,6 @@ void xnn_f32_vdivc_minmax_ukernel__scalar_u4( do { const float va = *input_a++; float vacc = va / vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } while (batch != 0); diff --git a/src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u8.c b/src/f32-vbinary/gen/f32-vdivc-wasm-u8.c similarity index 60% rename from src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u8.c rename to src/f32-vbinary/gen/f32-vdivc-wasm-u8.c index be66ac40aa8..b9e5b6ddea7 100644 --- a/src/f32-vbinary/gen/f32-vdivc-minmax-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vdivc-wasm-u8.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vdivc_minmax_ukernel__scalar_u8( +void xnn_f32_vdivc_ukernel__wasm_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,8 +27,6 @@ void xnn_f32_vdivc_minmax_ukernel__scalar_u8( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { 
@@ -52,24 +50,6 @@ void xnn_f32_vdivc_minmax_ukernel__scalar_u8( float vacc7 = va7 / vb; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - vacc2 = math_max_f32(vacc2, voutput_min); - vacc3 = math_max_f32(vacc3, voutput_min); - vacc4 = math_max_f32(vacc4, voutput_min); - vacc5 = math_max_f32(vacc5, voutput_min); - vacc6 = math_max_f32(vacc6, voutput_min); - vacc7 = math_max_f32(vacc7, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - vacc2 = math_min_f32(vacc2, voutput_max); - vacc3 = math_min_f32(vacc3, voutput_max); - vacc4 = math_min_f32(vacc4, voutput_max); - vacc5 = math_min_f32(vacc5, voutput_max); - vacc6 = math_min_f32(vacc6, voutput_max); - vacc7 = math_min_f32(vacc7, voutput_max); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; @@ -84,8 +64,6 @@ void xnn_f32_vdivc_minmax_ukernel__scalar_u8( do { const float va = *input_a++; float vacc = va / vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } while (batch != 0); diff --git a/src/f32-vbinary/gen/f32-vdivc-wasmsimd-u16.c b/src/f32-vbinary/gen/f32-vdivc-wasmsimd-u16.c index 1c780c589d8..cfd9aab458e 100644 --- a/src/f32-vbinary/gen/f32-vdivc-wasmsimd-u16.c +++ b/src/f32-vbinary/gen/f32-vdivc-wasmsimd-u16.c @@ -43,7 +43,6 @@ void xnn_f32_vdivc_ukernel__wasmsimd_u16( v128_t vy3 = wasm_f32x4_div(va3, vb); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); @@ -56,7 +55,6 @@ void xnn_f32_vdivc_ukernel__wasmsimd_u16( v128_t vy = wasm_f32x4_div(va, vb); - wasm_v128_store(output, vy); output += 4; } @@ -65,7 +63,6 @@ void xnn_f32_vdivc_ukernel__wasmsimd_u16( v128_t vy = wasm_f32x4_div(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vdivc-wasmsimd-u4.c 
b/src/f32-vbinary/gen/f32-vdivc-wasmsimd-u4.c index 2ec2398126f..03c95a3a2bb 100644 --- a/src/f32-vbinary/gen/f32-vdivc-wasmsimd-u4.c +++ b/src/f32-vbinary/gen/f32-vdivc-wasmsimd-u4.c @@ -36,7 +36,6 @@ void xnn_f32_vdivc_ukernel__wasmsimd_u4( v128_t vy = wasm_f32x4_div(va, vb); - wasm_v128_store(output, vy); output += 4; } @@ -45,7 +44,6 @@ void xnn_f32_vdivc_ukernel__wasmsimd_u4( v128_t vy = wasm_f32x4_div(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vdivc-wasmsimd-u8.c b/src/f32-vbinary/gen/f32-vdivc-wasmsimd-u8.c index b8ff162d705..be134bc7f1b 100644 --- a/src/f32-vbinary/gen/f32-vdivc-wasmsimd-u8.c +++ b/src/f32-vbinary/gen/f32-vdivc-wasmsimd-u8.c @@ -39,7 +39,6 @@ void xnn_f32_vdivc_ukernel__wasmsimd_u8( v128_t vy1 = wasm_f32x4_div(va1, vb); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); output += 8; @@ -50,7 +49,6 @@ void xnn_f32_vdivc_ukernel__wasmsimd_u8( v128_t vy = wasm_f32x4_div(va, vb); - wasm_v128_store(output, vy); output += 4; } @@ -59,7 +57,6 @@ void xnn_f32_vdivc_ukernel__wasmsimd_u8( v128_t vy = wasm_f32x4_div(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmax-avx-u16.c b/src/f32-vbinary/gen/f32-vmax-avx-u16.c index 9a39b6f258c..7c3b3f349a5 100644 --- a/src/f32-vbinary/gen/f32-vmax-avx-u16.c +++ b/src/f32-vbinary/gen/f32-vmax-avx-u16.c @@ -30,7 +30,6 @@ void xnn_f32_vmax_ukernel__avx_u16( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m256 va0 = _mm256_loadu_ps(input_a); const __m256 va1 = _mm256_loadu_ps(input_a + 8); @@ -41,7 +40,6 @@ void xnn_f32_vmax_ukernel__avx_u16( input_b += 16; - _mm256_storeu_ps(output, vacc0); _mm256_storeu_ps(output + 8, vacc1); output += 16; diff --git 
a/src/f32-vbinary/gen/f32-vmax-avx-u8.c b/src/f32-vbinary/gen/f32-vmax-avx-u8.c index 888b5b31b87..b8fc090b431 100644 --- a/src/f32-vbinary/gen/f32-vmax-avx-u8.c +++ b/src/f32-vbinary/gen/f32-vmax-avx-u8.c @@ -30,7 +30,6 @@ void xnn_f32_vmax_ukernel__avx_u8( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m256 va = _mm256_loadu_ps(input_a); input_a += 8; diff --git a/src/f32-vbinary/gen/f32-vmax-avx512f-u16.c b/src/f32-vbinary/gen/f32-vmax-avx512f-u16.c index 5f216f8e856..8219275b8f5 100644 --- a/src/f32-vbinary/gen/f32-vmax-avx512f-u16.c +++ b/src/f32-vbinary/gen/f32-vmax-avx512f-u16.c @@ -30,7 +30,6 @@ void xnn_f32_vmax_ukernel__avx512f_u16( assert(output != NULL); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m512 va = _mm512_loadu_ps(input_a); input_a += 16; @@ -39,7 +38,6 @@ void xnn_f32_vmax_ukernel__avx512f_u16( input_b += 16; - _mm512_storeu_ps(output, vacc); output += 16; } diff --git a/src/f32-vbinary/gen/f32-vmax-avx512f-u32.c b/src/f32-vbinary/gen/f32-vmax-avx512f-u32.c index f45a3ce6225..e92fe97a596 100644 --- a/src/f32-vbinary/gen/f32-vmax-avx512f-u32.c +++ b/src/f32-vbinary/gen/f32-vmax-avx512f-u32.c @@ -30,7 +30,6 @@ void xnn_f32_vmax_ukernel__avx512f_u32( assert(output != NULL); - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { const __m512 va0 = _mm512_loadu_ps(input_a); const __m512 va1 = _mm512_loadu_ps(input_a + 16); @@ -41,7 +40,6 @@ void xnn_f32_vmax_ukernel__avx512f_u32( input_b += 32; - _mm512_storeu_ps(output, vacc0); _mm512_storeu_ps(output + 16, vacc1); output += 32; @@ -54,7 +52,6 @@ void xnn_f32_vmax_ukernel__avx512f_u32( input_b += 16; - _mm512_storeu_ps(output, vacc); output += 16; } diff --git a/src/f32-vbinary/gen/f32-vmax-hvx-u128.c b/src/f32-vbinary/gen/f32-vmax-hvx-u128.c index c00a93c0323..ad413987fe0 100644 --- a/src/f32-vbinary/gen/f32-vmax-hvx-u128.c +++ 
b/src/f32-vbinary/gen/f32-vmax-hvx-u128.c @@ -23,7 +23,6 @@ void xnn_f32_vmax_ukernel__hvx_u128( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) { HVX_Vector va0 = xnn_loadu_f32(input_a); HVX_Vector va1 = xnn_loadu_f32(input_a + 32); @@ -42,7 +41,6 @@ void xnn_f32_vmax_ukernel__hvx_u128( HVX_Vector vacc3 = xnn_max_f32(va3, vb3); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); xnn_storeu_f32(output + 64, vacc2); @@ -65,7 +63,7 @@ void xnn_f32_vmax_ukernel__hvx_u128( HVX_Vector vb = xnn_loadu_f32(input_b); HVX_Vector vacc = xnn_max_f32(va, vb); - + Q6_V_vstu_variable(output, batch, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vmax-hvx-u32.c b/src/f32-vbinary/gen/f32-vmax-hvx-u32.c index 8a85342ad3b..b00ef334de8 100644 --- a/src/f32-vbinary/gen/f32-vmax-hvx-u32.c +++ b/src/f32-vbinary/gen/f32-vmax-hvx-u32.c @@ -23,7 +23,6 @@ void xnn_f32_vmax_ukernel__hvx_u32( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { HVX_Vector va = xnn_loadu_f32(input_a); HVX_Vector vb = xnn_loadu_f32(input_b); @@ -40,7 +39,7 @@ void xnn_f32_vmax_ukernel__hvx_u32( HVX_Vector vb = xnn_loadu_f32(input_b); HVX_Vector vacc = xnn_max_f32(va, vb); - + Q6_V_vstu_variable(output, batch, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vmax-hvx-u64.c b/src/f32-vbinary/gen/f32-vmax-hvx-u64.c index 488257f6437..1aaf5cf952d 100644 --- a/src/f32-vbinary/gen/f32-vmax-hvx-u64.c +++ b/src/f32-vbinary/gen/f32-vmax-hvx-u64.c @@ -23,7 +23,6 @@ void xnn_f32_vmax_ukernel__hvx_u64( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { HVX_Vector va0 = xnn_loadu_f32(input_a); HVX_Vector va1 = xnn_loadu_f32(input_a + 32); @@ -36,7 +35,6 @@ void xnn_f32_vmax_ukernel__hvx_u64( HVX_Vector vacc1 = xnn_max_f32(va1, vb1); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); output += 
64; @@ -57,7 +55,7 @@ void xnn_f32_vmax_ukernel__hvx_u64( HVX_Vector vb = xnn_loadu_f32(input_b); HVX_Vector vacc = xnn_max_f32(va, vb); - + Q6_V_vstu_variable(output, batch, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vmax-neon-u4.c b/src/f32-vbinary/gen/f32-vmax-neon-u4.c index eda5fc551f1..260a38f7476 100644 --- a/src/f32-vbinary/gen/f32-vmax-neon-u4.c +++ b/src/f32-vbinary/gen/f32-vmax-neon-u4.c @@ -28,14 +28,12 @@ void xnn_f32_vmax_ukernel__neon_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; const float32x4_t vb = vld1q_f32(input_b); input_b += 4; float32x4_t vacc = vmaxq_f32(va, vb); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -44,7 +42,6 @@ void xnn_f32_vmax_ukernel__neon_u4( float32x4_t vacc = vmaxq_f32(va, vb); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vmax-neon-u8.c b/src/f32-vbinary/gen/f32-vmax-neon-u8.c index feb44141e77..4006e784ef0 100644 --- a/src/f32-vbinary/gen/f32-vmax-neon-u8.c +++ b/src/f32-vbinary/gen/f32-vmax-neon-u8.c @@ -28,7 +28,6 @@ void xnn_f32_vmax_ukernel__neon_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb0 = vld1q_f32(input_b); input_b += 4; @@ -39,7 +38,6 @@ void xnn_f32_vmax_ukernel__neon_u8( float32x4_t vacc1 = vmaxq_f32(va1, vb1); - vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } @@ -49,7 +47,6 @@ void xnn_f32_vmax_ukernel__neon_u8( float32x4_t vacc = vmaxq_f32(va, vb); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -58,7 +55,6 @@ void xnn_f32_vmax_ukernel__neon_u8( float32x4_t vacc = vmaxq_f32(va, vb); - float32x2_t vacc_lo = vget_low_f32(vacc); 
if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vmax-scalar-u1.c b/src/f32-vbinary/gen/f32-vmax-scalar-u1.c index 5f4c14ab5b9..4cf505947e6 100644 --- a/src/f32-vbinary/gen/f32-vmax-scalar-u1.c +++ b/src/f32-vbinary/gen/f32-vmax-scalar-u1.c @@ -27,7 +27,6 @@ void xnn_f32_vmax_ukernel__scalar_u1( assert(input_b != NULL); assert(output != NULL); - for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; const float vb = *input_b++; diff --git a/src/f32-vbinary/gen/f32-vmax-scalar-u2.c b/src/f32-vbinary/gen/f32-vmax-scalar-u2.c index b663f88cff7..b2fd2ce1c12 100644 --- a/src/f32-vbinary/gen/f32-vmax-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vmax-scalar-u2.c @@ -27,7 +27,6 @@ void xnn_f32_vmax_ukernel__scalar_u2( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -41,7 +40,6 @@ void xnn_f32_vmax_ukernel__scalar_u2( float vacc1 = math_max_f32(va1, vb1); - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vmax-scalar-u4.c b/src/f32-vbinary/gen/f32-vmax-scalar-u4.c index c8e7af8df50..5603d4c5232 100644 --- a/src/f32-vbinary/gen/f32-vmax-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vmax-scalar-u4.c @@ -27,7 +27,6 @@ void xnn_f32_vmax_ukernel__scalar_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -47,7 +46,6 @@ void xnn_f32_vmax_ukernel__scalar_u4( float vacc3 = math_max_f32(va3, vb3); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vmax-scalar-u8.c b/src/f32-vbinary/gen/f32-vmax-scalar-u8.c index 5a25a0eb015..5a4116dfd8d 100644 --- a/src/f32-vbinary/gen/f32-vmax-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vmax-scalar-u8.c @@ -27,7 +27,6 @@ void 
xnn_f32_vmax_ukernel__scalar_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -59,7 +58,6 @@ void xnn_f32_vmax_ukernel__scalar_u8( float vacc7 = math_max_f32(va7, vb7); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vmax-sse-u4.c b/src/f32-vbinary/gen/f32-vmax-sse-u4.c index 51fe68092e4..8a4d6867da1 100644 --- a/src/f32-vbinary/gen/f32-vmax-sse-u4.c +++ b/src/f32-vbinary/gen/f32-vmax-sse-u4.c @@ -29,7 +29,6 @@ void xnn_f32_vmax_ukernel__sse_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const __m128 va = _mm_loadu_ps(input_a); input_a += 4; diff --git a/src/f32-vbinary/gen/f32-vmax-sse-u8.c b/src/f32-vbinary/gen/f32-vmax-sse-u8.c index 82c7904483d..00b86f8d08d 100644 --- a/src/f32-vbinary/gen/f32-vmax-sse-u8.c +++ b/src/f32-vbinary/gen/f32-vmax-sse-u8.c @@ -29,7 +29,6 @@ void xnn_f32_vmax_ukernel__sse_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m128 va0 = _mm_loadu_ps(input_a); const __m128 va1 = _mm_loadu_ps(input_a + 4); @@ -43,7 +42,6 @@ void xnn_f32_vmax_ukernel__sse_u8( __m128 vacc1 = _mm_max_ps(va1, vb1); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; diff --git a/src/f32-vbinary/gen/f32-vmax-wasm-u1.c b/src/f32-vbinary/gen/f32-vmax-wasm-u1.c index 288df328693..49d6113b352 100644 --- a/src/f32-vbinary/gen/f32-vmax-wasm-u1.c +++ b/src/f32-vbinary/gen/f32-vmax-wasm-u1.c @@ -27,7 +27,6 @@ void xnn_f32_vmax_ukernel__wasm_u1( assert(input_b != NULL); assert(output != NULL); - for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; const float vb = *input_b++; diff --git a/src/f32-vbinary/gen/f32-vmax-wasm-u2.c b/src/f32-vbinary/gen/f32-vmax-wasm-u2.c index 
8674129917b..0cbc2377bf3 100644 --- a/src/f32-vbinary/gen/f32-vmax-wasm-u2.c +++ b/src/f32-vbinary/gen/f32-vmax-wasm-u2.c @@ -27,7 +27,6 @@ void xnn_f32_vmax_ukernel__wasm_u2( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -41,7 +40,6 @@ void xnn_f32_vmax_ukernel__wasm_u2( float vacc1 = __builtin_wasm_max_f32(va1, vb1); - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vmax-wasm-u4.c b/src/f32-vbinary/gen/f32-vmax-wasm-u4.c index aaae1354621..92060ae230a 100644 --- a/src/f32-vbinary/gen/f32-vmax-wasm-u4.c +++ b/src/f32-vbinary/gen/f32-vmax-wasm-u4.c @@ -27,7 +27,6 @@ void xnn_f32_vmax_ukernel__wasm_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -47,7 +46,6 @@ void xnn_f32_vmax_ukernel__wasm_u4( float vacc3 = __builtin_wasm_max_f32(va3, vb3); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vmax-wasm-u8.c b/src/f32-vbinary/gen/f32-vmax-wasm-u8.c index 957163b99d7..af3fbc71d50 100644 --- a/src/f32-vbinary/gen/f32-vmax-wasm-u8.c +++ b/src/f32-vbinary/gen/f32-vmax-wasm-u8.c @@ -27,7 +27,6 @@ void xnn_f32_vmax_ukernel__wasm_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -59,7 +58,6 @@ void xnn_f32_vmax_ukernel__wasm_u8( float vacc7 = __builtin_wasm_max_f32(va7, vb7); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u16.c b/src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u16.c index db869d0b76d..499e5e28835 100644 --- a/src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u16.c +++ b/src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u16.c @@ -28,7 
+28,6 @@ void xnn_f32_vmax_ukernel__wasmsimd_arm_u16( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); @@ -63,7 +62,6 @@ void xnn_f32_vmax_ukernel__wasmsimd_arm_u16( v128_t vacc = wasm_f32x4_max(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -73,7 +71,6 @@ void xnn_f32_vmax_ukernel__wasmsimd_arm_u16( v128_t vacc = wasm_f32x4_max(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u4.c b/src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u4.c index 692ef8c3425..7bcb990b669 100644 --- a/src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u4.c +++ b/src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u4.c @@ -28,7 +28,6 @@ void xnn_f32_vmax_ukernel__wasmsimd_arm_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; @@ -38,7 +37,6 @@ void xnn_f32_vmax_ukernel__wasmsimd_arm_u4( v128_t vacc = wasm_f32x4_max(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -48,7 +46,6 @@ void xnn_f32_vmax_ukernel__wasmsimd_arm_u4( v128_t vacc = wasm_f32x4_max(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u8.c b/src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u8.c index 5a08be49026..491591737da 100644 --- a/src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u8.c +++ b/src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-u8.c @@ -28,7 +28,6 @@ void xnn_f32_vmax_ukernel__wasmsimd_arm_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = 
wasm_v128_load(input_a + 4); @@ -55,7 +54,6 @@ void xnn_f32_vmax_ukernel__wasmsimd_arm_u8( v128_t vacc = wasm_f32x4_max(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -65,7 +63,6 @@ void xnn_f32_vmax_ukernel__wasmsimd_arm_u8( v128_t vacc = wasm_f32x4_max(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-u16.c b/src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-u16.c index 692a16d7fb8..65d84798967 100644 --- a/src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-u16.c +++ b/src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-u16.c @@ -28,7 +28,6 @@ void xnn_f32_vmax_ukernel__wasmsimd_x86_u16( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); @@ -63,7 +62,6 @@ void xnn_f32_vmax_ukernel__wasmsimd_x86_u16( v128_t vacc = wasm_f32x4_pmax(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -73,7 +71,6 @@ void xnn_f32_vmax_ukernel__wasmsimd_x86_u16( v128_t vacc = wasm_f32x4_pmax(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-u4.c b/src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-u4.c index 795dbf8f7b4..5bf9e3d16dc 100644 --- a/src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-u4.c +++ b/src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-u4.c @@ -28,7 +28,6 @@ void xnn_f32_vmax_ukernel__wasmsimd_x86_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; @@ -38,7 +37,6 @@ void xnn_f32_vmax_ukernel__wasmsimd_x86_u4( v128_t vacc = wasm_f32x4_pmax(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -48,7 +46,6 @@ void 
xnn_f32_vmax_ukernel__wasmsimd_x86_u4( v128_t vacc = wasm_f32x4_pmax(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-u8.c b/src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-u8.c index 49e8eedd325..f32a84e6d82 100644 --- a/src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-u8.c +++ b/src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-u8.c @@ -28,7 +28,6 @@ void xnn_f32_vmax_ukernel__wasmsimd_x86_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); @@ -55,7 +54,6 @@ void xnn_f32_vmax_ukernel__wasmsimd_x86_u8( v128_t vacc = wasm_f32x4_pmax(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -65,7 +63,6 @@ void xnn_f32_vmax_ukernel__wasmsimd_x86_u8( v128_t vacc = wasm_f32x4_pmax(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmaxc-avx-u16.c b/src/f32-vbinary/gen/f32-vmaxc-avx-u16.c index cd747906196..c90c6eb76cc 100644 --- a/src/f32-vbinary/gen/f32-vmaxc-avx-u16.c +++ b/src/f32-vbinary/gen/f32-vmaxc-avx-u16.c @@ -41,7 +41,6 @@ void xnn_f32_vmaxc_ukernel__avx_u16( __m256 vacc1 = _mm256_max_ps(va1, vb); - _mm256_storeu_ps(output, vacc0); _mm256_storeu_ps(output + 8, vacc1); output += 16; diff --git a/src/f32-vbinary/gen/f32-vmaxc-avx512f-u16.c b/src/f32-vbinary/gen/f32-vmaxc-avx512f-u16.c index 73709d392b0..39b5d63cde2 100644 --- a/src/f32-vbinary/gen/f32-vmaxc-avx512f-u16.c +++ b/src/f32-vbinary/gen/f32-vmaxc-avx512f-u16.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
+ #include #include @@ -38,7 +39,6 @@ void xnn_f32_vmaxc_ukernel__avx512f_u16( __m512 vacc0 = _mm512_max_ps(va0, vb); - _mm512_storeu_ps(output, vacc0); output += 16; } diff --git a/src/f32-vbinary/gen/f32-vmaxc-avx512f-u32.c b/src/f32-vbinary/gen/f32-vmaxc-avx512f-u32.c index 2651ac0a077..93869514394 100644 --- a/src/f32-vbinary/gen/f32-vmaxc-avx512f-u32.c +++ b/src/f32-vbinary/gen/f32-vmaxc-avx512f-u32.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. + #include #include @@ -40,7 +41,6 @@ void xnn_f32_vmaxc_ukernel__avx512f_u32( __m512 vacc1 = _mm512_max_ps(va1, vb); - _mm512_storeu_ps(output, vacc0); _mm512_storeu_ps(output + 16, vacc1); output += 32; @@ -51,7 +51,6 @@ void xnn_f32_vmaxc_ukernel__avx512f_u32( __m512 vacc = _mm512_max_ps(va, vb); - _mm512_storeu_ps(output, vacc); output += 16; } diff --git a/src/f32-vbinary/gen/f32-vmaxc-hvx-u128.c b/src/f32-vbinary/gen/f32-vmaxc-hvx-u128.c index 8d46994e575..ddac8ce884c 100644 --- a/src/f32-vbinary/gen/f32-vmaxc-hvx-u128.c +++ b/src/f32-vbinary/gen/f32-vmaxc-hvx-u128.c @@ -38,7 +38,6 @@ void xnn_f32_vmaxc_ukernel__hvx_u128( HVX_Vector vacc3 = xnn_max_f32(va3, vb); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); xnn_storeu_f32(output + 64, vacc2); diff --git a/src/f32-vbinary/gen/f32-vmaxc-hvx-u64.c b/src/f32-vbinary/gen/f32-vmaxc-hvx-u64.c index be9e37605f0..547fbfb69e6 100644 --- a/src/f32-vbinary/gen/f32-vmaxc-hvx-u64.c +++ b/src/f32-vbinary/gen/f32-vmaxc-hvx-u64.c @@ -34,7 +34,6 @@ void xnn_f32_vmaxc_ukernel__hvx_u64( HVX_Vector vacc1 = xnn_max_f32(va1, vb); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); output += 64; diff --git a/src/f32-vbinary/gen/f32-vmaxc-neon-u4.c b/src/f32-vbinary/gen/f32-vmaxc-neon-u4.c index 33f0d0e38cb..8114d453323 100644 --- a/src/f32-vbinary/gen/f32-vmaxc-neon-u4.c +++ b/src/f32-vbinary/gen/f32-vmaxc-neon-u4.c @@ -35,7 +35,6 @@ void 
xnn_f32_vmaxc_ukernel__neon_u4( float32x4_t vacc = vmaxq_f32(va, vb); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -43,7 +42,6 @@ void xnn_f32_vmaxc_ukernel__neon_u4( float32x4_t vacc = vmaxq_f32(va, vb); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vmaxc-neon-u8.c b/src/f32-vbinary/gen/f32-vmaxc-neon-u8.c index 4a2ea28ef9d..1cbe8059d97 100644 --- a/src/f32-vbinary/gen/f32-vmaxc-neon-u8.c +++ b/src/f32-vbinary/gen/f32-vmaxc-neon-u8.c @@ -38,7 +38,6 @@ void xnn_f32_vmaxc_ukernel__neon_u8( float32x4_t vacc1 = vmaxq_f32(va1, vb); - vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } @@ -47,7 +46,6 @@ void xnn_f32_vmaxc_ukernel__neon_u8( float32x4_t vacc = vmaxq_f32(va, vb); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -55,7 +53,6 @@ void xnn_f32_vmaxc_ukernel__neon_u8( float32x4_t vacc = vmaxq_f32(va, vb); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vmaxc-scalar-u2.c b/src/f32-vbinary/gen/f32-vmaxc-scalar-u2.c index e8c047bfe84..38e0b7f871b 100644 --- a/src/f32-vbinary/gen/f32-vmaxc-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vmaxc-scalar-u2.c @@ -38,7 +38,6 @@ void xnn_f32_vmaxc_ukernel__scalar_u2( float vacc1 = math_max_f32(va1, vb); - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vmaxc-scalar-u4.c b/src/f32-vbinary/gen/f32-vmaxc-scalar-u4.c index d8870f061a9..c1be97466ab 100644 --- a/src/f32-vbinary/gen/f32-vmaxc-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vmaxc-scalar-u4.c @@ -42,7 +42,6 @@ void xnn_f32_vmaxc_ukernel__scalar_u4( float vacc3 = math_max_f32(va3, vb); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vmaxc-scalar-u8.c b/src/f32-vbinary/gen/f32-vmaxc-scalar-u8.c 
index 838b7a00693..3fb5e8cf6ff 100644 --- a/src/f32-vbinary/gen/f32-vmaxc-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vmaxc-scalar-u8.c @@ -50,7 +50,6 @@ void xnn_f32_vmaxc_ukernel__scalar_u8( float vacc7 = math_max_f32(va7, vb); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vmaxc-sse-u8.c b/src/f32-vbinary/gen/f32-vmaxc-sse-u8.c index abed53c6161..93087df665c 100644 --- a/src/f32-vbinary/gen/f32-vmaxc-sse-u8.c +++ b/src/f32-vbinary/gen/f32-vmaxc-sse-u8.c @@ -40,7 +40,6 @@ void xnn_f32_vmaxc_ukernel__sse_u8( __m128 vacc1 = _mm_max_ps(va1, vb); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; diff --git a/src/f32-vbinary/gen/f32-vmaxc-wasm-u2.c b/src/f32-vbinary/gen/f32-vmaxc-wasm-u2.c index f83a8f8351c..03119a9eb81 100644 --- a/src/f32-vbinary/gen/f32-vmaxc-wasm-u2.c +++ b/src/f32-vbinary/gen/f32-vmaxc-wasm-u2.c @@ -38,7 +38,6 @@ void xnn_f32_vmaxc_ukernel__wasm_u2( float vacc1 = __builtin_wasm_max_f32(va1, vb); - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vmaxc-wasm-u4.c b/src/f32-vbinary/gen/f32-vmaxc-wasm-u4.c index 9bc1a8d1cbc..73de48ace6d 100644 --- a/src/f32-vbinary/gen/f32-vmaxc-wasm-u4.c +++ b/src/f32-vbinary/gen/f32-vmaxc-wasm-u4.c @@ -42,7 +42,6 @@ void xnn_f32_vmaxc_ukernel__wasm_u4( float vacc3 = __builtin_wasm_max_f32(va3, vb); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vmaxc-wasm-u8.c b/src/f32-vbinary/gen/f32-vmaxc-wasm-u8.c index eb1d9409f94..94bc6e69404 100644 --- a/src/f32-vbinary/gen/f32-vmaxc-wasm-u8.c +++ b/src/f32-vbinary/gen/f32-vmaxc-wasm-u8.c @@ -50,7 +50,6 @@ void xnn_f32_vmaxc_ukernel__wasm_u8( float vacc7 = __builtin_wasm_max_f32(va7, vb); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-u16.c b/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-u16.c index e98ff7c52b2..f0a37f5a8f7 100644 --- 
a/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-u16.c +++ b/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-u16.c @@ -43,7 +43,6 @@ void xnn_f32_vmaxc_ukernel__wasmsimd_arm_u16( v128_t vy3 = wasm_f32x4_max(va3, vb); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); @@ -56,7 +55,6 @@ void xnn_f32_vmaxc_ukernel__wasmsimd_arm_u16( v128_t vy = wasm_f32x4_max(va, vb); - wasm_v128_store(output, vy); output += 4; } @@ -65,7 +63,6 @@ void xnn_f32_vmaxc_ukernel__wasmsimd_arm_u16( v128_t vy = wasm_f32x4_max(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-u4.c b/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-u4.c index ef724d11e1b..3231abedaa1 100644 --- a/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-u4.c +++ b/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-u4.c @@ -36,7 +36,6 @@ void xnn_f32_vmaxc_ukernel__wasmsimd_arm_u4( v128_t vy = wasm_f32x4_max(va, vb); - wasm_v128_store(output, vy); output += 4; } @@ -45,7 +44,6 @@ void xnn_f32_vmaxc_ukernel__wasmsimd_arm_u4( v128_t vy = wasm_f32x4_max(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-u8.c b/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-u8.c index f8334347dbd..1a8a099f9e4 100644 --- a/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-u8.c +++ b/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-u8.c @@ -39,7 +39,6 @@ void xnn_f32_vmaxc_ukernel__wasmsimd_arm_u8( v128_t vy1 = wasm_f32x4_max(va1, vb); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); output += 8; @@ -50,7 +49,6 @@ void xnn_f32_vmaxc_ukernel__wasmsimd_arm_u8( v128_t vy = wasm_f32x4_max(va, vb); - wasm_v128_store(output, vy); output += 4; } @@ -59,7 +57,6 @@ void xnn_f32_vmaxc_ukernel__wasmsimd_arm_u8( v128_t vy = wasm_f32x4_max(va, vb); - if (batch & (2 * 
sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-u16.c b/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-u16.c index d44a7fd88db..37154b6163f 100644 --- a/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-u16.c +++ b/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-u16.c @@ -43,7 +43,6 @@ void xnn_f32_vmaxc_ukernel__wasmsimd_x86_u16( v128_t vy3 = wasm_f32x4_pmax(vb, va3); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); @@ -56,7 +55,6 @@ void xnn_f32_vmaxc_ukernel__wasmsimd_x86_u16( v128_t vy = wasm_f32x4_pmax(vb, va); - wasm_v128_store(output, vy); output += 4; } @@ -65,7 +63,6 @@ void xnn_f32_vmaxc_ukernel__wasmsimd_x86_u16( v128_t vy = wasm_f32x4_pmax(vb, va); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-u4.c b/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-u4.c index e6f2d0b888a..2763c8682cc 100644 --- a/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-u4.c +++ b/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-u4.c @@ -36,7 +36,6 @@ void xnn_f32_vmaxc_ukernel__wasmsimd_x86_u4( v128_t vy = wasm_f32x4_pmax(vb, va); - wasm_v128_store(output, vy); output += 4; } @@ -45,7 +44,6 @@ void xnn_f32_vmaxc_ukernel__wasmsimd_x86_u4( v128_t vy = wasm_f32x4_pmax(vb, va); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-u8.c b/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-u8.c index b7f0aaadb1e..4e65650b1a3 100644 --- a/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-u8.c +++ b/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-u8.c @@ -39,7 +39,6 @@ void xnn_f32_vmaxc_ukernel__wasmsimd_x86_u8( v128_t vy1 = wasm_f32x4_pmax(vb, va1); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); output += 8; @@ -50,7 +49,6 @@ 
void xnn_f32_vmaxc_ukernel__wasmsimd_x86_u8( v128_t vy = wasm_f32x4_pmax(vb, va); - wasm_v128_store(output, vy); output += 4; } @@ -59,7 +57,6 @@ void xnn_f32_vmaxc_ukernel__wasmsimd_x86_u8( v128_t vy = wasm_f32x4_pmax(vb, va); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmin-avx-u16.c b/src/f32-vbinary/gen/f32-vmin-avx-u16.c index a9da325c7b4..5a08eb9c9e0 100644 --- a/src/f32-vbinary/gen/f32-vmin-avx-u16.c +++ b/src/f32-vbinary/gen/f32-vmin-avx-u16.c @@ -30,7 +30,6 @@ void xnn_f32_vmin_ukernel__avx_u16( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m256 va0 = _mm256_loadu_ps(input_a); const __m256 va1 = _mm256_loadu_ps(input_a + 8); @@ -41,7 +40,6 @@ void xnn_f32_vmin_ukernel__avx_u16( input_b += 16; - _mm256_storeu_ps(output, vacc0); _mm256_storeu_ps(output + 8, vacc1); output += 16; diff --git a/src/f32-vbinary/gen/f32-vmin-avx-u8.c b/src/f32-vbinary/gen/f32-vmin-avx-u8.c index 3cd83889929..70b66fe7d41 100644 --- a/src/f32-vbinary/gen/f32-vmin-avx-u8.c +++ b/src/f32-vbinary/gen/f32-vmin-avx-u8.c @@ -30,7 +30,6 @@ void xnn_f32_vmin_ukernel__avx_u8( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m256 va = _mm256_loadu_ps(input_a); input_a += 8; diff --git a/src/f32-vbinary/gen/f32-vmin-avx512f-u16.c b/src/f32-vbinary/gen/f32-vmin-avx512f-u16.c index 464c2cdefb0..fc42ee037a5 100644 --- a/src/f32-vbinary/gen/f32-vmin-avx512f-u16.c +++ b/src/f32-vbinary/gen/f32-vmin-avx512f-u16.c @@ -30,7 +30,6 @@ void xnn_f32_vmin_ukernel__avx512f_u16( assert(output != NULL); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m512 va = _mm512_loadu_ps(input_a); input_a += 16; @@ -39,7 +38,6 @@ void 
xnn_f32_vmin_ukernel__avx512f_u16( input_b += 16; - _mm512_storeu_ps(output, vacc); output += 16; } diff --git a/src/f32-vbinary/gen/f32-vmin-avx512f-u32.c b/src/f32-vbinary/gen/f32-vmin-avx512f-u32.c index df1f0516340..a669b8117e4 100644 --- a/src/f32-vbinary/gen/f32-vmin-avx512f-u32.c +++ b/src/f32-vbinary/gen/f32-vmin-avx512f-u32.c @@ -30,7 +30,6 @@ void xnn_f32_vmin_ukernel__avx512f_u32( assert(output != NULL); - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { const __m512 va0 = _mm512_loadu_ps(input_a); const __m512 va1 = _mm512_loadu_ps(input_a + 16); @@ -41,7 +40,6 @@ void xnn_f32_vmin_ukernel__avx512f_u32( input_b += 32; - _mm512_storeu_ps(output, vacc0); _mm512_storeu_ps(output + 16, vacc1); output += 32; @@ -54,7 +52,6 @@ void xnn_f32_vmin_ukernel__avx512f_u32( input_b += 16; - _mm512_storeu_ps(output, vacc); output += 16; } diff --git a/src/f32-vbinary/gen/f32-vmin-hvx-u128.c b/src/f32-vbinary/gen/f32-vmin-hvx-u128.c index 26e4f3030c0..bc2e64a9acd 100644 --- a/src/f32-vbinary/gen/f32-vmin-hvx-u128.c +++ b/src/f32-vbinary/gen/f32-vmin-hvx-u128.c @@ -23,7 +23,6 @@ void xnn_f32_vmin_ukernel__hvx_u128( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) { HVX_Vector va0 = xnn_loadu_f32(input_a); HVX_Vector va1 = xnn_loadu_f32(input_a + 32); @@ -42,7 +41,6 @@ void xnn_f32_vmin_ukernel__hvx_u128( HVX_Vector vacc3 = xnn_min_f32(va3, vb3); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); xnn_storeu_f32(output + 64, vacc2); @@ -65,7 +63,7 @@ void xnn_f32_vmin_ukernel__hvx_u128( HVX_Vector vb = xnn_loadu_f32(input_b); HVX_Vector vacc = xnn_min_f32(va, vb); - + Q6_V_vstu_variable(output, batch, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vmin-hvx-u32.c b/src/f32-vbinary/gen/f32-vmin-hvx-u32.c index 2ab7d50b970..b2bfbb13e4a 100644 --- a/src/f32-vbinary/gen/f32-vmin-hvx-u32.c +++ b/src/f32-vbinary/gen/f32-vmin-hvx-u32.c @@ -23,7 +23,6 @@ void 
xnn_f32_vmin_ukernel__hvx_u32( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { HVX_Vector va = xnn_loadu_f32(input_a); HVX_Vector vb = xnn_loadu_f32(input_b); @@ -40,7 +39,7 @@ void xnn_f32_vmin_ukernel__hvx_u32( HVX_Vector vb = xnn_loadu_f32(input_b); HVX_Vector vacc = xnn_min_f32(va, vb); - + Q6_V_vstu_variable(output, batch, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vmin-hvx-u64.c b/src/f32-vbinary/gen/f32-vmin-hvx-u64.c index e792d847ab2..498b408e21b 100644 --- a/src/f32-vbinary/gen/f32-vmin-hvx-u64.c +++ b/src/f32-vbinary/gen/f32-vmin-hvx-u64.c @@ -23,7 +23,6 @@ void xnn_f32_vmin_ukernel__hvx_u64( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { HVX_Vector va0 = xnn_loadu_f32(input_a); HVX_Vector va1 = xnn_loadu_f32(input_a + 32); @@ -36,7 +35,6 @@ void xnn_f32_vmin_ukernel__hvx_u64( HVX_Vector vacc1 = xnn_min_f32(va1, vb1); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); output += 64; @@ -57,7 +55,7 @@ void xnn_f32_vmin_ukernel__hvx_u64( HVX_Vector vb = xnn_loadu_f32(input_b); HVX_Vector vacc = xnn_min_f32(va, vb); - + Q6_V_vstu_variable(output, batch, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vmin-neon-u4.c b/src/f32-vbinary/gen/f32-vmin-neon-u4.c index ac8cd4c97ee..3a2aa3640b4 100644 --- a/src/f32-vbinary/gen/f32-vmin-neon-u4.c +++ b/src/f32-vbinary/gen/f32-vmin-neon-u4.c @@ -28,14 +28,12 @@ void xnn_f32_vmin_ukernel__neon_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; const float32x4_t vb = vld1q_f32(input_b); input_b += 4; float32x4_t vacc = vminq_f32(va, vb); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -44,7 +42,6 @@ void xnn_f32_vmin_ukernel__neon_u4( float32x4_t vacc = vminq_f32(va, vb); - float32x2_t vacc_lo = 
vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vmin-neon-u8.c b/src/f32-vbinary/gen/f32-vmin-neon-u8.c index 30514f34fda..3cdc649ef06 100644 --- a/src/f32-vbinary/gen/f32-vmin-neon-u8.c +++ b/src/f32-vbinary/gen/f32-vmin-neon-u8.c @@ -28,7 +28,6 @@ void xnn_f32_vmin_ukernel__neon_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb0 = vld1q_f32(input_b); input_b += 4; @@ -39,7 +38,6 @@ void xnn_f32_vmin_ukernel__neon_u8( float32x4_t vacc1 = vminq_f32(va1, vb1); - vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } @@ -49,7 +47,6 @@ void xnn_f32_vmin_ukernel__neon_u8( float32x4_t vacc = vminq_f32(va, vb); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -58,7 +55,6 @@ void xnn_f32_vmin_ukernel__neon_u8( float32x4_t vacc = vminq_f32(va, vb); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vmin-scalar-u1.c b/src/f32-vbinary/gen/f32-vmin-scalar-u1.c index 94b1b282061..57e6531681f 100644 --- a/src/f32-vbinary/gen/f32-vmin-scalar-u1.c +++ b/src/f32-vbinary/gen/f32-vmin-scalar-u1.c @@ -27,7 +27,6 @@ void xnn_f32_vmin_ukernel__scalar_u1( assert(input_b != NULL); assert(output != NULL); - for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; const float vb = *input_b++; diff --git a/src/f32-vbinary/gen/f32-vmin-scalar-u2.c b/src/f32-vbinary/gen/f32-vmin-scalar-u2.c index 0ee9c89ca9f..12edef1b7c3 100644 --- a/src/f32-vbinary/gen/f32-vmin-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vmin-scalar-u2.c @@ -27,7 +27,6 @@ void xnn_f32_vmin_ukernel__scalar_u2( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 2 * sizeof(float); batch -= 2 * 
sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -41,7 +40,6 @@ void xnn_f32_vmin_ukernel__scalar_u2( float vacc1 = math_min_f32(va1, vb1); - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vmin-scalar-u4.c b/src/f32-vbinary/gen/f32-vmin-scalar-u4.c index 4304cc8bc36..b7b93ec7963 100644 --- a/src/f32-vbinary/gen/f32-vmin-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vmin-scalar-u4.c @@ -27,7 +27,6 @@ void xnn_f32_vmin_ukernel__scalar_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -47,7 +46,6 @@ void xnn_f32_vmin_ukernel__scalar_u4( float vacc3 = math_min_f32(va3, vb3); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vmin-scalar-u8.c b/src/f32-vbinary/gen/f32-vmin-scalar-u8.c index 58227479d17..e06c67d3779 100644 --- a/src/f32-vbinary/gen/f32-vmin-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vmin-scalar-u8.c @@ -27,7 +27,6 @@ void xnn_f32_vmin_ukernel__scalar_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -59,7 +58,6 @@ void xnn_f32_vmin_ukernel__scalar_u8( float vacc7 = math_min_f32(va7, vb7); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vmin-sse-u4.c b/src/f32-vbinary/gen/f32-vmin-sse-u4.c index 570a046c7d8..eaa9155b6e4 100644 --- a/src/f32-vbinary/gen/f32-vmin-sse-u4.c +++ b/src/f32-vbinary/gen/f32-vmin-sse-u4.c @@ -29,7 +29,6 @@ void xnn_f32_vmin_ukernel__sse_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const __m128 va = _mm_loadu_ps(input_a); input_a += 4; diff --git a/src/f32-vbinary/gen/f32-vmin-sse-u8.c b/src/f32-vbinary/gen/f32-vmin-sse-u8.c index 
c2fc9cc0807..222caf895f0 100644 --- a/src/f32-vbinary/gen/f32-vmin-sse-u8.c +++ b/src/f32-vbinary/gen/f32-vmin-sse-u8.c @@ -29,7 +29,6 @@ void xnn_f32_vmin_ukernel__sse_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m128 va0 = _mm_loadu_ps(input_a); const __m128 va1 = _mm_loadu_ps(input_a + 4); @@ -43,7 +42,6 @@ void xnn_f32_vmin_ukernel__sse_u8( __m128 vacc1 = _mm_min_ps(va1, vb1); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; diff --git a/src/f32-vbinary/gen/f32-vmin-wasm-u1.c b/src/f32-vbinary/gen/f32-vmin-wasm-u1.c index 93c3dab6808..028dca7f1be 100644 --- a/src/f32-vbinary/gen/f32-vmin-wasm-u1.c +++ b/src/f32-vbinary/gen/f32-vmin-wasm-u1.c @@ -27,7 +27,6 @@ void xnn_f32_vmin_ukernel__wasm_u1( assert(input_b != NULL); assert(output != NULL); - for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; const float vb = *input_b++; diff --git a/src/f32-vbinary/gen/f32-vmin-wasm-u2.c b/src/f32-vbinary/gen/f32-vmin-wasm-u2.c index f53bff5d0d4..936a071ee02 100644 --- a/src/f32-vbinary/gen/f32-vmin-wasm-u2.c +++ b/src/f32-vbinary/gen/f32-vmin-wasm-u2.c @@ -27,7 +27,6 @@ void xnn_f32_vmin_ukernel__wasm_u2( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -41,7 +40,6 @@ void xnn_f32_vmin_ukernel__wasm_u2( float vacc1 = __builtin_wasm_min_f32(va1, vb1); - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vmin-wasm-u4.c b/src/f32-vbinary/gen/f32-vmin-wasm-u4.c index 86250a412a0..06d7684ad13 100644 --- a/src/f32-vbinary/gen/f32-vmin-wasm-u4.c +++ b/src/f32-vbinary/gen/f32-vmin-wasm-u4.c @@ -27,7 +27,6 @@ void xnn_f32_vmin_ukernel__wasm_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float va0 
= input_a[0]; const float va1 = input_a[1]; @@ -47,7 +46,6 @@ void xnn_f32_vmin_ukernel__wasm_u4( float vacc3 = __builtin_wasm_min_f32(va3, vb3); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vmin-wasm-u8.c b/src/f32-vbinary/gen/f32-vmin-wasm-u8.c index 2c70ba8ff35..555eec3ff37 100644 --- a/src/f32-vbinary/gen/f32-vmin-wasm-u8.c +++ b/src/f32-vbinary/gen/f32-vmin-wasm-u8.c @@ -27,7 +27,6 @@ void xnn_f32_vmin_ukernel__wasm_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -59,7 +58,6 @@ void xnn_f32_vmin_ukernel__wasm_u8( float vacc7 = __builtin_wasm_min_f32(va7, vb7); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-u16.c b/src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-u16.c index 8e821286e4b..e2c6435b8f6 100644 --- a/src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-u16.c +++ b/src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-u16.c @@ -28,7 +28,6 @@ void xnn_f32_vmin_ukernel__wasmsimd_arm_u16( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); @@ -63,7 +62,6 @@ void xnn_f32_vmin_ukernel__wasmsimd_arm_u16( v128_t vacc = wasm_f32x4_min(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -73,7 +71,6 @@ void xnn_f32_vmin_ukernel__wasmsimd_arm_u16( v128_t vacc = wasm_f32x4_min(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-u4.c b/src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-u4.c index 3104d0bd797..2a19bf2507b 100644 --- a/src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-u4.c +++ b/src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-u4.c @@ -28,7 +28,6 @@ void 
xnn_f32_vmin_ukernel__wasmsimd_arm_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; @@ -38,7 +37,6 @@ void xnn_f32_vmin_ukernel__wasmsimd_arm_u4( v128_t vacc = wasm_f32x4_min(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -48,7 +46,6 @@ void xnn_f32_vmin_ukernel__wasmsimd_arm_u4( v128_t vacc = wasm_f32x4_min(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-u8.c b/src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-u8.c index 155f48e2ec9..2999cc88f13 100644 --- a/src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-u8.c +++ b/src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-u8.c @@ -28,7 +28,6 @@ void xnn_f32_vmin_ukernel__wasmsimd_arm_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); @@ -55,7 +54,6 @@ void xnn_f32_vmin_ukernel__wasmsimd_arm_u8( v128_t vacc = wasm_f32x4_min(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -65,7 +63,6 @@ void xnn_f32_vmin_ukernel__wasmsimd_arm_u8( v128_t vacc = wasm_f32x4_min(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-u16.c b/src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-u16.c index df432d88504..6260ae5b3e9 100644 --- a/src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-u16.c +++ b/src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-u16.c @@ -28,7 +28,6 @@ void xnn_f32_vmin_ukernel__wasmsimd_x86_u16( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = 
wasm_v128_load(input_a + 4); @@ -63,7 +62,6 @@ void xnn_f32_vmin_ukernel__wasmsimd_x86_u16( v128_t vacc = wasm_f32x4_pmin(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -73,7 +71,6 @@ void xnn_f32_vmin_ukernel__wasmsimd_x86_u16( v128_t vacc = wasm_f32x4_pmin(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-u4.c b/src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-u4.c index bb971aed7ee..607ca4fea40 100644 --- a/src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-u4.c +++ b/src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-u4.c @@ -28,7 +28,6 @@ void xnn_f32_vmin_ukernel__wasmsimd_x86_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; @@ -38,7 +37,6 @@ void xnn_f32_vmin_ukernel__wasmsimd_x86_u4( v128_t vacc = wasm_f32x4_pmin(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -48,7 +46,6 @@ void xnn_f32_vmin_ukernel__wasmsimd_x86_u4( v128_t vacc = wasm_f32x4_pmin(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-u8.c b/src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-u8.c index 8555959a3c4..108387a0a18 100644 --- a/src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-u8.c +++ b/src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-u8.c @@ -28,7 +28,6 @@ void xnn_f32_vmin_ukernel__wasmsimd_x86_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); @@ -55,7 +54,6 @@ void xnn_f32_vmin_ukernel__wasmsimd_x86_u8( v128_t vacc = wasm_f32x4_pmin(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -65,7 +63,6 @@ void xnn_f32_vmin_ukernel__wasmsimd_x86_u8( 
v128_t vacc = wasm_f32x4_pmin(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vminc-avx-u16.c b/src/f32-vbinary/gen/f32-vminc-avx-u16.c index 1cb8620ba6c..e8eb85dc9fb 100644 --- a/src/f32-vbinary/gen/f32-vminc-avx-u16.c +++ b/src/f32-vbinary/gen/f32-vminc-avx-u16.c @@ -41,7 +41,6 @@ void xnn_f32_vminc_ukernel__avx_u16( __m256 vacc1 = _mm256_min_ps(va1, vb); - _mm256_storeu_ps(output, vacc0); _mm256_storeu_ps(output + 8, vacc1); output += 16; diff --git a/src/f32-vbinary/gen/f32-vminc-avx512f-u16.c b/src/f32-vbinary/gen/f32-vminc-avx512f-u16.c index 5ad822f355a..0ec59884de0 100644 --- a/src/f32-vbinary/gen/f32-vminc-avx512f-u16.c +++ b/src/f32-vbinary/gen/f32-vminc-avx512f-u16.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. + #include #include @@ -38,7 +39,6 @@ void xnn_f32_vminc_ukernel__avx512f_u16( __m512 vacc0 = _mm512_min_ps(va0, vb); - _mm512_storeu_ps(output, vacc0); output += 16; } diff --git a/src/f32-vbinary/gen/f32-vminc-avx512f-u32.c b/src/f32-vbinary/gen/f32-vminc-avx512f-u32.c index b84fb8a03ba..0d460564670 100644 --- a/src/f32-vbinary/gen/f32-vminc-avx512f-u32.c +++ b/src/f32-vbinary/gen/f32-vminc-avx512f-u32.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
+ #include #include @@ -40,7 +41,6 @@ void xnn_f32_vminc_ukernel__avx512f_u32( __m512 vacc1 = _mm512_min_ps(va1, vb); - _mm512_storeu_ps(output, vacc0); _mm512_storeu_ps(output + 16, vacc1); output += 32; @@ -51,7 +51,6 @@ void xnn_f32_vminc_ukernel__avx512f_u32( __m512 vacc = _mm512_min_ps(va, vb); - _mm512_storeu_ps(output, vacc); output += 16; } diff --git a/src/f32-vbinary/gen/f32-vminc-hvx-u128.c b/src/f32-vbinary/gen/f32-vminc-hvx-u128.c index c716021d0f0..9382c149aa0 100644 --- a/src/f32-vbinary/gen/f32-vminc-hvx-u128.c +++ b/src/f32-vbinary/gen/f32-vminc-hvx-u128.c @@ -38,7 +38,6 @@ void xnn_f32_vminc_ukernel__hvx_u128( HVX_Vector vacc3 = xnn_min_f32(va3, vb); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); xnn_storeu_f32(output + 64, vacc2); diff --git a/src/f32-vbinary/gen/f32-vminc-hvx-u64.c b/src/f32-vbinary/gen/f32-vminc-hvx-u64.c index e94295a901a..98f3f94080d 100644 --- a/src/f32-vbinary/gen/f32-vminc-hvx-u64.c +++ b/src/f32-vbinary/gen/f32-vminc-hvx-u64.c @@ -34,7 +34,6 @@ void xnn_f32_vminc_ukernel__hvx_u64( HVX_Vector vacc1 = xnn_min_f32(va1, vb); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); output += 64; diff --git a/src/f32-vbinary/gen/f32-vminc-neon-u4.c b/src/f32-vbinary/gen/f32-vminc-neon-u4.c index 2b36b04cb8f..ca93c2d99a7 100644 --- a/src/f32-vbinary/gen/f32-vminc-neon-u4.c +++ b/src/f32-vbinary/gen/f32-vminc-neon-u4.c @@ -35,7 +35,6 @@ void xnn_f32_vminc_ukernel__neon_u4( float32x4_t vacc = vminq_f32(va, vb); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -43,7 +42,6 @@ void xnn_f32_vminc_ukernel__neon_u4( float32x4_t vacc = vminq_f32(va, vb); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vminc-neon-u8.c b/src/f32-vbinary/gen/f32-vminc-neon-u8.c index 81cd89018d2..60a3e966464 100644 --- a/src/f32-vbinary/gen/f32-vminc-neon-u8.c +++ 
b/src/f32-vbinary/gen/f32-vminc-neon-u8.c @@ -38,7 +38,6 @@ void xnn_f32_vminc_ukernel__neon_u8( float32x4_t vacc1 = vminq_f32(va1, vb); - vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } @@ -47,7 +46,6 @@ void xnn_f32_vminc_ukernel__neon_u8( float32x4_t vacc = vminq_f32(va, vb); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -55,7 +53,6 @@ void xnn_f32_vminc_ukernel__neon_u8( float32x4_t vacc = vminq_f32(va, vb); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vminc-scalar-u2.c b/src/f32-vbinary/gen/f32-vminc-scalar-u2.c index 5e6d59f5cdd..c752cd082d0 100644 --- a/src/f32-vbinary/gen/f32-vminc-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vminc-scalar-u2.c @@ -38,7 +38,6 @@ void xnn_f32_vminc_ukernel__scalar_u2( float vacc1 = math_min_f32(va1, vb); - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vminc-scalar-u4.c b/src/f32-vbinary/gen/f32-vminc-scalar-u4.c index 6fa073c469f..2b7ce138e78 100644 --- a/src/f32-vbinary/gen/f32-vminc-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vminc-scalar-u4.c @@ -42,7 +42,6 @@ void xnn_f32_vminc_ukernel__scalar_u4( float vacc3 = math_min_f32(va3, vb); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vminc-scalar-u8.c b/src/f32-vbinary/gen/f32-vminc-scalar-u8.c index e80d23ae16c..6db2b81c50c 100644 --- a/src/f32-vbinary/gen/f32-vminc-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vminc-scalar-u8.c @@ -50,7 +50,6 @@ void xnn_f32_vminc_ukernel__scalar_u8( float vacc7 = math_min_f32(va7, vb); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vminc-sse-u8.c b/src/f32-vbinary/gen/f32-vminc-sse-u8.c index 8004b0a6a7e..e1bcad5a19d 100644 --- a/src/f32-vbinary/gen/f32-vminc-sse-u8.c +++ b/src/f32-vbinary/gen/f32-vminc-sse-u8.c @@ -40,7 +40,6 @@ void 
xnn_f32_vminc_ukernel__sse_u8( __m128 vacc1 = _mm_min_ps(va1, vb); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; diff --git a/src/f32-vbinary/gen/f32-vminc-wasm-u2.c b/src/f32-vbinary/gen/f32-vminc-wasm-u2.c index bb1a2768e61..b4d26b5b4db 100644 --- a/src/f32-vbinary/gen/f32-vminc-wasm-u2.c +++ b/src/f32-vbinary/gen/f32-vminc-wasm-u2.c @@ -38,7 +38,6 @@ void xnn_f32_vminc_ukernel__wasm_u2( float vacc1 = __builtin_wasm_min_f32(va1, vb); - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vminc-wasm-u4.c b/src/f32-vbinary/gen/f32-vminc-wasm-u4.c index 50172bc3686..81e8f9e055a 100644 --- a/src/f32-vbinary/gen/f32-vminc-wasm-u4.c +++ b/src/f32-vbinary/gen/f32-vminc-wasm-u4.c @@ -42,7 +42,6 @@ void xnn_f32_vminc_ukernel__wasm_u4( float vacc3 = __builtin_wasm_min_f32(va3, vb); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vminc-wasm-u8.c b/src/f32-vbinary/gen/f32-vminc-wasm-u8.c index 134971db67b..3ce013c0363 100644 --- a/src/f32-vbinary/gen/f32-vminc-wasm-u8.c +++ b/src/f32-vbinary/gen/f32-vminc-wasm-u8.c @@ -50,7 +50,6 @@ void xnn_f32_vminc_ukernel__wasm_u8( float vacc7 = __builtin_wasm_min_f32(va7, vb); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u16.c b/src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u16.c index 9c75ab45d94..875dd7c228a 100644 --- a/src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u16.c +++ b/src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u16.c @@ -43,7 +43,6 @@ void xnn_f32_vminc_ukernel__wasmsimd_arm_u16( v128_t vy3 = wasm_f32x4_min(va3, vb); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); @@ -56,7 +55,6 @@ void xnn_f32_vminc_ukernel__wasmsimd_arm_u16( v128_t vy = wasm_f32x4_min(va, vb); - wasm_v128_store(output, vy); output += 4; } @@ -65,7 +63,6 @@ void xnn_f32_vminc_ukernel__wasmsimd_arm_u16( v128_t vy = 
wasm_f32x4_min(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u4.c b/src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u4.c index 07c15d33ebd..b17b2eb2331 100644 --- a/src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u4.c +++ b/src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u4.c @@ -36,7 +36,6 @@ void xnn_f32_vminc_ukernel__wasmsimd_arm_u4( v128_t vy = wasm_f32x4_min(va, vb); - wasm_v128_store(output, vy); output += 4; } @@ -45,7 +44,6 @@ void xnn_f32_vminc_ukernel__wasmsimd_arm_u4( v128_t vy = wasm_f32x4_min(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u8.c b/src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u8.c index 508bb523f90..98e7ae0cc58 100644 --- a/src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u8.c +++ b/src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-u8.c @@ -39,7 +39,6 @@ void xnn_f32_vminc_ukernel__wasmsimd_arm_u8( v128_t vy1 = wasm_f32x4_min(va1, vb); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); output += 8; @@ -50,7 +49,6 @@ void xnn_f32_vminc_ukernel__wasmsimd_arm_u8( v128_t vy = wasm_f32x4_min(va, vb); - wasm_v128_store(output, vy); output += 4; } @@ -59,7 +57,6 @@ void xnn_f32_vminc_ukernel__wasmsimd_arm_u8( v128_t vy = wasm_f32x4_min(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u16.c b/src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u16.c index 52a2760804a..872d6e12ffc 100644 --- a/src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u16.c +++ b/src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u16.c @@ -43,7 +43,6 @@ void xnn_f32_vminc_ukernel__wasmsimd_x86_u16( v128_t vy3 = wasm_f32x4_pmin(vb, va3); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); 
wasm_v128_store(output + 8, vy2); @@ -56,7 +55,6 @@ void xnn_f32_vminc_ukernel__wasmsimd_x86_u16( v128_t vy = wasm_f32x4_pmin(vb, va); - wasm_v128_store(output, vy); output += 4; } @@ -65,7 +63,6 @@ void xnn_f32_vminc_ukernel__wasmsimd_x86_u16( v128_t vy = wasm_f32x4_pmin(vb, va); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u4.c b/src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u4.c index 0485052be75..00b4dea8273 100644 --- a/src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u4.c +++ b/src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u4.c @@ -36,7 +36,6 @@ void xnn_f32_vminc_ukernel__wasmsimd_x86_u4( v128_t vy = wasm_f32x4_pmin(vb, va); - wasm_v128_store(output, vy); output += 4; } @@ -45,7 +44,6 @@ void xnn_f32_vminc_ukernel__wasmsimd_x86_u4( v128_t vy = wasm_f32x4_pmin(vb, va); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u8.c b/src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u8.c index d4584b71768..f5a58d50382 100644 --- a/src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u8.c +++ b/src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u8.c @@ -39,7 +39,6 @@ void xnn_f32_vminc_ukernel__wasmsimd_x86_u8( v128_t vy1 = wasm_f32x4_pmin(vb, va1); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); output += 8; @@ -50,7 +49,6 @@ void xnn_f32_vminc_ukernel__wasmsimd_x86_u8( v128_t vy = wasm_f32x4_pmin(vb, va); - wasm_v128_store(output, vy); output += 4; } @@ -59,7 +57,6 @@ void xnn_f32_vminc_ukernel__wasmsimd_x86_u8( v128_t vy = wasm_f32x4_pmin(vb, va); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-avx-u16.c b/src/f32-vbinary/gen/f32-vmul-avx-u16.c similarity index 76% rename from src/f32-vbinary/gen/f32-vmul-minmax-avx-u16.c rename 
to src/f32-vbinary/gen/f32-vmul-avx-u16.c index 9556fe0db1d..1b0a52106de 100644 --- a/src/f32-vbinary/gen/f32-vmul-minmax-avx-u16.c +++ b/src/f32-vbinary/gen/f32-vmul-avx-u16.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmul_minmax_ukernel__avx_u16( +void xnn_f32_vmul_ukernel__avx_u16( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -30,11 +30,6 @@ void xnn_f32_vmul_minmax_ukernel__avx_u16( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m256 va0 = _mm256_loadu_ps(input_a); const __m256 va1 = _mm256_loadu_ps(input_a + 8); @@ -45,12 +40,6 @@ void xnn_f32_vmul_minmax_ukernel__avx_u16( input_b += 16; - vacc0 = _mm256_max_ps(voutput_min, vacc0); - vacc1 = _mm256_max_ps(voutput_min, vacc1); - - vacc0 = _mm256_min_ps(voutput_max, vacc0); - vacc1 = _mm256_min_ps(voutput_max, vacc1); - _mm256_storeu_ps(output, vacc0); _mm256_storeu_ps(output + 8, vacc1); output += 16; @@ -61,8 +50,6 @@ void xnn_f32_vmul_minmax_ukernel__avx_u16( __m256 vacc = _mm256_mul_ps(va, _mm256_loadu_ps(input_b)); input_b += 8; - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); _mm256_storeu_ps(output, vacc); output += 8; } @@ -75,8 +62,6 @@ void xnn_f32_vmul_minmax_ukernel__avx_u16( const __m256 vb = _mm256_maskload_ps(input_b, vmask); __m256 vacc = _mm256_mul_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); __m128 vacc_lo = 
_mm256_castps256_ps128(vacc); if (batch & (4 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-avx-u8.c b/src/f32-vbinary/gen/f32-vmul-avx-u8.c similarity index 78% rename from src/f32-vbinary/gen/f32-vmul-minmax-avx-u8.c rename to src/f32-vbinary/gen/f32-vmul-avx-u8.c index d95ea3f2b23..5fbb7985ae8 100644 --- a/src/f32-vbinary/gen/f32-vmul-minmax-avx-u8.c +++ b/src/f32-vbinary/gen/f32-vmul-avx-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmul_minmax_ukernel__avx_u8( +void xnn_f32_vmul_ukernel__avx_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -30,19 +30,12 @@ void xnn_f32_vmul_minmax_ukernel__avx_u8( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m256 va = _mm256_loadu_ps(input_a); input_a += 8; __m256 vacc = _mm256_mul_ps(va, _mm256_loadu_ps(input_b)); input_b += 8; - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); _mm256_storeu_ps(output, vacc); output += 8; } @@ -55,8 +48,6 @@ void xnn_f32_vmul_minmax_ukernel__avx_u8( const __m256 vb = _mm256_maskload_ps(input_b, vmask); __m256 vacc = _mm256_mul_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); __m128 vacc_lo = _mm256_castps256_ps128(vacc); if (batch & (4 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-avx512f-u16.c b/src/f32-vbinary/gen/f32-vmul-avx512f-u16.c similarity index 75% rename from 
src/f32-vbinary/gen/f32-vmul-minmax-avx512f-u16.c rename to src/f32-vbinary/gen/f32-vmul-avx512f-u16.c index 8198be9cab4..53a38980332 100644 --- a/src/f32-vbinary/gen/f32-vmul-minmax-avx512f-u16.c +++ b/src/f32-vbinary/gen/f32-vmul-avx512f-u16.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmul_minmax_ukernel__avx512f_u16( +void xnn_f32_vmul_ukernel__avx512f_u16( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,9 +29,6 @@ void xnn_f32_vmul_minmax_ukernel__avx512f_u16( assert(input_b != NULL); assert(output != NULL); - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m512 va = _mm512_loadu_ps(input_a); @@ -41,9 +38,6 @@ void xnn_f32_vmul_minmax_ukernel__avx512f_u16( input_b += 16; - vacc = _mm512_max_ps(voutput_min, vacc); - vacc = _mm512_min_ps(voutput_max, vacc); - _mm512_storeu_ps(output, vacc); output += 16; } @@ -57,8 +51,6 @@ void xnn_f32_vmul_minmax_ukernel__avx512f_u16( const __m512 va = _mm512_maskz_loadu_ps(vmask, input_a); __m512 vacc = _mm512_maskz_mul_ps(vmask, va, _mm512_maskz_loadu_ps(vmask, input_b)); - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-avx512f-u32.c b/src/f32-vbinary/gen/f32-vmul-avx512f-u32.c similarity index 74% rename from src/f32-vbinary/gen/f32-vmul-minmax-avx512f-u32.c rename to src/f32-vbinary/gen/f32-vmul-avx512f-u32.c index 1981bc43111..3b7f3a1b2c5 100644 --- a/src/f32-vbinary/gen/f32-vmul-minmax-avx512f-u32.c +++ b/src/f32-vbinary/gen/f32-vmul-avx512f-u32.c @@ 
-16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmul_minmax_ukernel__avx512f_u32( +void xnn_f32_vmul_ukernel__avx512f_u32( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,9 +29,6 @@ void xnn_f32_vmul_minmax_ukernel__avx512f_u32( assert(input_b != NULL); assert(output != NULL); - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { const __m512 va0 = _mm512_loadu_ps(input_a); @@ -43,12 +40,6 @@ void xnn_f32_vmul_minmax_ukernel__avx512f_u32( input_b += 32; - vacc0 = _mm512_max_ps(voutput_min, vacc0); - vacc1 = _mm512_max_ps(voutput_min, vacc1); - - vacc0 = _mm512_min_ps(voutput_max, vacc0); - vacc1 = _mm512_min_ps(voutput_max, vacc1); - _mm512_storeu_ps(output, vacc0); _mm512_storeu_ps(output + 16, vacc1); output += 32; @@ -61,9 +52,6 @@ void xnn_f32_vmul_minmax_ukernel__avx512f_u32( input_b += 16; - vacc = _mm512_max_ps(voutput_min, vacc); - vacc = _mm512_min_ps(voutput_max, vacc); - _mm512_storeu_ps(output, vacc); output += 16; } @@ -77,8 +65,6 @@ void xnn_f32_vmul_minmax_ukernel__avx512f_u32( const __m512 va = _mm512_maskz_loadu_ps(vmask, input_a); __m512 vacc = _mm512_maskz_mul_ps(vmask, va, _mm512_maskz_loadu_ps(vmask, input_b)); - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-hvx-u128.c b/src/f32-vbinary/gen/f32-vmul-hvx-u128.c similarity index 69% rename from src/f32-vbinary/gen/f32-vmul-minmax-hvx-u128.c rename to src/f32-vbinary/gen/f32-vmul-hvx-u128.c index 01b844b7295..16090d0588d 100644 --- 
a/src/f32-vbinary/gen/f32-vmul-minmax-hvx-u128.c +++ b/src/f32-vbinary/gen/f32-vmul-hvx-u128.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vmul_minmax_ukernel__hvx_u128( +void xnn_f32_vmul_ukernel__hvx_u128( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,9 +23,6 @@ void xnn_f32_vmul_minmax_ukernel__hvx_u128( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); - for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) { HVX_Vector va0 = xnn_loadu_f32(input_a); HVX_Vector va1 = xnn_loadu_f32(input_a + 32); @@ -44,16 +41,6 @@ void xnn_f32_vmul_minmax_ukernel__hvx_u128( HVX_Vector vacc3 = xnn_mul_f32(va3, vb3); - vacc0 = xnn_max_f32(vacc0, voutput_min); - vacc1 = xnn_max_f32(vacc1, voutput_min); - vacc2 = xnn_max_f32(vacc2, voutput_min); - vacc3 = xnn_max_f32(vacc3, voutput_min); - - vacc0 = xnn_min_f32(vacc0, voutput_max); - vacc1 = xnn_min_f32(vacc1, voutput_max); - vacc2 = xnn_min_f32(vacc2, voutput_max); - vacc3 = xnn_min_f32(vacc3, voutput_max); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); xnn_storeu_f32(output + 64, vacc2); @@ -67,8 +54,6 @@ void xnn_f32_vmul_minmax_ukernel__hvx_u128( input_b += 32; HVX_Vector vacc = xnn_mul_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output += 32; @@ -78,9 +63,7 @@ void xnn_f32_vmul_minmax_ukernel__hvx_u128( HVX_Vector vb = xnn_loadu_f32(input_b); HVX_Vector vacc = xnn_mul_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); - + 
Q6_V_vstu_variable(output, batch, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-hvx-u32.c b/src/f32-vbinary/gen/f32-vmul-hvx-u32.c similarity index 68% rename from src/f32-vbinary/gen/f32-vmul-minmax-hvx-u32.c rename to src/f32-vbinary/gen/f32-vmul-hvx-u32.c index 22fbac8f6f5..975ebfbe585 100644 --- a/src/f32-vbinary/gen/f32-vmul-minmax-hvx-u32.c +++ b/src/f32-vbinary/gen/f32-vmul-hvx-u32.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vmul_minmax_ukernel__hvx_u32( +void xnn_f32_vmul_ukernel__hvx_u32( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,9 +23,6 @@ void xnn_f32_vmul_minmax_ukernel__hvx_u32( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { HVX_Vector va = xnn_loadu_f32(input_a); HVX_Vector vb = xnn_loadu_f32(input_b); @@ -33,8 +30,6 @@ void xnn_f32_vmul_minmax_ukernel__hvx_u32( input_b += 32; HVX_Vector vacc = xnn_mul_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output += 32; @@ -44,9 +39,7 @@ void xnn_f32_vmul_minmax_ukernel__hvx_u32( HVX_Vector vb = xnn_loadu_f32(input_b); HVX_Vector vacc = xnn_mul_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); - + Q6_V_vstu_variable(output, batch, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-hvx-u64.c b/src/f32-vbinary/gen/f32-vmul-hvx-u64.c similarity index 69% rename from src/f32-vbinary/gen/f32-vmul-minmax-hvx-u64.c rename to 
src/f32-vbinary/gen/f32-vmul-hvx-u64.c index 0a2d0e26828..bb50fb65395 100644 --- a/src/f32-vbinary/gen/f32-vmul-minmax-hvx-u64.c +++ b/src/f32-vbinary/gen/f32-vmul-hvx-u64.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vmul_minmax_ukernel__hvx_u64( +void xnn_f32_vmul_ukernel__hvx_u64( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,9 +23,6 @@ void xnn_f32_vmul_minmax_ukernel__hvx_u64( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); - for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { HVX_Vector va0 = xnn_loadu_f32(input_a); HVX_Vector va1 = xnn_loadu_f32(input_a + 32); @@ -38,12 +35,6 @@ void xnn_f32_vmul_minmax_ukernel__hvx_u64( HVX_Vector vacc1 = xnn_mul_f32(va1, vb1); - vacc0 = xnn_max_f32(vacc0, voutput_min); - vacc1 = xnn_max_f32(vacc1, voutput_min); - - vacc0 = xnn_min_f32(vacc0, voutput_max); - vacc1 = xnn_min_f32(vacc1, voutput_max); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); output += 64; @@ -55,8 +46,6 @@ void xnn_f32_vmul_minmax_ukernel__hvx_u64( input_b += 32; HVX_Vector vacc = xnn_mul_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output += 32; @@ -66,9 +55,7 @@ void xnn_f32_vmul_minmax_ukernel__hvx_u64( HVX_Vector vb = xnn_loadu_f32(input_b); HVX_Vector vacc = xnn_mul_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); - + Q6_V_vstu_variable(output, batch, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-wasm-u1.c 
b/src/f32-vbinary/gen/f32-vmul-minmax-wasm-u1.c deleted file mode 100644 index 3f3ccd38684..00000000000 --- a/src/f32-vbinary/gen/f32-vmul-minmax-wasm-u1.c +++ /dev/null @@ -1,41 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmul_minmax_ukernel__wasm_u1( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - - for (; batch >= sizeof(float); batch -= sizeof(float)) { - const float va = *input_a++; - const float vb = *input_b++; - float vacc = va * vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - } -} diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-wasm-u2.c b/src/f32-vbinary/gen/f32-vmul-minmax-wasm-u2.c deleted file mode 100644 index ba18c91b100..00000000000 --- a/src/f32-vbinary/gen/f32-vmul-minmax-wasm-u2.c +++ /dev/null @@ -1,65 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmul_minmax_ukernel__wasm_u2( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - input_a += 2; - - const float vb0 = input_b[0]; - const float vb1 = input_b[1]; - input_b += 2; - - float vacc0 = va0 * vb0; - float vacc1 = va1 * vb1; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output += 2; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch == sizeof(float)); - const float va = *input_a; - const float vb = *input_b; - float vacc = va * vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output = vacc; - } -} diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-wasm-u4.c b/src/f32-vbinary/gen/f32-vmul-minmax-wasm-u4.c deleted file mode 100644 index 30a3a8786c3..00000000000 --- a/src/f32-vbinary/gen/f32-vmul-minmax-wasm-u4.c +++ /dev/null @@ -1,79 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmul_minmax_ukernel__wasm_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - const float va2 = input_a[2]; - const float va3 = input_a[3]; - input_a += 4; - - const float vb0 = input_b[0]; - const float vb1 = input_b[1]; - const float vb2 = input_b[2]; - const float vb3 = input_b[3]; - input_b += 4; - - float vacc0 = va0 * vb0; - float vacc1 = va1 * vb1; - float vacc2 = va2 * vb2; - float vacc3 = va3 * vb3; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - vacc2 = __builtin_wasm_max_f32(vacc2, voutput_min); - vacc3 = __builtin_wasm_max_f32(vacc3, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - vacc2 = __builtin_wasm_min_f32(vacc2, voutput_max); - vacc3 = __builtin_wasm_min_f32(vacc3, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - do { - const float va = *input_a++; - const float vb = *input_b++; - float vacc = va * vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - batch -= sizeof(float); - } while (batch != 0); - } -} diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-wasm-u8.c b/src/f32-vbinary/gen/f32-vmul-minmax-wasm-u8.c deleted file mode 100644 index 
5555e65d5d8..00000000000 --- a/src/f32-vbinary/gen/f32-vmul-minmax-wasm-u8.c +++ /dev/null @@ -1,103 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmul_minmax_ukernel__wasm_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - const float va2 = input_a[2]; - const float va3 = input_a[3]; - const float va4 = input_a[4]; - const float va5 = input_a[5]; - const float va6 = input_a[6]; - const float va7 = input_a[7]; - input_a += 8; - - const float vb0 = input_b[0]; - const float vb1 = input_b[1]; - const float vb2 = input_b[2]; - const float vb3 = input_b[3]; - const float vb4 = input_b[4]; - const float vb5 = input_b[5]; - const float vb6 = input_b[6]; - const float vb7 = input_b[7]; - input_b += 8; - - float vacc0 = va0 * vb0; - float vacc1 = va1 * vb1; - float vacc2 = va2 * vb2; - float vacc3 = va3 * vb3; - float vacc4 = va4 * vb4; - float vacc5 = va5 * vb5; - float vacc6 = va6 * vb6; - float vacc7 = va7 * vb7; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - vacc2 = __builtin_wasm_max_f32(vacc2, voutput_min); - vacc3 = __builtin_wasm_max_f32(vacc3, voutput_min); - vacc4 = 
__builtin_wasm_max_f32(vacc4, voutput_min); - vacc5 = __builtin_wasm_max_f32(vacc5, voutput_min); - vacc6 = __builtin_wasm_max_f32(vacc6, voutput_min); - vacc7 = __builtin_wasm_max_f32(vacc7, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - vacc2 = __builtin_wasm_min_f32(vacc2, voutput_max); - vacc3 = __builtin_wasm_min_f32(vacc3, voutput_max); - vacc4 = __builtin_wasm_min_f32(vacc4, voutput_max); - vacc5 = __builtin_wasm_min_f32(vacc5, voutput_max); - vacc6 = __builtin_wasm_min_f32(vacc6, voutput_max); - vacc7 = __builtin_wasm_min_f32(vacc7, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output[4] = vacc4; - output[5] = vacc5; - output[6] = vacc6; - output[7] = vacc7; - output += 8; - } - if XNN_UNLIKELY(batch != 0) { - do { - const float va = *input_a++; - const float vb = *input_b++; - float vacc = va * vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - batch -= sizeof(float); - } while (batch != 0); - } -} diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u16.c b/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u16.c deleted file mode 100644 index 084e5e07761..00000000000 --- a/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u16.c +++ /dev/null @@ -1,103 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - const v128_t va2 = wasm_v128_load(input_a + 8); - const v128_t va3 = wasm_v128_load(input_a + 12); - input_a += 16; - - const v128_t vb0 = wasm_v128_load(input_b); - const v128_t vb1 = wasm_v128_load(input_b + 4); - const v128_t vb2 = wasm_v128_load(input_b + 8); - const v128_t vb3 = wasm_v128_load(input_b + 12); - input_b += 16; - - v128_t vacc0 = wasm_f32x4_mul(va0, vb0); - v128_t vacc1 = wasm_f32x4_mul(va1, vb1); - v128_t vacc2 = wasm_f32x4_mul(va2, vb2); - v128_t vacc3 = wasm_f32x4_mul(va3, vb3); - - vacc0 = wasm_f32x4_max(vacc0, voutput_min); - vacc1 = wasm_f32x4_max(vacc1, voutput_min); - vacc2 = wasm_f32x4_max(vacc2, voutput_min); - vacc3 = wasm_f32x4_max(vacc3, voutput_min); - - vacc0 = wasm_f32x4_min(vacc0, voutput_max); - vacc1 = wasm_f32x4_min(vacc1, voutput_max); - vacc2 = wasm_f32x4_min(vacc2, voutput_max); - vacc3 = wasm_f32x4_min(vacc3, voutput_max); - - wasm_v128_store(output, vacc0); - wasm_v128_store(output + 4, vacc1); - wasm_v128_store(output + 8, vacc2); - wasm_v128_store(output + 12, vacc3); - output += 16; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = 
wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_mul(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_mul(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u4.c b/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u4.c deleted file mode 100644 index 80942917428..00000000000 --- a/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u4.c +++ /dev/null @@ -1,69 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_mul(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_mul(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u8.c b/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u8.c deleted file mode 100644 index 11ee2e76a78..00000000000 --- a/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-u8.c +++ /dev/null @@ -1,91 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - input_a += 8; - - const v128_t vb0 = wasm_v128_load(input_b); - const v128_t vb1 = wasm_v128_load(input_b + 4); - input_b += 8; - - v128_t vacc0 = wasm_f32x4_mul(va0, vb0); - v128_t vacc1 = wasm_f32x4_mul(va1, vb1); - - vacc0 = wasm_f32x4_max(vacc0, voutput_min); - vacc1 = wasm_f32x4_max(vacc1, voutput_min); - - vacc0 = wasm_f32x4_min(vacc0, voutput_max); - vacc1 = wasm_f32x4_min(vacc1, voutput_max); - - wasm_v128_store(output, vacc0); - wasm_v128_store(output + 4, vacc1); - output += 8; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_mul(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const 
v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_mul(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u16.c b/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u16.c deleted file mode 100644 index acb17cf0ef1..00000000000 --- a/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u16.c +++ /dev/null @@ -1,103 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - const v128_t va2 = wasm_v128_load(input_a + 8); - const v128_t va3 = wasm_v128_load(input_a + 12); - input_a += 16; - - const v128_t vb0 = wasm_v128_load(input_b); - const v128_t vb1 = wasm_v128_load(input_b + 4); - const v128_t vb2 = wasm_v128_load(input_b + 8); - const v128_t vb3 = wasm_v128_load(input_b + 12); - input_b += 16; - - v128_t vacc0 = wasm_f32x4_mul(va0, vb0); - v128_t vacc1 = wasm_f32x4_mul(va1, vb1); - v128_t vacc2 = wasm_f32x4_mul(va2, vb2); - v128_t vacc3 = wasm_f32x4_mul(va3, vb3); - - vacc0 = wasm_f32x4_pmax(voutput_min, vacc0); - vacc1 = wasm_f32x4_pmax(voutput_min, vacc1); - vacc2 = wasm_f32x4_pmax(voutput_min, vacc2); - vacc3 = wasm_f32x4_pmax(voutput_min, vacc3); - - vacc0 = wasm_f32x4_pmin(voutput_max, vacc0); - vacc1 = wasm_f32x4_pmin(voutput_max, vacc1); - vacc2 = wasm_f32x4_pmin(voutput_max, vacc2); - vacc3 = wasm_f32x4_pmin(voutput_max, vacc3); - - wasm_v128_store(output, vacc0); - wasm_v128_store(output + 4, vacc1); - wasm_v128_store(output + 8, vacc2); - wasm_v128_store(output + 12, vacc3); - output += 16; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = 
wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_mul(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_mul(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u4.c b/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u4.c deleted file mode 100644 index d8be8132d19..00000000000 --- a/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u4.c +++ /dev/null @@ -1,69 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_mul(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_mul(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u8.c b/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u8.c deleted file mode 100644 index c6b27986ea0..00000000000 --- a/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-u8.c +++ /dev/null @@ -1,91 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - input_a += 8; - - const v128_t vb0 = wasm_v128_load(input_b); - const v128_t vb1 = wasm_v128_load(input_b + 4); - input_b += 8; - - v128_t vacc0 = wasm_f32x4_mul(va0, vb0); - v128_t vacc1 = wasm_f32x4_mul(va1, vb1); - - vacc0 = wasm_f32x4_pmax(voutput_min, vacc0); - vacc1 = wasm_f32x4_pmax(voutput_min, vacc1); - - vacc0 = wasm_f32x4_pmin(voutput_max, vacc0); - vacc1 = wasm_f32x4_pmin(voutput_max, vacc1); - - wasm_v128_store(output, vacc0); - wasm_v128_store(output + 4, vacc1); - output += 8; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_mul(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - 
const v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_mul(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-neon-u4.c b/src/f32-vbinary/gen/f32-vmul-neon-u4.c similarity index 75% rename from src/f32-vbinary/gen/f32-vmul-minmax-neon-u4.c rename to src/f32-vbinary/gen/f32-vmul-neon-u4.c index dfb2674671e..a31c9b17969 100644 --- a/src/f32-vbinary/gen/f32-vmul-minmax-neon-u4.c +++ b/src/f32-vbinary/gen/f32-vmul-neon-u4.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmul_minmax_ukernel__neon_u4( +void xnn_f32_vmul_ukernel__neon_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,18 +28,12 @@ void xnn_f32_vmul_minmax_ukernel__neon_u4( assert(input_b != NULL); assert(output != NULL); - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; const float32x4_t vb = vld1q_f32(input_b); input_b += 4; float32x4_t vacc = vmulq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -48,9 +42,6 @@ void xnn_f32_vmul_minmax_ukernel__neon_u4( float32x4_t vacc = vmulq_f32(va, vb); - vacc = vmaxq_f32(vacc, 
voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-neon-u8.c b/src/f32-vbinary/gen/f32-vmul-neon-u8.c similarity index 74% rename from src/f32-vbinary/gen/f32-vmul-minmax-neon-u8.c rename to src/f32-vbinary/gen/f32-vmul-neon-u8.c index 77e79612ed9..5341c7a5db7 100644 --- a/src/f32-vbinary/gen/f32-vmul-minmax-neon-u8.c +++ b/src/f32-vbinary/gen/f32-vmul-neon-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmul_minmax_ukernel__neon_u8( +void xnn_f32_vmul_ukernel__neon_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,9 +28,6 @@ void xnn_f32_vmul_minmax_ukernel__neon_u8( assert(input_b != NULL); assert(output != NULL); - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb0 = vld1q_f32(input_b); input_b += 4; @@ -41,12 +38,6 @@ void xnn_f32_vmul_minmax_ukernel__neon_u8( float32x4_t vacc1 = vmulq_f32(va1, vb1); - vacc0 = vmaxq_f32(vacc0, voutput_min); - vacc1 = vmaxq_f32(vacc1, voutput_min); - - vacc0 = vminq_f32(vacc0, voutput_max); - vacc1 = vminq_f32(vacc1, voutput_max); - vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } @@ -56,9 +47,6 @@ void xnn_f32_vmul_minmax_ukernel__neon_u8( float32x4_t vacc = vmulq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { 
@@ -67,9 +55,6 @@ void xnn_f32_vmul_minmax_ukernel__neon_u8( float32x4_t vacc = vmulq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-rvv-u4v.c b/src/f32-vbinary/gen/f32-vmul-rvv-u4v.c similarity index 75% rename from src/f32-vbinary/gen/f32-vmul-minmax-rvv-u4v.c rename to src/f32-vbinary/gen/f32-vmul-rvv-u4v.c index d6f69da48a0..e61cb2b6a53 100644 --- a/src/f32-vbinary/gen/f32-vmul-minmax-rvv-u4v.c +++ b/src/f32-vbinary/gen/f32-vmul-rvv-u4v.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmul_minmax_ukernel__rvv_u4v( +void xnn_f32_vmul_ukernel__rvv_u4v( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vmul_minmax_ukernel__rvv_u4v( assert(input_b != NULL); assert(output != NULL); - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; size_t n = batch >> 2; do { @@ -40,8 +38,6 @@ void xnn_f32_vmul_minmax_ukernel__rvv_u4v( vfloat32m4_t vb = __riscv_vle32_v_f32m4(input_b, vl); input_b += vl; vfloat32m4_t vacc = __riscv_vfmul_vv_f32m4(va, vb, vl); - vacc = __riscv_vfmax_vf_f32m4(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m4(vacc, output_max, vl); __riscv_vse32_v_f32m4(output, vacc, vl); output += vl; } while (n > 0); diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-rvv-u8v.c b/src/f32-vbinary/gen/f32-vmul-rvv-u8v.c similarity index 75% rename from src/f32-vbinary/gen/f32-vmul-minmax-rvv-u8v.c rename to src/f32-vbinary/gen/f32-vmul-rvv-u8v.c index dc5da7b4d47..cdeb729e5f3 100644 --- a/src/f32-vbinary/gen/f32-vmul-minmax-rvv-u8v.c +++ 
b/src/f32-vbinary/gen/f32-vmul-rvv-u8v.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmul_minmax_ukernel__rvv_u8v( +void xnn_f32_vmul_ukernel__rvv_u8v( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vmul_minmax_ukernel__rvv_u8v( assert(input_b != NULL); assert(output != NULL); - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; size_t n = batch >> 2; do { @@ -40,8 +38,6 @@ void xnn_f32_vmul_minmax_ukernel__rvv_u8v( vfloat32m8_t vb = __riscv_vle32_v_f32m8(input_b, vl); input_b += vl; vfloat32m8_t vacc = __riscv_vfmul_vv_f32m8(va, vb, vl); - vacc = __riscv_vfmax_vf_f32m8(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m8(vacc, output_max, vl); __riscv_vse32_v_f32m8(output, vacc, vl); output += vl; } while (n > 0); diff --git a/src/f32-vbinary/gen/f32-vmul-scalar-u1.c b/src/f32-vbinary/gen/f32-vmul-scalar-u1.c index 302ff2c528e..813d9513d0a 100644 --- a/src/f32-vbinary/gen/f32-vmul-scalar-u1.c +++ b/src/f32-vbinary/gen/f32-vmul-scalar-u1.c @@ -27,7 +27,6 @@ void xnn_f32_vmul_ukernel__scalar_u1( assert(input_b != NULL); assert(output != NULL); - for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; const float vb = *input_b++; diff --git a/src/f32-vbinary/gen/f32-vmul-scalar-u2.c b/src/f32-vbinary/gen/f32-vmul-scalar-u2.c index 82bce9446f3..995a1ffc290 100644 --- a/src/f32-vbinary/gen/f32-vmul-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vmul-scalar-u2.c @@ -27,7 +27,6 @@ void xnn_f32_vmul_ukernel__scalar_u2( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -41,7 +40,6 @@ void 
xnn_f32_vmul_ukernel__scalar_u2( float vacc1 = va1 * vb1; - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vmul-scalar-u4.c b/src/f32-vbinary/gen/f32-vmul-scalar-u4.c index dbd544a49d3..c6c28eac7dd 100644 --- a/src/f32-vbinary/gen/f32-vmul-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vmul-scalar-u4.c @@ -27,7 +27,6 @@ void xnn_f32_vmul_ukernel__scalar_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -47,7 +46,6 @@ void xnn_f32_vmul_ukernel__scalar_u4( float vacc3 = va3 * vb3; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vmul-scalar-u8.c b/src/f32-vbinary/gen/f32-vmul-scalar-u8.c index d02697342aa..d5639dc8330 100644 --- a/src/f32-vbinary/gen/f32-vmul-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vmul-scalar-u8.c @@ -27,7 +27,6 @@ void xnn_f32_vmul_ukernel__scalar_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -59,7 +58,6 @@ void xnn_f32_vmul_ukernel__scalar_u8( float vacc7 = va7 * vb7; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-sse-u4.c b/src/f32-vbinary/gen/f32-vmul-sse-u4.c similarity index 72% rename from src/f32-vbinary/gen/f32-vmul-minmax-sse-u4.c rename to src/f32-vbinary/gen/f32-vmul-sse-u4.c index 31887945a3e..3369720276d 100644 --- a/src/f32-vbinary/gen/f32-vmul-minmax-sse-u4.c +++ b/src/f32-vbinary/gen/f32-vmul-sse-u4.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmul_minmax_ukernel__sse_u4( +void xnn_f32_vmul_ukernel__sse_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params 
params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,11 +29,6 @@ void xnn_f32_vmul_minmax_ukernel__sse_u4( assert(input_b != NULL); assert(output != NULL); - const __m128 voutput_min = _mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const __m128 va = _mm_loadu_ps(input_a); input_a += 4; @@ -42,8 +37,6 @@ void xnn_f32_vmul_minmax_ukernel__sse_u4( input_b += 4; __m128 vacc = _mm_mul_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -53,8 +46,6 @@ void xnn_f32_vmul_minmax_ukernel__sse_u4( const __m128 vb = _mm_loadu_ps(input_b); __m128 vacc = _mm_mul_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-sse-u8.c b/src/f32-vbinary/gen/f32-vmul-sse-u8.c similarity index 72% rename from src/f32-vbinary/gen/f32-vmul-minmax-sse-u8.c rename to src/f32-vbinary/gen/f32-vmul-sse-u8.c index cd115d0dd75..9120422ee7e 100644 --- a/src/f32-vbinary/gen/f32-vmul-minmax-sse-u8.c +++ b/src/f32-vbinary/gen/f32-vmul-sse-u8.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmul_minmax_ukernel__sse_u8( +void xnn_f32_vmul_ukernel__sse_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,11 +29,6 @@ void xnn_f32_vmul_minmax_ukernel__sse_u8( assert(input_b != NULL); assert(output != NULL); - const __m128 voutput_min = 
_mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m128 va0 = _mm_loadu_ps(input_a); const __m128 va1 = _mm_loadu_ps(input_a + 4); @@ -47,12 +42,6 @@ void xnn_f32_vmul_minmax_ukernel__sse_u8( __m128 vacc1 = _mm_mul_ps(va1, vb1); - vacc0 = _mm_max_ps(vacc0, voutput_min); - vacc1 = _mm_max_ps(vacc1, voutput_min); - - vacc0 = _mm_min_ps(vacc0, voutput_max); - vacc1 = _mm_min_ps(vacc1, voutput_max); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; @@ -65,8 +54,6 @@ void xnn_f32_vmul_minmax_ukernel__sse_u8( input_b += 4; __m128 vacc = _mm_mul_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -76,8 +63,6 @@ void xnn_f32_vmul_minmax_ukernel__sse_u8( const __m128 vb = _mm_loadu_ps(input_b); __m128 vacc = _mm_mul_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-scalar-u1.c b/src/f32-vbinary/gen/f32-vmul-wasm-u1.c similarity index 72% rename from src/f32-vbinary/gen/f32-vmul-minmax-scalar-u1.c rename to src/f32-vbinary/gen/f32-vmul-wasm-u1.c index 364b0a42a17..52431576d47 100644 --- a/src/f32-vbinary/gen/f32-vmul-minmax-scalar-u1.c +++ b/src/f32-vbinary/gen/f32-vmul-wasm-u1.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmul_minmax_ukernel__scalar_u1( +void xnn_f32_vmul_ukernel__wasm_u1( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,15 +27,10 @@ void 
xnn_f32_vmul_minmax_ukernel__scalar_u1( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; const float vb = *input_b++; float vacc = va * vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; } } diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-scalar-u2.c b/src/f32-vbinary/gen/f32-vmul-wasm-u2.c similarity index 70% rename from src/f32-vbinary/gen/f32-vmul-minmax-scalar-u2.c rename to src/f32-vbinary/gen/f32-vmul-wasm-u2.c index 770dbd52973..822d0844e7c 100644 --- a/src/f32-vbinary/gen/f32-vmul-minmax-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vmul-wasm-u2.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmul_minmax_ukernel__scalar_u2( +void xnn_f32_vmul_ukernel__wasm_u2( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,9 +27,6 @@ void xnn_f32_vmul_minmax_ukernel__scalar_u2( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -43,12 +40,6 @@ void xnn_f32_vmul_minmax_ukernel__scalar_u2( float vacc1 = va1 * vb1; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - output[0] = vacc0; output[1] = vacc1; output += 2; @@ -58,8 +49,6 @@ void xnn_f32_vmul_minmax_ukernel__scalar_u2( const float va = *input_a; const float vb = *input_b; float vacc = va 
* vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output = vacc; } } diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-scalar-u4.c b/src/f32-vbinary/gen/f32-vmul-wasm-u4.c similarity index 68% rename from src/f32-vbinary/gen/f32-vmul-minmax-scalar-u4.c rename to src/f32-vbinary/gen/f32-vmul-wasm-u4.c index eecdef4abc2..1ea3abda8ff 100644 --- a/src/f32-vbinary/gen/f32-vmul-minmax-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vmul-wasm-u4.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmul_minmax_ukernel__scalar_u4( +void xnn_f32_vmul_ukernel__wasm_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,9 +27,6 @@ void xnn_f32_vmul_minmax_ukernel__scalar_u4( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -49,16 +46,6 @@ void xnn_f32_vmul_minmax_ukernel__scalar_u4( float vacc3 = va3 * vb3; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - vacc2 = math_max_f32(vacc2, voutput_min); - vacc3 = math_max_f32(vacc3, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - vacc2 = math_min_f32(vacc2, voutput_max); - vacc3 = math_min_f32(vacc3, voutput_max); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; @@ -70,8 +57,6 @@ void xnn_f32_vmul_minmax_ukernel__scalar_u4( const float va = *input_a++; const float vb = *input_b++; float vacc = va * vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } 
while (batch != 0); diff --git a/src/f32-vbinary/gen/f32-vmul-minmax-scalar-u8.c b/src/f32-vbinary/gen/f32-vmul-wasm-u8.c similarity index 64% rename from src/f32-vbinary/gen/f32-vmul-minmax-scalar-u8.c rename to src/f32-vbinary/gen/f32-vmul-wasm-u8.c index 5b0b72c2f41..51827093ab3 100644 --- a/src/f32-vbinary/gen/f32-vmul-minmax-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vmul-wasm-u8.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmul_minmax_ukernel__scalar_u8( +void xnn_f32_vmul_ukernel__wasm_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,9 +27,6 @@ void xnn_f32_vmul_minmax_ukernel__scalar_u8( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -61,24 +58,6 @@ void xnn_f32_vmul_minmax_ukernel__scalar_u8( float vacc7 = va7 * vb7; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - vacc2 = math_max_f32(vacc2, voutput_min); - vacc3 = math_max_f32(vacc3, voutput_min); - vacc4 = math_max_f32(vacc4, voutput_min); - vacc5 = math_max_f32(vacc5, voutput_min); - vacc6 = math_max_f32(vacc6, voutput_min); - vacc7 = math_max_f32(vacc7, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - vacc2 = math_min_f32(vacc2, voutput_max); - vacc3 = math_min_f32(vacc3, voutput_max); - vacc4 = math_min_f32(vacc4, voutput_max); - vacc5 = math_min_f32(vacc5, voutput_max); - vacc6 = math_min_f32(vacc6, voutput_max); - vacc7 = math_min_f32(vacc7, voutput_max); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; @@ -94,8 
+73,6 @@ void xnn_f32_vmul_minmax_ukernel__scalar_u8( const float va = *input_a++; const float vb = *input_b++; float vacc = va * vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } while (batch != 0); diff --git a/src/f32-vbinary/gen/f32-vmul-wasmsimd-u16.c b/src/f32-vbinary/gen/f32-vmul-wasmsimd-u16.c index 89ee12e7c31..a0dbb116e14 100644 --- a/src/f32-vbinary/gen/f32-vmul-wasmsimd-u16.c +++ b/src/f32-vbinary/gen/f32-vmul-wasmsimd-u16.c @@ -28,7 +28,6 @@ void xnn_f32_vmul_ukernel__wasmsimd_u16( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); @@ -63,7 +62,6 @@ void xnn_f32_vmul_ukernel__wasmsimd_u16( v128_t vacc = wasm_f32x4_mul(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -73,7 +71,6 @@ void xnn_f32_vmul_ukernel__wasmsimd_u16( v128_t vacc = wasm_f32x4_mul(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmul-wasmsimd-u4.c b/src/f32-vbinary/gen/f32-vmul-wasmsimd-u4.c index 1971e3dc2ea..08064c05f8e 100644 --- a/src/f32-vbinary/gen/f32-vmul-wasmsimd-u4.c +++ b/src/f32-vbinary/gen/f32-vmul-wasmsimd-u4.c @@ -28,7 +28,6 @@ void xnn_f32_vmul_ukernel__wasmsimd_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; @@ -38,7 +37,6 @@ void xnn_f32_vmul_ukernel__wasmsimd_u4( v128_t vacc = wasm_f32x4_mul(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -48,7 +46,6 @@ void xnn_f32_vmul_ukernel__wasmsimd_u4( v128_t vacc = wasm_f32x4_mul(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git 
a/src/f32-vbinary/gen/f32-vmul-wasmsimd-u8.c b/src/f32-vbinary/gen/f32-vmul-wasmsimd-u8.c index 98d2c97f784..76a84065e11 100644 --- a/src/f32-vbinary/gen/f32-vmul-wasmsimd-u8.c +++ b/src/f32-vbinary/gen/f32-vmul-wasmsimd-u8.c @@ -28,7 +28,6 @@ void xnn_f32_vmul_ukernel__wasmsimd_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); @@ -55,7 +54,6 @@ void xnn_f32_vmul_ukernel__wasmsimd_u8( v128_t vacc = wasm_f32x4_mul(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -65,7 +63,6 @@ void xnn_f32_vmul_ukernel__wasmsimd_u8( v128_t vacc = wasm_f32x4_mul(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-avx-u8.c b/src/f32-vbinary/gen/f32-vmulc-avx-u16.c similarity index 77% rename from src/f32-vbinary/gen/f32-vmulc-minmax-avx-u8.c rename to src/f32-vbinary/gen/f32-vmulc-avx-u16.c index 925daf92e06..0ba59ca4aac 100644 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-avx-u8.c +++ b/src/f32-vbinary/gen/f32-vmulc-avx-u16.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmulc_minmax_ukernel__avx_u8( +void xnn_f32_vmulc_ukernel__avx_u16( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -30,19 +30,26 @@ void xnn_f32_vmulc_minmax_ukernel__avx_u8( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const 
__m256 vb = _mm256_broadcast_ss(input_b); + for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { + const __m256 va0 = _mm256_loadu_ps(input_a); + const __m256 va1 = _mm256_loadu_ps(input_a + 8); + input_a += 16; + + __m256 vacc0 = _mm256_mul_ps(va0, vb); + __m256 vacc1 = _mm256_mul_ps(va1, vb); + + + _mm256_storeu_ps(output, vacc0); + _mm256_storeu_ps(output + 8, vacc1); + output += 16; + } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m256 va = _mm256_loadu_ps(input_a); input_a += 8; __m256 vacc = _mm256_mul_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); _mm256_storeu_ps(output, vacc); output += 8; } @@ -54,8 +61,6 @@ void xnn_f32_vmulc_minmax_ukernel__avx_u8( __m256 va = _mm256_maskload_ps(input_a, vmask); __m256 vacc = _mm256_mul_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); __m128 vacc_lo = _mm256_castps256_ps128(vacc); if (batch & (4 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/f32-vmulc-avx-u8.c b/src/f32-vbinary/gen/f32-vmulc-avx-u8.c new file mode 100644 index 00000000000..aea9940f13d --- /dev/null +++ b/src/f32-vbinary/gen/f32-vmulc-avx-u8.c @@ -0,0 +1,67 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-vbinary/vopc-avx.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/vbinary.h" + + +void xnn_f32_vmulc_ukernel__avx_u8( + size_t batch, + const float* input_a, + const float* input_b, + float* output, + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + + const __m256 vb = _mm256_broadcast_ss(input_b); + + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + const __m256 va = _mm256_loadu_ps(input_a); + input_a += 8; + + __m256 vacc = _mm256_mul_ps(va, vb); + _mm256_storeu_ps(output, vacc); + output += 8; + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 7 * sizeof(float)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); + + __m256 va = _mm256_maskload_ps(input_a, vmask); + + __m256 vacc = _mm256_mul_ps(va, vb); + + __m128 vacc_lo = _mm256_castps256_ps128(vacc); + if (batch & (4 * sizeof(float))) { + _mm_storeu_ps(output, vacc_lo); + vacc_lo = _mm256_extractf128_ps(vacc, 1); + output += 4; + } + if (batch & (2 * sizeof(float))) { + _mm_storel_pi((__m64*) output, vacc_lo); + vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo); + output += 2; + } + if (batch & (1 * sizeof(float))) { + _mm_store_ss(output, vacc_lo); + } + } +} diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-avx512f-u16.c b/src/f32-vbinary/gen/f32-vmulc-avx512f-u16.c similarity index 75% rename from src/f32-vbinary/gen/f32-vmulc-minmax-avx512f-u16.c rename to src/f32-vbinary/gen/f32-vmulc-avx512f-u16.c index fefe09bb0df..82445b4bae4 100644 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-avx512f-u16.c +++ b/src/f32-vbinary/gen/f32-vmulc-avx512f-u16.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE 
file in the root directory of this source tree. + #include #include @@ -16,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmulc_minmax_ukernel__avx512f_u16( +void xnn_f32_vmulc_ukernel__avx512f_u16( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,8 +30,6 @@ void xnn_f32_vmulc_minmax_ukernel__avx512f_u16( assert(input_b != NULL); assert(output != NULL); - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); const __m512 vb = _mm512_set1_ps(*input_b); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { @@ -40,10 +39,6 @@ void xnn_f32_vmulc_minmax_ukernel__avx512f_u16( __m512 vacc0 = _mm512_mul_ps(va0, vb); - vacc0 = _mm512_max_ps(voutput_min, vacc0); - - vacc0 = _mm512_min_ps(voutput_max, vacc0); - _mm512_storeu_ps(output, vacc0); output += 16; } @@ -57,8 +52,6 @@ void xnn_f32_vmulc_minmax_ukernel__avx512f_u16( __m512 va = _mm512_maskz_loadu_ps(vmask, input_a); __m512 vacc = _mm512_mul_ps(va, vb); - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-avx512f-u32.c b/src/f32-vbinary/gen/f32-vmulc-avx512f-u32.c similarity index 73% rename from src/f32-vbinary/gen/f32-vmulc-minmax-avx512f-u32.c rename to src/f32-vbinary/gen/f32-vmulc-avx512f-u32.c index 0cd9baacea6..d3852a80dfc 100644 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-avx512f-u32.c +++ b/src/f32-vbinary/gen/f32-vmulc-avx512f-u32.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
+ #include #include @@ -16,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmulc_minmax_ukernel__avx512f_u32( +void xnn_f32_vmulc_ukernel__avx512f_u32( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,8 +30,6 @@ void xnn_f32_vmulc_minmax_ukernel__avx512f_u32( assert(input_b != NULL); assert(output != NULL); - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); const __m512 vb = _mm512_set1_ps(*input_b); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { @@ -42,12 +41,6 @@ void xnn_f32_vmulc_minmax_ukernel__avx512f_u32( __m512 vacc1 = _mm512_mul_ps(va1, vb); - vacc0 = _mm512_max_ps(voutput_min, vacc0); - vacc1 = _mm512_max_ps(voutput_min, vacc1); - - vacc0 = _mm512_min_ps(voutput_max, vacc0); - vacc1 = _mm512_min_ps(voutput_max, vacc1); - _mm512_storeu_ps(output, vacc0); _mm512_storeu_ps(output + 16, vacc1); output += 32; @@ -58,9 +51,6 @@ void xnn_f32_vmulc_minmax_ukernel__avx512f_u32( __m512 vacc = _mm512_mul_ps(va, vb); - vacc = _mm512_max_ps(voutput_min, vacc); - vacc = _mm512_min_ps(voutput_max, vacc); - _mm512_storeu_ps(output, vacc); output += 16; } @@ -74,8 +64,6 @@ void xnn_f32_vmulc_minmax_ukernel__avx512f_u32( __m512 va = _mm512_maskz_loadu_ps(vmask, input_a); __m512 vacc = _mm512_mul_ps(va, vb); - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-hvx-u128.c b/src/f32-vbinary/gen/f32-vmulc-hvx-u128.c similarity index 65% rename from src/f32-vbinary/gen/f32-vmulc-minmax-hvx-u128.c rename to src/f32-vbinary/gen/f32-vmulc-hvx-u128.c index 6aa897d26db..8603ffb2ecf 100644 
--- a/src/f32-vbinary/gen/f32-vmulc-minmax-hvx-u128.c +++ b/src/f32-vbinary/gen/f32-vmulc-hvx-u128.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vmulc_minmax_ukernel__hvx_u128( +void xnn_f32_vmulc_ukernel__hvx_u128( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,8 +23,6 @@ void xnn_f32_vmulc_minmax_ukernel__hvx_u128( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); HVX_Vector vb = xnn_set1_f32(*input_b); for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) { @@ -40,16 +38,6 @@ void xnn_f32_vmulc_minmax_ukernel__hvx_u128( HVX_Vector vacc3 = xnn_mul_f32(va3, vb); - vacc0 = xnn_max_f32(vacc0, voutput_min); - vacc1 = xnn_max_f32(vacc1, voutput_min); - vacc2 = xnn_max_f32(vacc2, voutput_min); - vacc3 = xnn_max_f32(vacc3, voutput_min); - - vacc0 = xnn_min_f32(vacc0, voutput_max); - vacc1 = xnn_min_f32(vacc1, voutput_max); - vacc2 = xnn_min_f32(vacc2, voutput_max); - vacc3 = xnn_min_f32(vacc3, voutput_max); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); xnn_storeu_f32(output + 64, vacc2); @@ -61,8 +49,6 @@ void xnn_f32_vmulc_minmax_ukernel__hvx_u128( input_a += 32; HVX_Vector vacc = xnn_mul_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output+= 32; @@ -71,8 +57,6 @@ void xnn_f32_vmulc_minmax_ukernel__hvx_u128( HVX_Vector va = xnn_loadu_f32(input_a); HVX_Vector vacc = xnn_mul_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); Q6_V_vstu_variable(output, batch, vacc); } diff 
--git a/src/f32-vbinary/gen/f32-vmulc-minmax-hvx-u32.c b/src/f32-vbinary/gen/f32-vmulc-hvx-u32.c similarity index 67% rename from src/f32-vbinary/gen/f32-vmulc-minmax-hvx-u32.c rename to src/f32-vbinary/gen/f32-vmulc-hvx-u32.c index 39fb5cca833..dbb1fa08291 100644 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-hvx-u32.c +++ b/src/f32-vbinary/gen/f32-vmulc-hvx-u32.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vmulc_minmax_ukernel__hvx_u32( +void xnn_f32_vmulc_ukernel__hvx_u32( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,8 +23,6 @@ void xnn_f32_vmulc_minmax_ukernel__hvx_u32( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); HVX_Vector vb = xnn_set1_f32(*input_b); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { @@ -32,8 +30,6 @@ void xnn_f32_vmulc_minmax_ukernel__hvx_u32( input_a += 32; HVX_Vector vacc = xnn_mul_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output+= 32; @@ -42,8 +38,6 @@ void xnn_f32_vmulc_minmax_ukernel__hvx_u32( HVX_Vector va = xnn_loadu_f32(input_a); HVX_Vector vacc = xnn_mul_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); Q6_V_vstu_variable(output, batch, vacc); } diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-hvx-u64.c b/src/f32-vbinary/gen/f32-vmulc-hvx-u64.c similarity index 67% rename from src/f32-vbinary/gen/f32-vmulc-minmax-hvx-u64.c rename to src/f32-vbinary/gen/f32-vmulc-hvx-u64.c index 49445ede436..8dd36b5ba0b 100644 --- 
a/src/f32-vbinary/gen/f32-vmulc-minmax-hvx-u64.c +++ b/src/f32-vbinary/gen/f32-vmulc-hvx-u64.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vmulc_minmax_ukernel__hvx_u64( +void xnn_f32_vmulc_ukernel__hvx_u64( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,8 +23,6 @@ void xnn_f32_vmulc_minmax_ukernel__hvx_u64( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); HVX_Vector vb = xnn_set1_f32(*input_b); for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { @@ -36,12 +34,6 @@ void xnn_f32_vmulc_minmax_ukernel__hvx_u64( HVX_Vector vacc1 = xnn_mul_f32(va1, vb); - vacc0 = xnn_max_f32(vacc0, voutput_min); - vacc1 = xnn_max_f32(vacc1, voutput_min); - - vacc0 = xnn_min_f32(vacc0, voutput_max); - vacc1 = xnn_min_f32(vacc1, voutput_max); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); output += 64; @@ -51,8 +43,6 @@ void xnn_f32_vmulc_minmax_ukernel__hvx_u64( input_a += 32; HVX_Vector vacc = xnn_mul_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output+= 32; @@ -61,8 +51,6 @@ void xnn_f32_vmulc_minmax_ukernel__hvx_u64( HVX_Vector va = xnn_loadu_f32(input_a); HVX_Vector vacc = xnn_mul_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); Q6_V_vstu_variable(output, batch, vacc); } diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-avx-u16.c b/src/f32-vbinary/gen/f32-vmulc-minmax-avx-u16.c deleted file mode 100644 index 5e800f29bc9..00000000000 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-avx-u16.c 
+++ /dev/null @@ -1,94 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-avx.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmulc_minmax_ukernel__avx_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const __m256 vb = _mm256_broadcast_ss(input_b); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const __m256 va0 = _mm256_loadu_ps(input_a); - const __m256 va1 = _mm256_loadu_ps(input_a + 8); - input_a += 16; - - __m256 vacc0 = _mm256_mul_ps(va0, vb); - __m256 vacc1 = _mm256_mul_ps(va1, vb); - - - vacc0 = _mm256_max_ps(voutput_min, vacc0); - vacc1 = _mm256_max_ps(voutput_min, vacc1); - - vacc0 = _mm256_min_ps(voutput_max, vacc0); - vacc1 = _mm256_min_ps(voutput_max, vacc1); - - _mm256_storeu_ps(output, vacc0); - _mm256_storeu_ps(output + 8, vacc1); - output += 16; - } - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const __m256 va = _mm256_loadu_ps(input_a); - input_a += 8; - - __m256 vacc = _mm256_mul_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); - _mm256_storeu_ps(output, vacc); - output += 8; - } - if XNN_UNLIKELY(batch != 0) { - 
assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); - - __m256 va = _mm256_maskload_ps(input_a, vmask); - - __m256 vacc = _mm256_mul_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); - - __m128 vacc_lo = _mm256_castps256_ps128(vacc); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vacc_lo); - vacc_lo = _mm256_extractf128_ps(vacc, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vacc_lo); - vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vacc_lo); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-rvv-u4v.c b/src/f32-vbinary/gen/f32-vmulc-minmax-rvv-u4v.c deleted file mode 100644 index ffe04c1a1d7..00000000000 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-rvv-u4v.c +++ /dev/null @@ -1,47 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-rvv.c.in -// Generator: tools/xngen -// -// Copyright 2023 SiFive, Inc. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmulc_minmax_ukernel__rvv_u4v( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; - const float b = *input_b; - size_t n = batch >> 2; - - do { - size_t vl = __riscv_vsetvl_e32m4(n); - n -= vl; - vfloat32m4_t va = __riscv_vle32_v_f32m4(input_a, vl); - input_a += vl; - vfloat32m4_t vacc = __riscv_vfmul_vf_f32m4(va, b, vl); - vacc = __riscv_vfmax_vf_f32m4(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m4(vacc, output_max, vl); - __riscv_vse32_v_f32m4(output, vacc, vl); - output += vl; - } while (n > 0); -} diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-rvv-u8v.c b/src/f32-vbinary/gen/f32-vmulc-minmax-rvv-u8v.c deleted file mode 100644 index 0d3eb5fb964..00000000000 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-rvv-u8v.c +++ /dev/null @@ -1,47 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-rvv.c.in -// Generator: tools/xngen -// -// Copyright 2023 SiFive, Inc. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmulc_minmax_ukernel__rvv_u8v( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; - const float b = *input_b; - size_t n = batch >> 2; - - do { - size_t vl = __riscv_vsetvl_e32m8(n); - n -= vl; - vfloat32m8_t va = __riscv_vle32_v_f32m8(input_a, vl); - input_a += vl; - vfloat32m8_t vacc = __riscv_vfmul_vf_f32m8(va, b, vl); - vacc = __riscv_vfmax_vf_f32m8(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m8(vacc, output_max, vl); - __riscv_vse32_v_f32m8(output, vacc, vl); - output += vl; - } while (n > 0); -} diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u1.c b/src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u1.c deleted file mode 100644 index 64434851698..00000000000 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u1.c +++ /dev/null @@ -1,41 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmulc_minmax_ukernel__wasm_u1( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= sizeof(float); batch -= sizeof(float)) { - const float va = *input_a++; - float vacc = va * vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - } -} diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u2.c b/src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u2.c deleted file mode 100644 index 19480124cee..00000000000 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u2.c +++ /dev/null @@ -1,61 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmulc_minmax_ukernel__wasm_u2( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - input_a += 2; - - float vacc0 = va0 * vb; - float vacc1 = va1 * vb; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output += 2; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch == sizeof(float)); - const float va = *input_a; - float vacc = va * vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output = vacc; - } -} diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u4.c b/src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u4.c deleted file mode 100644 index 73c4b4e431b..00000000000 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u4.c +++ /dev/null @@ -1,73 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmulc_minmax_ukernel__wasm_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - const float va2 = input_a[2]; - const float va3 = input_a[3]; - input_a += 4; - - float vacc0 = va0 * vb; - float vacc1 = va1 * vb; - float vacc2 = va2 * vb; - float vacc3 = va3 * vb; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - vacc2 = __builtin_wasm_max_f32(vacc2, voutput_min); - vacc3 = __builtin_wasm_max_f32(vacc3, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - vacc2 = __builtin_wasm_min_f32(vacc2, voutput_max); - vacc3 = __builtin_wasm_min_f32(vacc3, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - do { - const float va = *input_a++; - float vacc = va * vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - batch -= sizeof(float); - } while (batch != 0); - } -} diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u8.c b/src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u8.c deleted file mode 100644 index c0bb6ba99e9..00000000000 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-wasm-u8.c +++ /dev/null @@ -1,93 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmulc_minmax_ukernel__wasm_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - const float va2 = input_a[2]; - const float va3 = input_a[3]; - const float va4 = input_a[4]; - const float va5 = input_a[5]; - const float va6 = input_a[6]; - const float va7 = input_a[7]; - input_a += 8; - - float vacc0 = va0 * vb; - float vacc1 = va1 * vb; - float vacc2 = va2 * vb; - float vacc3 = va3 * vb; - float vacc4 = va4 * vb; - float vacc5 = va5 * vb; - float vacc6 = va6 * vb; - float vacc7 = va7 * vb; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - vacc2 = __builtin_wasm_max_f32(vacc2, voutput_min); - vacc3 = __builtin_wasm_max_f32(vacc3, voutput_min); - vacc4 = __builtin_wasm_max_f32(vacc4, voutput_min); - vacc5 = __builtin_wasm_max_f32(vacc5, voutput_min); - vacc6 = __builtin_wasm_max_f32(vacc6, voutput_min); - vacc7 = __builtin_wasm_max_f32(vacc7, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - vacc2 = __builtin_wasm_min_f32(vacc2, voutput_max); - vacc3 = 
__builtin_wasm_min_f32(vacc3, voutput_max); - vacc4 = __builtin_wasm_min_f32(vacc4, voutput_max); - vacc5 = __builtin_wasm_min_f32(vacc5, voutput_max); - vacc6 = __builtin_wasm_min_f32(vacc6, voutput_max); - vacc7 = __builtin_wasm_min_f32(vacc7, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output[4] = vacc4; - output[5] = vacc5; - output[6] = vacc6; - output[7] = vacc7; - output += 8; - } - if XNN_UNLIKELY(batch != 0) { - do { - const float va = *input_a++; - float vacc = va * vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - batch -= sizeof(float); - } while (batch != 0); - } -} diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u16.c b/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u16.c deleted file mode 100644 index 63937977a3d..00000000000 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u16.c +++ /dev/null @@ -1,95 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - const v128_t va2 = wasm_v128_load(input_a + 8); - const v128_t va3 = wasm_v128_load(input_a + 12); - input_a += 16; - - v128_t vy0 = wasm_f32x4_mul(va0, vb); - v128_t vy1 = wasm_f32x4_mul(va1, vb); - v128_t vy2 = wasm_f32x4_mul(va2, vb); - v128_t vy3 = wasm_f32x4_mul(va3, vb); - - - vy0 = wasm_f32x4_max(vy0, voutput_min); - vy1 = wasm_f32x4_max(vy1, voutput_min); - vy2 = wasm_f32x4_max(vy2, voutput_min); - vy3 = wasm_f32x4_max(vy3, voutput_min); - - vy0 = wasm_f32x4_min(vy0, voutput_max); - vy1 = wasm_f32x4_min(vy1, voutput_max); - vy2 = wasm_f32x4_min(vy2, voutput_max); - vy3 = wasm_f32x4_min(vy3, voutput_max); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - wasm_v128_store(output + 8, vy2); - wasm_v128_store(output + 12, vy3); - output += 16; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_mul(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - wasm_v128_store(output, vy); - output += 4; - } - if 
XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_mul(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u4.c b/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u4.c deleted file mode 100644 index d388e8711f2..00000000000 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u4.c +++ /dev/null @@ -1,66 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_mul(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - wasm_v128_store(output, vy); - 
output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_mul(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u8.c b/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u8.c deleted file mode 100644 index 64f7c9ab65e..00000000000 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-u8.c +++ /dev/null @@ -1,85 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - input_a += 8; - - v128_t vy0 = wasm_f32x4_mul(va0, vb); - v128_t vy1 = wasm_f32x4_mul(va1, vb); - - - vy0 = wasm_f32x4_max(vy0, voutput_min); - vy1 = wasm_f32x4_max(vy1, voutput_min); - - vy0 = wasm_f32x4_min(vy0, voutput_max); - vy1 = wasm_f32x4_min(vy1, voutput_max); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - output += 8; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_mul(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_mul(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git 
a/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u16.c b/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u16.c deleted file mode 100644 index d7e18ef2fd7..00000000000 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u16.c +++ /dev/null @@ -1,95 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - const v128_t va2 = wasm_v128_load(input_a + 8); - const v128_t va3 = wasm_v128_load(input_a + 12); - input_a += 16; - - v128_t vy0 = wasm_f32x4_mul(va0, vb); - v128_t vy1 = wasm_f32x4_mul(va1, vb); - v128_t vy2 = wasm_f32x4_mul(va2, vb); - v128_t vy3 = wasm_f32x4_mul(va3, vb); - - - vy0 = wasm_f32x4_pmax(voutput_min, vy0); - vy1 = wasm_f32x4_pmax(voutput_min, vy1); - vy2 = wasm_f32x4_pmax(voutput_min, vy2); - vy3 = wasm_f32x4_pmax(voutput_min, vy3); - - vy0 = wasm_f32x4_pmin(voutput_max, vy0); - vy1 = wasm_f32x4_pmin(voutput_max, vy1); - vy2 = 
wasm_f32x4_pmin(voutput_max, vy2); - vy3 = wasm_f32x4_pmin(voutput_max, vy3); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - wasm_v128_store(output + 8, vy2); - wasm_v128_store(output + 12, vy3); - output += 16; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_mul(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_mul(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u4.c b/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u4.c deleted file mode 100644 index ac361307c6a..00000000000 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u4.c +++ /dev/null @@ -1,66 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_mul(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_mul(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u8.c b/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u8.c deleted file mode 100644 index b71c40e77d0..00000000000 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-u8.c +++ /dev/null @@ -1,85 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - input_a += 8; - - v128_t vy0 = wasm_f32x4_mul(va0, vb); - v128_t vy1 = wasm_f32x4_mul(va1, vb); - - - vy0 = wasm_f32x4_pmax(voutput_min, vy0); - vy1 = wasm_f32x4_pmax(voutput_min, vy1); - - vy0 = wasm_f32x4_pmin(voutput_max, vy0); - vy1 = wasm_f32x4_pmin(voutput_max, vy1); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - output += 8; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_mul(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_mul(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = 
wasm_f32x4_pmin(voutput_max, vy); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-neon-u4.c b/src/f32-vbinary/gen/f32-vmulc-neon-u4.c similarity index 74% rename from src/f32-vbinary/gen/f32-vmulc-minmax-neon-u4.c rename to src/f32-vbinary/gen/f32-vmulc-neon-u4.c index 92be7fb75d1..c9f07528bd5 100644 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-neon-u4.c +++ b/src/f32-vbinary/gen/f32-vmulc-neon-u4.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmulc_minmax_ukernel__neon_u4( +void xnn_f32_vmulc_ukernel__neon_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vmulc_minmax_ukernel__neon_u4( assert(input_b != NULL); assert(output != NULL); - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { @@ -37,9 +35,6 @@ void xnn_f32_vmulc_minmax_ukernel__neon_u4( float32x4_t vacc = vmulq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -47,9 +42,6 @@ void xnn_f32_vmulc_minmax_ukernel__neon_u4( float32x4_t vacc = vmulq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git 
a/src/f32-vbinary/gen/f32-vmulc-minmax-neon-u8.c b/src/f32-vbinary/gen/f32-vmulc-neon-u8.c similarity index 72% rename from src/f32-vbinary/gen/f32-vmulc-minmax-neon-u8.c rename to src/f32-vbinary/gen/f32-vmulc-neon-u8.c index c8f93319a29..59ea18e5e17 100644 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-neon-u8.c +++ b/src/f32-vbinary/gen/f32-vmulc-neon-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmulc_minmax_ukernel__neon_u8( +void xnn_f32_vmulc_ukernel__neon_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vmulc_minmax_ukernel__neon_u8( assert(input_b != NULL); assert(output != NULL); - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -40,12 +38,6 @@ void xnn_f32_vmulc_minmax_ukernel__neon_u8( float32x4_t vacc1 = vmulq_f32(va1, vb); - vacc0 = vmaxq_f32(vacc0, voutput_min); - vacc1 = vmaxq_f32(vacc1, voutput_min); - - vacc0 = vminq_f32(vacc0, voutput_max); - vacc1 = vminq_f32(vacc1, voutput_max); - vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } @@ -54,9 +46,6 @@ void xnn_f32_vmulc_minmax_ukernel__neon_u8( float32x4_t vacc = vmulq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -64,9 +53,6 @@ void xnn_f32_vmulc_minmax_ukernel__neon_u8( float32x4_t vacc = vmulq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * 
sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vmulc-scalar-u2.c b/src/f32-vbinary/gen/f32-vmulc-scalar-u2.c index 40af23a2a5f..220317822ca 100644 --- a/src/f32-vbinary/gen/f32-vmulc-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vmulc-scalar-u2.c @@ -38,7 +38,6 @@ void xnn_f32_vmulc_ukernel__scalar_u2( float vacc1 = va1 * vb; - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vmulc-scalar-u4.c b/src/f32-vbinary/gen/f32-vmulc-scalar-u4.c index 6b770bf50b8..9c3eefd4aee 100644 --- a/src/f32-vbinary/gen/f32-vmulc-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vmulc-scalar-u4.c @@ -42,7 +42,6 @@ void xnn_f32_vmulc_ukernel__scalar_u4( float vacc3 = va3 * vb; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vmulc-scalar-u8.c b/src/f32-vbinary/gen/f32-vmulc-scalar-u8.c index e48aaf47616..1bc89cc410d 100644 --- a/src/f32-vbinary/gen/f32-vmulc-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vmulc-scalar-u8.c @@ -50,7 +50,6 @@ void xnn_f32_vmulc_ukernel__scalar_u8( float vacc7 = va7 * vb; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-sse-u4.c b/src/f32-vbinary/gen/f32-vmulc-sse-u4.c similarity index 71% rename from src/f32-vbinary/gen/f32-vmulc-minmax-sse-u4.c rename to src/f32-vbinary/gen/f32-vmulc-sse-u4.c index c5ef9399653..1e6a50a3dda 100644 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-sse-u4.c +++ b/src/f32-vbinary/gen/f32-vmulc-sse-u4.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmulc_minmax_ukernel__sse_u4( +void xnn_f32_vmulc_ukernel__sse_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,10 +29,6 @@ void 
xnn_f32_vmulc_minmax_ukernel__sse_u4( assert(input_b != NULL); assert(output != NULL); - const __m128 voutput_min = _mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const __m128 vb = _mm_load1_ps(input_b); for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { @@ -40,8 +36,6 @@ void xnn_f32_vmulc_minmax_ukernel__sse_u4( input_a += 4; __m128 vacc = _mm_mul_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -50,8 +44,6 @@ void xnn_f32_vmulc_minmax_ukernel__sse_u4( const __m128 va = _mm_loadu_ps(input_a); __m128 vacc = _mm_mul_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); vacc = _mm_movehl_ps(vacc, vacc); diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-sse-u8.c b/src/f32-vbinary/gen/f32-vmulc-sse-u8.c similarity index 70% rename from src/f32-vbinary/gen/f32-vmulc-minmax-sse-u8.c rename to src/f32-vbinary/gen/f32-vmulc-sse-u8.c index 1a362f6b1c7..f532d6540bb 100644 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-sse-u8.c +++ b/src/f32-vbinary/gen/f32-vmulc-sse-u8.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmulc_minmax_ukernel__sse_u8( +void xnn_f32_vmulc_ukernel__sse_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,10 +29,6 @@ void xnn_f32_vmulc_minmax_ukernel__sse_u8( assert(input_b != NULL); assert(output != NULL); - const __m128 voutput_min = _mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - 
XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const __m128 vb = _mm_load1_ps(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -44,12 +40,6 @@ void xnn_f32_vmulc_minmax_ukernel__sse_u8( __m128 vacc1 = _mm_mul_ps(va1, vb); - vacc0 = _mm_max_ps(vacc0, voutput_min); - vacc1 = _mm_max_ps(vacc1, voutput_min); - - vacc0 = _mm_min_ps(vacc0, voutput_max); - vacc1 = _mm_min_ps(vacc1, voutput_max); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; @@ -59,8 +49,6 @@ void xnn_f32_vmulc_minmax_ukernel__sse_u8( input_a += 4; __m128 vacc = _mm_mul_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -69,8 +57,6 @@ void xnn_f32_vmulc_minmax_ukernel__sse_u8( const __m128 va = _mm_loadu_ps(input_a); __m128 vacc = _mm_mul_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); vacc = _mm_movehl_ps(vacc, vacc); diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u1.c b/src/f32-vbinary/gen/f32-vmulc-wasm-u1.c similarity index 72% rename from src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u1.c rename to src/f32-vbinary/gen/f32-vmulc-wasm-u1.c index 7f56d1e078a..96521a2b6d2 100644 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u1.c +++ b/src/f32-vbinary/gen/f32-vmulc-wasm-u1.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmulc_minmax_ukernel__scalar_u1( +void xnn_f32_vmulc_ukernel__wasm_u1( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,15 +27,11 @@ void xnn_f32_vmulc_minmax_ukernel__scalar_u1( assert(input_b != NULL); assert(output != NULL); - const float 
voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; float vacc = va * vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; } } diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u2.c b/src/f32-vbinary/gen/f32-vmulc-wasm-u2.c similarity index 68% rename from src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u2.c rename to src/f32-vbinary/gen/f32-vmulc-wasm-u2.c index c083dd03207..7132611db97 100644 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vmulc-wasm-u2.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmulc_minmax_ukernel__scalar_u2( +void xnn_f32_vmulc_ukernel__wasm_u2( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,8 +27,6 @@ void xnn_f32_vmulc_minmax_ukernel__scalar_u2( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { @@ -40,12 +38,6 @@ void xnn_f32_vmulc_minmax_ukernel__scalar_u2( float vacc1 = va1 * vb; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - output[0] = vacc0; output[1] = vacc1; output += 2; @@ -54,8 +46,6 @@ void xnn_f32_vmulc_minmax_ukernel__scalar_u2( assert(batch == sizeof(float)); const float va = *input_a; float vacc = va * vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output = vacc; } } diff --git 
a/src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u4.c b/src/f32-vbinary/gen/f32-vmulc-wasm-u4.c similarity index 65% rename from src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u4.c rename to src/f32-vbinary/gen/f32-vmulc-wasm-u4.c index 8e6eeddf187..291a2218945 100644 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vmulc-wasm-u4.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmulc_minmax_ukernel__scalar_u4( +void xnn_f32_vmulc_ukernel__wasm_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,8 +27,6 @@ void xnn_f32_vmulc_minmax_ukernel__scalar_u4( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { @@ -44,16 +42,6 @@ void xnn_f32_vmulc_minmax_ukernel__scalar_u4( float vacc3 = va3 * vb; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - vacc2 = math_max_f32(vacc2, voutput_min); - vacc3 = math_max_f32(vacc3, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - vacc2 = math_min_f32(vacc2, voutput_max); - vacc3 = math_min_f32(vacc3, voutput_max); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; @@ -64,8 +52,6 @@ void xnn_f32_vmulc_minmax_ukernel__scalar_u4( do { const float va = *input_a++; float vacc = va * vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } while (batch != 0); diff --git a/src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u8.c b/src/f32-vbinary/gen/f32-vmulc-wasm-u8.c similarity index 60% rename from 
src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u8.c rename to src/f32-vbinary/gen/f32-vmulc-wasm-u8.c index 3f0d7903aef..e0f48943478 100644 --- a/src/f32-vbinary/gen/f32-vmulc-minmax-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vmulc-wasm-u8.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vmulc_minmax_ukernel__scalar_u8( +void xnn_f32_vmulc_ukernel__wasm_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,8 +27,6 @@ void xnn_f32_vmulc_minmax_ukernel__scalar_u8( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -52,24 +50,6 @@ void xnn_f32_vmulc_minmax_ukernel__scalar_u8( float vacc7 = va7 * vb; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - vacc2 = math_max_f32(vacc2, voutput_min); - vacc3 = math_max_f32(vacc3, voutput_min); - vacc4 = math_max_f32(vacc4, voutput_min); - vacc5 = math_max_f32(vacc5, voutput_min); - vacc6 = math_max_f32(vacc6, voutput_min); - vacc7 = math_max_f32(vacc7, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - vacc2 = math_min_f32(vacc2, voutput_max); - vacc3 = math_min_f32(vacc3, voutput_max); - vacc4 = math_min_f32(vacc4, voutput_max); - vacc5 = math_min_f32(vacc5, voutput_max); - vacc6 = math_min_f32(vacc6, voutput_max); - vacc7 = math_min_f32(vacc7, voutput_max); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; @@ -84,8 +64,6 @@ void xnn_f32_vmulc_minmax_ukernel__scalar_u8( do { const float va = *input_a++; float vacc = va * vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, 
voutput_max); *output++ = vacc; batch -= sizeof(float); } while (batch != 0); diff --git a/src/f32-vbinary/gen/f32-vmulc-wasmsimd-u16.c b/src/f32-vbinary/gen/f32-vmulc-wasmsimd-u16.c index 20bb3789790..8c553721882 100644 --- a/src/f32-vbinary/gen/f32-vmulc-wasmsimd-u16.c +++ b/src/f32-vbinary/gen/f32-vmulc-wasmsimd-u16.c @@ -43,7 +43,6 @@ void xnn_f32_vmulc_ukernel__wasmsimd_u16( v128_t vy3 = wasm_f32x4_mul(va3, vb); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); @@ -56,7 +55,6 @@ void xnn_f32_vmulc_ukernel__wasmsimd_u16( v128_t vy = wasm_f32x4_mul(va, vb); - wasm_v128_store(output, vy); output += 4; } @@ -65,7 +63,6 @@ void xnn_f32_vmulc_ukernel__wasmsimd_u16( v128_t vy = wasm_f32x4_mul(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmulc-wasmsimd-u4.c b/src/f32-vbinary/gen/f32-vmulc-wasmsimd-u4.c index 5164e1a99b8..7c7ccc630e8 100644 --- a/src/f32-vbinary/gen/f32-vmulc-wasmsimd-u4.c +++ b/src/f32-vbinary/gen/f32-vmulc-wasmsimd-u4.c @@ -36,7 +36,6 @@ void xnn_f32_vmulc_ukernel__wasmsimd_u4( v128_t vy = wasm_f32x4_mul(va, vb); - wasm_v128_store(output, vy); output += 4; } @@ -45,7 +44,6 @@ void xnn_f32_vmulc_ukernel__wasmsimd_u4( v128_t vy = wasm_f32x4_mul(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vmulc-wasmsimd-u8.c b/src/f32-vbinary/gen/f32-vmulc-wasmsimd-u8.c index c31d87509f9..33d1b76a502 100644 --- a/src/f32-vbinary/gen/f32-vmulc-wasmsimd-u8.c +++ b/src/f32-vbinary/gen/f32-vmulc-wasmsimd-u8.c @@ -39,7 +39,6 @@ void xnn_f32_vmulc_ukernel__wasmsimd_u8( v128_t vy1 = wasm_f32x4_mul(va1, vb); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); output += 8; @@ -50,7 +49,6 @@ void xnn_f32_vmulc_ukernel__wasmsimd_u8( v128_t vy = wasm_f32x4_mul(va, vb); - 
wasm_v128_store(output, vy); output += 4; } @@ -59,7 +57,6 @@ void xnn_f32_vmulc_ukernel__wasmsimd_u8( v128_t vy = wasm_f32x4_mul(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vprelu-avx-u16.c b/src/f32-vbinary/gen/f32-vprelu-avx-u16.c index 606b870b80a..970f2656ddb 100644 --- a/src/f32-vbinary/gen/f32-vprelu-avx-u16.c +++ b/src/f32-vbinary/gen/f32-vprelu-avx-u16.c @@ -30,7 +30,6 @@ void xnn_f32_vprelu_ukernel__avx_u16( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m256 va0 = _mm256_loadu_ps(input_a); const __m256 va1 = _mm256_loadu_ps(input_a + 8); @@ -43,7 +42,6 @@ void xnn_f32_vprelu_ukernel__avx_u16( vacc0 = _mm256_blendv_ps(va0, vacc0, va0); vacc1 = _mm256_blendv_ps(va1, vacc1, va1); - _mm256_storeu_ps(output, vacc0); _mm256_storeu_ps(output + 8, vacc1); output += 16; diff --git a/src/f32-vbinary/gen/f32-vprelu-avx-u8.c b/src/f32-vbinary/gen/f32-vprelu-avx-u8.c index 578703788eb..6339a9e69a9 100644 --- a/src/f32-vbinary/gen/f32-vprelu-avx-u8.c +++ b/src/f32-vbinary/gen/f32-vprelu-avx-u8.c @@ -30,7 +30,6 @@ void xnn_f32_vprelu_ukernel__avx_u8( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m256 va = _mm256_loadu_ps(input_a); input_a += 8; diff --git a/src/f32-vbinary/gen/f32-vprelu-avx512f-u16.c b/src/f32-vbinary/gen/f32-vprelu-avx512f-u16.c index 02afe2e6f8e..2a464e9b780 100644 --- a/src/f32-vbinary/gen/f32-vprelu-avx512f-u16.c +++ b/src/f32-vbinary/gen/f32-vprelu-avx512f-u16.c @@ -29,7 +29,6 @@ void xnn_f32_vprelu_ukernel__avx512f_u16( assert(input_b != NULL); assert(output != NULL); - const __m512 vzero = _mm512_setzero_ps(); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { @@ -40,7 +39,6 @@ 
void xnn_f32_vprelu_ukernel__avx512f_u16( __m512 vacc = _mm512_mask_mul_ps(va, vsign, va, _mm512_loadu_ps(input_b)); input_b += 16; - _mm512_storeu_ps(output, vacc); output += 16; } diff --git a/src/f32-vbinary/gen/f32-vprelu-avx512f-u32.c b/src/f32-vbinary/gen/f32-vprelu-avx512f-u32.c index 10c05a3ac8f..fc3f70a5a7f 100644 --- a/src/f32-vbinary/gen/f32-vprelu-avx512f-u32.c +++ b/src/f32-vbinary/gen/f32-vprelu-avx512f-u32.c @@ -29,7 +29,6 @@ void xnn_f32_vprelu_ukernel__avx512f_u32( assert(input_b != NULL); assert(output != NULL); - const __m512 vzero = _mm512_setzero_ps(); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { @@ -43,7 +42,6 @@ void xnn_f32_vprelu_ukernel__avx512f_u32( __m512 vacc1 = _mm512_mask_mul_ps(va1, vsign1, va1, _mm512_loadu_ps(input_b + 16)); input_b += 32; - _mm512_storeu_ps(output, vacc0); _mm512_storeu_ps(output + 16, vacc1); output += 32; @@ -56,7 +54,6 @@ void xnn_f32_vprelu_ukernel__avx512f_u32( __m512 vacc = _mm512_mask_mul_ps(va, vsign, va, _mm512_loadu_ps(input_b)); input_b += 16; - _mm512_storeu_ps(output, vacc); output += 16; } diff --git a/src/f32-vbinary/gen/f32-vprelu-neon-u4.c b/src/f32-vbinary/gen/f32-vprelu-neon-u4.c index 99e8accfc24..3ab0cd0ef35 100644 --- a/src/f32-vbinary/gen/f32-vprelu-neon-u4.c +++ b/src/f32-vbinary/gen/f32-vprelu-neon-u4.c @@ -28,7 +28,6 @@ void xnn_f32_vprelu_ukernel__neon_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; const float32x4_t vb = vld1q_f32(input_b); input_b += 4; @@ -37,7 +36,6 @@ void xnn_f32_vprelu_ukernel__neon_u4( const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(va), vmovq_n_s32(0)); vacc = vbslq_f32(vm, vacc, va); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -48,7 +46,6 @@ void xnn_f32_vprelu_ukernel__neon_u4( const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(va), vmovq_n_s32(0)); vacc = vbslq_f32(vm, 
vacc, va); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vprelu-neon-u8.c b/src/f32-vbinary/gen/f32-vprelu-neon-u8.c index ec9d6b965cb..b90ed18916c 100644 --- a/src/f32-vbinary/gen/f32-vprelu-neon-u8.c +++ b/src/f32-vbinary/gen/f32-vprelu-neon-u8.c @@ -28,7 +28,6 @@ void xnn_f32_vprelu_ukernel__neon_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb0 = vld1q_f32(input_b); input_b += 4; @@ -44,7 +43,6 @@ void xnn_f32_vprelu_ukernel__neon_u8( vacc0 = vbslq_f32(vm0, vacc0, va0); vacc1 = vbslq_f32(vm1, vacc1, va1); - vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } @@ -56,7 +54,6 @@ void xnn_f32_vprelu_ukernel__neon_u8( const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(va), vmovq_n_s32(0)); vacc = vbslq_f32(vm, vacc, va); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -67,7 +64,6 @@ void xnn_f32_vprelu_ukernel__neon_u8( const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(va), vmovq_n_s32(0)); vacc = vbslq_f32(vm, vacc, va); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vprelu-scalar-u1.c b/src/f32-vbinary/gen/f32-vprelu-scalar-u1.c index fee80d089a7..3500dff2d46 100644 --- a/src/f32-vbinary/gen/f32-vprelu-scalar-u1.c +++ b/src/f32-vbinary/gen/f32-vprelu-scalar-u1.c @@ -27,7 +27,6 @@ void xnn_f32_vprelu_ukernel__scalar_u1( assert(input_b != NULL); assert(output != NULL); - for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; const float vb = *input_b++; diff --git a/src/f32-vbinary/gen/f32-vprelu-scalar-u2.c b/src/f32-vbinary/gen/f32-vprelu-scalar-u2.c index 98b4e8879ba..9449561eff6 100644 --- 
a/src/f32-vbinary/gen/f32-vprelu-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vprelu-scalar-u2.c @@ -27,7 +27,6 @@ void xnn_f32_vprelu_ukernel__scalar_u2( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -43,7 +42,6 @@ void xnn_f32_vprelu_ukernel__scalar_u2( vacc0 = XNN_UNPREDICTABLE(va0 < 0.0f) ? va0 * vb0 : va0; vacc1 = XNN_UNPREDICTABLE(va1 < 0.0f) ? va1 * vb1 : va1; - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vprelu-scalar-u4.c b/src/f32-vbinary/gen/f32-vprelu-scalar-u4.c index 47869a68400..8315500c6bc 100644 --- a/src/f32-vbinary/gen/f32-vprelu-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vprelu-scalar-u4.c @@ -27,7 +27,6 @@ void xnn_f32_vprelu_ukernel__scalar_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -51,7 +50,6 @@ void xnn_f32_vprelu_ukernel__scalar_u4( vacc2 = XNN_UNPREDICTABLE(va2 < 0.0f) ? va2 * vb2 : va2; vacc3 = XNN_UNPREDICTABLE(va3 < 0.0f) ? va3 * vb3 : va3; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vprelu-scalar-u8.c b/src/f32-vbinary/gen/f32-vprelu-scalar-u8.c index 53fcdabbc4e..3e7639806b4 100644 --- a/src/f32-vbinary/gen/f32-vprelu-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vprelu-scalar-u8.c @@ -27,7 +27,6 @@ void xnn_f32_vprelu_ukernel__scalar_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -67,7 +66,6 @@ void xnn_f32_vprelu_ukernel__scalar_u8( vacc6 = XNN_UNPREDICTABLE(va6 < 0.0f) ? va6 * vb6 : va6; vacc7 = XNN_UNPREDICTABLE(va7 < 0.0f) ? 
va7 * vb7 : va7; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vprelu-sse2-u4.c b/src/f32-vbinary/gen/f32-vprelu-sse2-u4.c index 26237a3ea7e..e363eec8226 100644 --- a/src/f32-vbinary/gen/f32-vprelu-sse2-u4.c +++ b/src/f32-vbinary/gen/f32-vprelu-sse2-u4.c @@ -29,7 +29,6 @@ void xnn_f32_vprelu_ukernel__sse2_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const __m128 va = _mm_loadu_ps(input_a); input_a += 4; diff --git a/src/f32-vbinary/gen/f32-vprelu-sse2-u8.c b/src/f32-vbinary/gen/f32-vprelu-sse2-u8.c index 840248eda0c..8ab736313ff 100644 --- a/src/f32-vbinary/gen/f32-vprelu-sse2-u8.c +++ b/src/f32-vbinary/gen/f32-vprelu-sse2-u8.c @@ -29,7 +29,6 @@ void xnn_f32_vprelu_ukernel__sse2_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m128 va0 = _mm_loadu_ps(input_a); const __m128 va1 = _mm_loadu_ps(input_a + 4); @@ -48,7 +47,6 @@ void xnn_f32_vprelu_ukernel__sse2_u8( vacc0 = _mm_or_ps(_mm_and_ps(vacc0, vmask0), _mm_andnot_ps(vmask0, va0)); vacc1 = _mm_or_ps(_mm_and_ps(vacc1, vmask1), _mm_andnot_ps(vmask1, va1)); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; diff --git a/src/f32-vbinary/gen/f32-vprelu-sse41-u4.c b/src/f32-vbinary/gen/f32-vprelu-sse41-u4.c index 12067c7239d..810d8e0961d 100644 --- a/src/f32-vbinary/gen/f32-vprelu-sse41-u4.c +++ b/src/f32-vbinary/gen/f32-vprelu-sse41-u4.c @@ -29,7 +29,6 @@ void xnn_f32_vprelu_ukernel__sse41_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const __m128 va = _mm_loadu_ps(input_a); input_a += 4; diff --git a/src/f32-vbinary/gen/f32-vprelu-sse41-u8.c b/src/f32-vbinary/gen/f32-vprelu-sse41-u8.c index df3ef75a222..2c2bc2fcf5f 100644 --- a/src/f32-vbinary/gen/f32-vprelu-sse41-u8.c +++ b/src/f32-vbinary/gen/f32-vprelu-sse41-u8.c 
@@ -29,7 +29,6 @@ void xnn_f32_vprelu_ukernel__sse41_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m128 va0 = _mm_loadu_ps(input_a); const __m128 va1 = _mm_loadu_ps(input_a + 4); @@ -46,7 +45,6 @@ void xnn_f32_vprelu_ukernel__sse41_u8( vacc0 = _mm_blendv_ps(va0, vacc0, va0); vacc1 = _mm_blendv_ps(va1, vacc1, va1); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; diff --git a/src/f32-vbinary/gen/f32-vprelu-wasm-u1.c b/src/f32-vbinary/gen/f32-vprelu-wasm-u1.c index d5c7fb7e728..a9d448f812c 100644 --- a/src/f32-vbinary/gen/f32-vprelu-wasm-u1.c +++ b/src/f32-vbinary/gen/f32-vprelu-wasm-u1.c @@ -27,7 +27,6 @@ void xnn_f32_vprelu_ukernel__wasm_u1( assert(input_b != NULL); assert(output != NULL); - for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; const float vb = *input_b++; diff --git a/src/f32-vbinary/gen/f32-vprelu-wasm-u2.c b/src/f32-vbinary/gen/f32-vprelu-wasm-u2.c index e5b951c3d20..12eac1f31c9 100644 --- a/src/f32-vbinary/gen/f32-vprelu-wasm-u2.c +++ b/src/f32-vbinary/gen/f32-vprelu-wasm-u2.c @@ -27,7 +27,6 @@ void xnn_f32_vprelu_ukernel__wasm_u2( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -43,7 +42,6 @@ void xnn_f32_vprelu_ukernel__wasm_u2( vacc0 = XNN_UNPREDICTABLE(va0 < 0.0f) ? va0 * vb0 : va0; vacc1 = XNN_UNPREDICTABLE(va1 < 0.0f) ? 
va1 * vb1 : va1; - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vprelu-wasm-u4.c b/src/f32-vbinary/gen/f32-vprelu-wasm-u4.c index d059de7341f..17b780e48a4 100644 --- a/src/f32-vbinary/gen/f32-vprelu-wasm-u4.c +++ b/src/f32-vbinary/gen/f32-vprelu-wasm-u4.c @@ -27,7 +27,6 @@ void xnn_f32_vprelu_ukernel__wasm_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -51,7 +50,6 @@ void xnn_f32_vprelu_ukernel__wasm_u4( vacc2 = XNN_UNPREDICTABLE(va2 < 0.0f) ? va2 * vb2 : va2; vacc3 = XNN_UNPREDICTABLE(va3 < 0.0f) ? va3 * vb3 : va3; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vprelu-wasm-u8.c b/src/f32-vbinary/gen/f32-vprelu-wasm-u8.c index ac874be84da..08347d2eb11 100644 --- a/src/f32-vbinary/gen/f32-vprelu-wasm-u8.c +++ b/src/f32-vbinary/gen/f32-vprelu-wasm-u8.c @@ -27,7 +27,6 @@ void xnn_f32_vprelu_ukernel__wasm_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -67,7 +66,6 @@ void xnn_f32_vprelu_ukernel__wasm_u8( vacc6 = XNN_UNPREDICTABLE(va6 < 0.0f) ? va6 * vb6 : va6; vacc7 = XNN_UNPREDICTABLE(va7 < 0.0f) ? 
va7 * vb7 : va7; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vprelu-wasmrelaxedsimd-u16.c b/src/f32-vbinary/gen/f32-vprelu-wasmrelaxedsimd-u16.c index 6b91bd47fed..d6f25bcccee 100644 --- a/src/f32-vbinary/gen/f32-vprelu-wasmrelaxedsimd-u16.c +++ b/src/f32-vbinary/gen/f32-vprelu-wasmrelaxedsimd-u16.c @@ -28,7 +28,6 @@ void xnn_f32_vprelu_ukernel__wasmrelaxedsimd_u16( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); @@ -75,7 +74,6 @@ void xnn_f32_vprelu_ukernel__wasmrelaxedsimd_u16( vacc = wasm_i32x4_relaxed_laneselect(vacc, va, vmask); - wasm_v128_store(output, vacc); output += 4; } @@ -88,7 +86,6 @@ void xnn_f32_vprelu_ukernel__wasmrelaxedsimd_u16( vacc = wasm_i32x4_relaxed_laneselect(vacc, va, vmask); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vprelu-wasmrelaxedsimd-u4.c b/src/f32-vbinary/gen/f32-vprelu-wasmrelaxedsimd-u4.c index dbfe2d32815..60cc470de4e 100644 --- a/src/f32-vbinary/gen/f32-vprelu-wasmrelaxedsimd-u4.c +++ b/src/f32-vbinary/gen/f32-vprelu-wasmrelaxedsimd-u4.c @@ -28,7 +28,6 @@ void xnn_f32_vprelu_ukernel__wasmrelaxedsimd_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; @@ -41,7 +40,6 @@ void xnn_f32_vprelu_ukernel__wasmrelaxedsimd_u4( vacc = wasm_i32x4_relaxed_laneselect(vacc, va, vmask); - wasm_v128_store(output, vacc); output += 4; } @@ -54,7 +52,6 @@ void xnn_f32_vprelu_ukernel__wasmrelaxedsimd_u4( vacc = wasm_i32x4_relaxed_laneselect(vacc, va, vmask); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git 
a/src/f32-vbinary/gen/f32-vprelu-wasmrelaxedsimd-u8.c b/src/f32-vbinary/gen/f32-vprelu-wasmrelaxedsimd-u8.c index 587e2143a52..d40b3161f28 100644 --- a/src/f32-vbinary/gen/f32-vprelu-wasmrelaxedsimd-u8.c +++ b/src/f32-vbinary/gen/f32-vprelu-wasmrelaxedsimd-u8.c @@ -28,7 +28,6 @@ void xnn_f32_vprelu_ukernel__wasmrelaxedsimd_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); @@ -63,7 +62,6 @@ void xnn_f32_vprelu_ukernel__wasmrelaxedsimd_u8( vacc = wasm_i32x4_relaxed_laneselect(vacc, va, vmask); - wasm_v128_store(output, vacc); output += 4; } @@ -76,7 +74,6 @@ void xnn_f32_vprelu_ukernel__wasmrelaxedsimd_u8( vacc = wasm_i32x4_relaxed_laneselect(vacc, va, vmask); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vprelu-wasmsimd-u16.c b/src/f32-vbinary/gen/f32-vprelu-wasmsimd-u16.c index ecc93d87fbf..b863d7c6cd6 100644 --- a/src/f32-vbinary/gen/f32-vprelu-wasmsimd-u16.c +++ b/src/f32-vbinary/gen/f32-vprelu-wasmsimd-u16.c @@ -28,7 +28,6 @@ void xnn_f32_vprelu_ukernel__wasmsimd_u16( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); @@ -75,7 +74,6 @@ void xnn_f32_vprelu_ukernel__wasmsimd_u16( vacc = wasm_v128_bitselect(vacc, va, vmask); - wasm_v128_store(output, vacc); output += 4; } @@ -88,7 +86,6 @@ void xnn_f32_vprelu_ukernel__wasmsimd_u16( vacc = wasm_v128_bitselect(vacc, va, vmask); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vprelu-wasmsimd-u4.c b/src/f32-vbinary/gen/f32-vprelu-wasmsimd-u4.c index 4212067839f..d82a8387ee6 100644 
--- a/src/f32-vbinary/gen/f32-vprelu-wasmsimd-u4.c +++ b/src/f32-vbinary/gen/f32-vprelu-wasmsimd-u4.c @@ -28,7 +28,6 @@ void xnn_f32_vprelu_ukernel__wasmsimd_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; @@ -41,7 +40,6 @@ void xnn_f32_vprelu_ukernel__wasmsimd_u4( vacc = wasm_v128_bitselect(vacc, va, vmask); - wasm_v128_store(output, vacc); output += 4; } @@ -54,7 +52,6 @@ void xnn_f32_vprelu_ukernel__wasmsimd_u4( vacc = wasm_v128_bitselect(vacc, va, vmask); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vprelu-wasmsimd-u8.c b/src/f32-vbinary/gen/f32-vprelu-wasmsimd-u8.c index a90041db791..542c3c5f5f4 100644 --- a/src/f32-vbinary/gen/f32-vprelu-wasmsimd-u8.c +++ b/src/f32-vbinary/gen/f32-vprelu-wasmsimd-u8.c @@ -28,7 +28,6 @@ void xnn_f32_vprelu_ukernel__wasmsimd_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); @@ -63,7 +62,6 @@ void xnn_f32_vprelu_ukernel__wasmsimd_u8( vacc = wasm_v128_bitselect(vacc, va, vmask); - wasm_v128_store(output, vacc); output += 4; } @@ -76,7 +74,6 @@ void xnn_f32_vprelu_ukernel__wasmsimd_u8( vacc = wasm_v128_bitselect(vacc, va, vmask); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vpreluc-avx-u16.c b/src/f32-vbinary/gen/f32-vpreluc-avx-u16.c index 042e639fbef..7787ff91b24 100644 --- a/src/f32-vbinary/gen/f32-vpreluc-avx-u16.c +++ b/src/f32-vbinary/gen/f32-vpreluc-avx-u16.c @@ -43,7 +43,6 @@ void xnn_f32_vpreluc_ukernel__avx_u16( vacc0 = _mm256_blendv_ps(va0, vacc0, va0); vacc1 = _mm256_blendv_ps(va1, vacc1, va1); - 
_mm256_storeu_ps(output, vacc0); _mm256_storeu_ps(output + 8, vacc1); output += 16; diff --git a/src/f32-vbinary/gen/f32-vpreluc-avx512f-u16.c b/src/f32-vbinary/gen/f32-vpreluc-avx512f-u16.c index 320685b14b7..26f3ac14110 100644 --- a/src/f32-vbinary/gen/f32-vpreluc-avx512f-u16.c +++ b/src/f32-vbinary/gen/f32-vpreluc-avx512f-u16.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. + #include #include @@ -39,7 +40,6 @@ void xnn_f32_vpreluc_ukernel__avx512f_u16( const __mmask16 vsign0 = _mm512_cmp_ps_mask(va0, vzero, _CMP_LT_OQ); __m512 vacc0 = _mm512_mask_mul_ps(va0, vsign0, va0, vb); - _mm512_storeu_ps(output, vacc0); output += 16; } diff --git a/src/f32-vbinary/gen/f32-vpreluc-avx512f-u32.c b/src/f32-vbinary/gen/f32-vpreluc-avx512f-u32.c index 3c03251f83e..25f7609cb28 100644 --- a/src/f32-vbinary/gen/f32-vpreluc-avx512f-u32.c +++ b/src/f32-vbinary/gen/f32-vpreluc-avx512f-u32.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
+ #include #include @@ -42,7 +43,6 @@ void xnn_f32_vpreluc_ukernel__avx512f_u32( const __mmask16 vsign1 = _mm512_cmp_ps_mask(va1, vzero, _CMP_LT_OQ); __m512 vacc1 = _mm512_mask_mul_ps(va1, vsign1, va1, vb); - _mm512_storeu_ps(output, vacc0); _mm512_storeu_ps(output + 16, vacc1); output += 32; @@ -54,7 +54,6 @@ void xnn_f32_vpreluc_ukernel__avx512f_u32( const __mmask16 vsign = _mm512_cmp_ps_mask(va, vzero, _CMP_LT_OQ); __m512 vacc = _mm512_mask_mul_ps(va, vsign, va, vb); - _mm512_storeu_ps(output, vacc); output += 16; } diff --git a/src/f32-vbinary/gen/f32-vpreluc-neon-u4.c b/src/f32-vbinary/gen/f32-vpreluc-neon-u4.c index a50e09dddb9..65aa72776ae 100644 --- a/src/f32-vbinary/gen/f32-vpreluc-neon-u4.c +++ b/src/f32-vbinary/gen/f32-vpreluc-neon-u4.c @@ -37,7 +37,6 @@ void xnn_f32_vpreluc_ukernel__neon_u4( const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(va), vmovq_n_s32(0)); vacc = vbslq_f32(vm, vacc, va); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -47,7 +46,6 @@ void xnn_f32_vpreluc_ukernel__neon_u4( const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(va), vmovq_n_s32(0)); vacc = vbslq_f32(vm, vacc, va); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vpreluc-neon-u8.c b/src/f32-vbinary/gen/f32-vpreluc-neon-u8.c index e20729d8b3e..baa409b2802 100644 --- a/src/f32-vbinary/gen/f32-vpreluc-neon-u8.c +++ b/src/f32-vbinary/gen/f32-vpreluc-neon-u8.c @@ -43,7 +43,6 @@ void xnn_f32_vpreluc_ukernel__neon_u8( vacc0 = vbslq_f32(vm0, vacc0, va0); vacc1 = vbslq_f32(vm1, vacc1, va1); - vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } @@ -54,7 +53,6 @@ void xnn_f32_vpreluc_ukernel__neon_u8( const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(va), vmovq_n_s32(0)); vacc = vbslq_f32(vm, vacc, va); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -64,7 +62,6 @@ void 
xnn_f32_vpreluc_ukernel__neon_u8( const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(va), vmovq_n_s32(0)); vacc = vbslq_f32(vm, vacc, va); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vpreluc-scalar-u2.c b/src/f32-vbinary/gen/f32-vpreluc-scalar-u2.c index 705197c6ffa..bec3ac0d908 100644 --- a/src/f32-vbinary/gen/f32-vpreluc-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vpreluc-scalar-u2.c @@ -40,7 +40,6 @@ void xnn_f32_vpreluc_ukernel__scalar_u2( vacc0 = XNN_UNPREDICTABLE(va0 < 0.0f) ? vacc0 : va0; vacc1 = XNN_UNPREDICTABLE(va1 < 0.0f) ? vacc1 : va1; - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vpreluc-scalar-u4.c b/src/f32-vbinary/gen/f32-vpreluc-scalar-u4.c index 65c8b1e1924..2df966a75b8 100644 --- a/src/f32-vbinary/gen/f32-vpreluc-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vpreluc-scalar-u4.c @@ -46,7 +46,6 @@ void xnn_f32_vpreluc_ukernel__scalar_u4( vacc2 = XNN_UNPREDICTABLE(va2 < 0.0f) ? vacc2 : va2; vacc3 = XNN_UNPREDICTABLE(va3 < 0.0f) ? vacc3 : va3; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vpreluc-scalar-u8.c b/src/f32-vbinary/gen/f32-vpreluc-scalar-u8.c index 6a7010baad3..3c6045f7dc6 100644 --- a/src/f32-vbinary/gen/f32-vpreluc-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vpreluc-scalar-u8.c @@ -58,7 +58,6 @@ void xnn_f32_vpreluc_ukernel__scalar_u8( vacc6 = XNN_UNPREDICTABLE(va6 < 0.0f) ? vacc6 : va6; vacc7 = XNN_UNPREDICTABLE(va7 < 0.0f) ? 
vacc7 : va7; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vpreluc-sse2-u8.c b/src/f32-vbinary/gen/f32-vpreluc-sse2-u8.c index ba2b6c93b2c..ddc62d9c76c 100644 --- a/src/f32-vbinary/gen/f32-vpreluc-sse2-u8.c +++ b/src/f32-vbinary/gen/f32-vpreluc-sse2-u8.c @@ -45,7 +45,6 @@ void xnn_f32_vpreluc_ukernel__sse2_u8( vacc0 = _mm_or_ps(_mm_and_ps(vacc0, vmask0), _mm_andnot_ps(vmask0, va0)); vacc1 = _mm_or_ps(_mm_and_ps(vacc1, vmask1), _mm_andnot_ps(vmask1, va1)); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; diff --git a/src/f32-vbinary/gen/f32-vpreluc-sse41-u8.c b/src/f32-vbinary/gen/f32-vpreluc-sse41-u8.c index 6bc883e0de3..b7cf7b5e048 100644 --- a/src/f32-vbinary/gen/f32-vpreluc-sse41-u8.c +++ b/src/f32-vbinary/gen/f32-vpreluc-sse41-u8.c @@ -43,7 +43,6 @@ void xnn_f32_vpreluc_ukernel__sse41_u8( vacc0 = _mm_blendv_ps(va0, vacc0, va0); vacc1 = _mm_blendv_ps(va1, vacc1, va1); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; diff --git a/src/f32-vbinary/gen/f32-vpreluc-wasm-u2.c b/src/f32-vbinary/gen/f32-vpreluc-wasm-u2.c index ace5a61f86a..8813859af2c 100644 --- a/src/f32-vbinary/gen/f32-vpreluc-wasm-u2.c +++ b/src/f32-vbinary/gen/f32-vpreluc-wasm-u2.c @@ -40,7 +40,6 @@ void xnn_f32_vpreluc_ukernel__wasm_u2( vacc0 = XNN_UNPREDICTABLE(va0 < 0.0f) ? vacc0 : va0; vacc1 = XNN_UNPREDICTABLE(va1 < 0.0f) ? vacc1 : va1; - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vpreluc-wasm-u4.c b/src/f32-vbinary/gen/f32-vpreluc-wasm-u4.c index 13b1733a329..bec0bf1386e 100644 --- a/src/f32-vbinary/gen/f32-vpreluc-wasm-u4.c +++ b/src/f32-vbinary/gen/f32-vpreluc-wasm-u4.c @@ -46,7 +46,6 @@ void xnn_f32_vpreluc_ukernel__wasm_u4( vacc2 = XNN_UNPREDICTABLE(va2 < 0.0f) ? vacc2 : va2; vacc3 = XNN_UNPREDICTABLE(va3 < 0.0f) ? 
vacc3 : va3; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vpreluc-wasm-u8.c b/src/f32-vbinary/gen/f32-vpreluc-wasm-u8.c index f40b8f3da25..54576903a99 100644 --- a/src/f32-vbinary/gen/f32-vpreluc-wasm-u8.c +++ b/src/f32-vbinary/gen/f32-vpreluc-wasm-u8.c @@ -58,7 +58,6 @@ void xnn_f32_vpreluc_ukernel__wasm_u8( vacc6 = XNN_UNPREDICTABLE(va6 < 0.0f) ? vacc6 : va6; vacc7 = XNN_UNPREDICTABLE(va7 < 0.0f) ? vacc7 : va7; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vpreluc-wasmrelaxedsimd-u16.c b/src/f32-vbinary/gen/f32-vpreluc-wasmrelaxedsimd-u16.c index 0507039c5bd..3fcd2f92e5c 100644 --- a/src/f32-vbinary/gen/f32-vpreluc-wasmrelaxedsimd-u16.c +++ b/src/f32-vbinary/gen/f32-vpreluc-wasmrelaxedsimd-u16.c @@ -52,7 +52,6 @@ void xnn_f32_vpreluc_ukernel__wasmrelaxedsimd_u16( vy2 = wasm_i32x4_relaxed_laneselect(vy2, va2, vmask2); vy3 = wasm_i32x4_relaxed_laneselect(vy3, va3, vmask3); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); @@ -67,7 +66,6 @@ void xnn_f32_vpreluc_ukernel__wasmrelaxedsimd_u16( const v128_t vmask = wasm_i32x4_shr(va, 31); vy = wasm_i32x4_relaxed_laneselect(vy, va, vmask); - wasm_v128_store(output, vy); output += 4; } @@ -78,7 +76,6 @@ void xnn_f32_vpreluc_ukernel__wasmrelaxedsimd_u16( const v128_t vmask = wasm_i32x4_shr(va, 31); vy = wasm_i32x4_relaxed_laneselect(vy, va, vmask); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vpreluc-wasmrelaxedsimd-u4.c b/src/f32-vbinary/gen/f32-vpreluc-wasmrelaxedsimd-u4.c index e8feaacc6a4..0d5811a4657 100644 --- a/src/f32-vbinary/gen/f32-vpreluc-wasmrelaxedsimd-u4.c +++ b/src/f32-vbinary/gen/f32-vpreluc-wasmrelaxedsimd-u4.c @@ -38,7 +38,6 @@ void xnn_f32_vpreluc_ukernel__wasmrelaxedsimd_u4( const v128_t vmask = wasm_i32x4_shr(va, 31); vy = 
wasm_i32x4_relaxed_laneselect(vy, va, vmask); - wasm_v128_store(output, vy); output += 4; } @@ -49,7 +48,6 @@ void xnn_f32_vpreluc_ukernel__wasmrelaxedsimd_u4( const v128_t vmask = wasm_i32x4_shr(va, 31); vy = wasm_i32x4_relaxed_laneselect(vy, va, vmask); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vpreluc-wasmrelaxedsimd-u8.c b/src/f32-vbinary/gen/f32-vpreluc-wasmrelaxedsimd-u8.c index 9072a9b8b78..1af075eeb97 100644 --- a/src/f32-vbinary/gen/f32-vpreluc-wasmrelaxedsimd-u8.c +++ b/src/f32-vbinary/gen/f32-vpreluc-wasmrelaxedsimd-u8.c @@ -44,7 +44,6 @@ void xnn_f32_vpreluc_ukernel__wasmrelaxedsimd_u8( vy0 = wasm_i32x4_relaxed_laneselect(vy0, va0, vmask0); vy1 = wasm_i32x4_relaxed_laneselect(vy1, va1, vmask1); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); output += 8; @@ -57,7 +56,6 @@ void xnn_f32_vpreluc_ukernel__wasmrelaxedsimd_u8( const v128_t vmask = wasm_i32x4_shr(va, 31); vy = wasm_i32x4_relaxed_laneselect(vy, va, vmask); - wasm_v128_store(output, vy); output += 4; } @@ -68,7 +66,6 @@ void xnn_f32_vpreluc_ukernel__wasmrelaxedsimd_u8( const v128_t vmask = wasm_i32x4_shr(va, 31); vy = wasm_i32x4_relaxed_laneselect(vy, va, vmask); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u16.c b/src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u16.c index fa09a600856..8e35443375d 100644 --- a/src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u16.c +++ b/src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u16.c @@ -52,7 +52,6 @@ void xnn_f32_vpreluc_ukernel__wasmsimd_u16( vy2 = wasm_v128_bitselect(vy2, va2, vmask2); vy3 = wasm_v128_bitselect(vy3, va3, vmask3); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); @@ -67,7 +66,6 @@ void xnn_f32_vpreluc_ukernel__wasmsimd_u16( const v128_t vmask = 
wasm_i32x4_shr(va, 31); vy = wasm_v128_bitselect(vy, va, vmask); - wasm_v128_store(output, vy); output += 4; } @@ -78,7 +76,6 @@ void xnn_f32_vpreluc_ukernel__wasmsimd_u16( const v128_t vmask = wasm_i32x4_shr(va, 31); vy = wasm_v128_bitselect(vy, va, vmask); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u4.c b/src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u4.c index 5632b19a355..e6788e536f0 100644 --- a/src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u4.c +++ b/src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u4.c @@ -38,7 +38,6 @@ void xnn_f32_vpreluc_ukernel__wasmsimd_u4( const v128_t vmask = wasm_i32x4_shr(va, 31); vy = wasm_v128_bitselect(vy, va, vmask); - wasm_v128_store(output, vy); output += 4; } @@ -49,7 +48,6 @@ void xnn_f32_vpreluc_ukernel__wasmsimd_u4( const v128_t vmask = wasm_i32x4_shr(va, 31); vy = wasm_v128_bitselect(vy, va, vmask); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u8.c b/src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u8.c index d91f617e4ff..ebdfd6552f2 100644 --- a/src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u8.c +++ b/src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u8.c @@ -44,7 +44,6 @@ void xnn_f32_vpreluc_ukernel__wasmsimd_u8( vy0 = wasm_v128_bitselect(vy0, va0, vmask0); vy1 = wasm_v128_bitselect(vy1, va1, vmask1); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); output += 8; @@ -57,7 +56,6 @@ void xnn_f32_vpreluc_ukernel__wasmsimd_u8( const v128_t vmask = wasm_i32x4_shr(va, 31); vy = wasm_v128_bitselect(vy, va, vmask); - wasm_v128_store(output, vy); output += 4; } @@ -68,7 +66,6 @@ void xnn_f32_vpreluc_ukernel__wasmsimd_u8( const v128_t vmask = wasm_i32x4_shr(va, 31); vy = wasm_v128_bitselect(vy, va, vmask); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = 
wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-aarch64-neon-u4.c b/src/f32-vbinary/gen/f32-vrdivc-aarch64-neon-u4.c similarity index 73% rename from src/f32-vbinary/gen/f32-vrdivc-minmax-aarch64-neon-u4.c rename to src/f32-vbinary/gen/f32-vrdivc-aarch64-neon-u4.c index 1a479554fab..8ed2e111322 100644 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-aarch64-neon-u4.c +++ b/src/f32-vbinary/gen/f32-vrdivc-aarch64-neon-u4.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrdivc_minmax_ukernel__aarch64_neon_u4( +void xnn_f32_vrdivc_ukernel__aarch64_neon_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vrdivc_minmax_ukernel__aarch64_neon_u4( assert(input_b != NULL); assert(output != NULL); - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { @@ -37,9 +35,6 @@ void xnn_f32_vrdivc_minmax_ukernel__aarch64_neon_u4( float32x4_t vacc = vdivq_f32(vb, va); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -47,9 +42,6 @@ void xnn_f32_vrdivc_minmax_ukernel__aarch64_neon_u4( float32x4_t vacc = vdivq_f32(vb, va); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-aarch64-neon-u8.c b/src/f32-vbinary/gen/f32-vrdivc-aarch64-neon-u8.c similarity index 72% rename from 
src/f32-vbinary/gen/f32-vrdivc-minmax-aarch64-neon-u8.c rename to src/f32-vbinary/gen/f32-vrdivc-aarch64-neon-u8.c index a8f2da2d9fa..ca79edfe78c 100644 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-aarch64-neon-u8.c +++ b/src/f32-vbinary/gen/f32-vrdivc-aarch64-neon-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrdivc_minmax_ukernel__aarch64_neon_u8( +void xnn_f32_vrdivc_ukernel__aarch64_neon_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vrdivc_minmax_ukernel__aarch64_neon_u8( assert(input_b != NULL); assert(output != NULL); - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -40,12 +38,6 @@ void xnn_f32_vrdivc_minmax_ukernel__aarch64_neon_u8( float32x4_t vacc1 = vdivq_f32(vb, va1); - vacc0 = vmaxq_f32(vacc0, voutput_min); - vacc1 = vmaxq_f32(vacc1, voutput_min); - - vacc0 = vminq_f32(vacc0, voutput_max); - vacc1 = vminq_f32(vacc1, voutput_max); - vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } @@ -54,9 +46,6 @@ void xnn_f32_vrdivc_minmax_ukernel__aarch64_neon_u8( float32x4_t vacc = vdivq_f32(vb, va); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -64,9 +53,6 @@ void xnn_f32_vrdivc_minmax_ukernel__aarch64_neon_u8( float32x4_t vacc = vdivq_f32(vb, va); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, 
vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-avx-u8.c b/src/f32-vbinary/gen/f32-vrdivc-avx-u16.c similarity index 77% rename from src/f32-vbinary/gen/f32-vrdivc-minmax-avx-u8.c rename to src/f32-vbinary/gen/f32-vrdivc-avx-u16.c index 4a884ec5a50..0984d408081 100644 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-avx-u8.c +++ b/src/f32-vbinary/gen/f32-vrdivc-avx-u16.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrdivc_minmax_ukernel__avx_u8( +void xnn_f32_vrdivc_ukernel__avx_u16( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -30,19 +30,26 @@ void xnn_f32_vrdivc_minmax_ukernel__avx_u8( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const __m256 vb = _mm256_broadcast_ss(input_b); + for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { + const __m256 va0 = _mm256_loadu_ps(input_a); + const __m256 va1 = _mm256_loadu_ps(input_a + 8); + input_a += 16; + + __m256 vacc0 = _mm256_div_ps(vb, va0); + __m256 vacc1 = _mm256_div_ps(vb, va1); + + + _mm256_storeu_ps(output, vacc0); + _mm256_storeu_ps(output + 8, vacc1); + output += 16; + } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m256 va = _mm256_loadu_ps(input_a); input_a += 8; __m256 vacc = _mm256_div_ps(vb, va); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); _mm256_storeu_ps(output, vacc); output += 8; } @@ -54,8 +61,6 @@ void xnn_f32_vrdivc_minmax_ukernel__avx_u8( __m256 va = _mm256_maskload_ps(input_a, vmask); __m256 vacc = 
_mm256_div_ps(vb, va); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); __m128 vacc_lo = _mm256_castps256_ps128(vacc); if (batch & (4 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/f32-vrdivc-avx-u8.c b/src/f32-vbinary/gen/f32-vrdivc-avx-u8.c new file mode 100644 index 00000000000..a435af25989 --- /dev/null +++ b/src/f32-vbinary/gen/f32-vrdivc-avx-u8.c @@ -0,0 +1,67 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-vbinary/vopc-avx.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/vbinary.h" + + +void xnn_f32_vrdivc_ukernel__avx_u8( + size_t batch, + const float* input_a, + const float* input_b, + float* output, + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + + const __m256 vb = _mm256_broadcast_ss(input_b); + + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + const __m256 va = _mm256_loadu_ps(input_a); + input_a += 8; + + __m256 vacc = _mm256_div_ps(vb, va); + _mm256_storeu_ps(output, vacc); + output += 8; + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 7 * sizeof(float)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); + + __m256 va = _mm256_maskload_ps(input_a, vmask); + + __m256 vacc = _mm256_div_ps(vb, va); + + __m128 vacc_lo = _mm256_castps256_ps128(vacc); + if (batch & (4 * sizeof(float))) { + _mm_storeu_ps(output, vacc_lo); + vacc_lo = _mm256_extractf128_ps(vacc, 1); + output += 4; + } + if (batch & (2 * 
sizeof(float))) { + _mm_storel_pi((__m64*) output, vacc_lo); + vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo); + output += 2; + } + if (batch & (1 * sizeof(float))) { + _mm_store_ss(output, vacc_lo); + } + } +} diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-avx512f-u16.c b/src/f32-vbinary/gen/f32-vrdivc-avx512f-u16.c similarity index 75% rename from src/f32-vbinary/gen/f32-vrdivc-minmax-avx512f-u16.c rename to src/f32-vbinary/gen/f32-vrdivc-avx512f-u16.c index 4ca09dc3e8d..14c2fd12df4 100644 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-avx512f-u16.c +++ b/src/f32-vbinary/gen/f32-vrdivc-avx512f-u16.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. + #include #include @@ -16,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrdivc_minmax_ukernel__avx512f_u16( +void xnn_f32_vrdivc_ukernel__avx512f_u16( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,8 +30,6 @@ void xnn_f32_vrdivc_minmax_ukernel__avx512f_u16( assert(input_b != NULL); assert(output != NULL); - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); const __m512 vb = _mm512_set1_ps(*input_b); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { @@ -40,10 +39,6 @@ void xnn_f32_vrdivc_minmax_ukernel__avx512f_u16( __m512 vacc0 = _mm512_div_ps(vb, va0); - vacc0 = _mm512_max_ps(voutput_min, vacc0); - - vacc0 = _mm512_min_ps(voutput_max, vacc0); - _mm512_storeu_ps(output, vacc0); output += 16; } @@ -57,8 +52,6 @@ void xnn_f32_vrdivc_minmax_ukernel__avx512f_u16( __m512 va = _mm512_maskz_loadu_ps(vmask, input_a); __m512 vacc = _mm512_div_ps(vb, va); - vacc = _mm512_maskz_max_ps(vmask, 
voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-avx512f-u32.c b/src/f32-vbinary/gen/f32-vrdivc-avx512f-u32.c similarity index 73% rename from src/f32-vbinary/gen/f32-vrdivc-minmax-avx512f-u32.c rename to src/f32-vbinary/gen/f32-vrdivc-avx512f-u32.c index 9e044aca91e..0d7db977c28 100644 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-avx512f-u32.c +++ b/src/f32-vbinary/gen/f32-vrdivc-avx512f-u32.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. + #include #include @@ -16,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrdivc_minmax_ukernel__avx512f_u32( +void xnn_f32_vrdivc_ukernel__avx512f_u32( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,8 +30,6 @@ void xnn_f32_vrdivc_minmax_ukernel__avx512f_u32( assert(input_b != NULL); assert(output != NULL); - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); const __m512 vb = _mm512_set1_ps(*input_b); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { @@ -42,12 +41,6 @@ void xnn_f32_vrdivc_minmax_ukernel__avx512f_u32( __m512 vacc1 = _mm512_div_ps(vb, va1); - vacc0 = _mm512_max_ps(voutput_min, vacc0); - vacc1 = _mm512_max_ps(voutput_min, vacc1); - - vacc0 = _mm512_min_ps(voutput_max, vacc0); - vacc1 = _mm512_min_ps(voutput_max, vacc1); - _mm512_storeu_ps(output, vacc0); _mm512_storeu_ps(output + 16, vacc1); output += 32; @@ -58,9 +51,6 @@ void xnn_f32_vrdivc_minmax_ukernel__avx512f_u32( __m512 vacc = _mm512_div_ps(vb, va); - vacc = _mm512_max_ps(voutput_min, vacc); - 
vacc = _mm512_min_ps(voutput_max, vacc); - _mm512_storeu_ps(output, vacc); output += 16; } @@ -74,8 +64,6 @@ void xnn_f32_vrdivc_minmax_ukernel__avx512f_u32( __m512 va = _mm512_maskz_loadu_ps(vmask, input_a); __m512 vacc = _mm512_div_ps(vb, va); - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-avx-u16.c b/src/f32-vbinary/gen/f32-vrdivc-minmax-avx-u16.c deleted file mode 100644 index 8f46845852c..00000000000 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-avx-u16.c +++ /dev/null @@ -1,94 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-avx.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrdivc_minmax_ukernel__avx_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const __m256 vb = _mm256_broadcast_ss(input_b); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const __m256 va0 = _mm256_loadu_ps(input_a); - const __m256 va1 = _mm256_loadu_ps(input_a + 8); - input_a += 16; - - __m256 vacc0 = _mm256_div_ps(vb, va0); - __m256 vacc1 = _mm256_div_ps(vb, va1); - - - 
vacc0 = _mm256_max_ps(voutput_min, vacc0); - vacc1 = _mm256_max_ps(voutput_min, vacc1); - - vacc0 = _mm256_min_ps(voutput_max, vacc0); - vacc1 = _mm256_min_ps(voutput_max, vacc1); - - _mm256_storeu_ps(output, vacc0); - _mm256_storeu_ps(output + 8, vacc1); - output += 16; - } - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const __m256 va = _mm256_loadu_ps(input_a); - input_a += 8; - - __m256 vacc = _mm256_div_ps(vb, va); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); - _mm256_storeu_ps(output, vacc); - output += 8; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); - - __m256 va = _mm256_maskload_ps(input_a, vmask); - - __m256 vacc = _mm256_div_ps(vb, va); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); - - __m128 vacc_lo = _mm256_castps256_ps128(vacc); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vacc_lo); - vacc_lo = _mm256_extractf128_ps(vacc, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vacc_lo); - vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vacc_lo); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u1.c b/src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u1.c deleted file mode 100644 index 27d4880dcb1..00000000000 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u1.c +++ /dev/null @@ -1,41 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrdivc_minmax_ukernel__wasm_u1( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= sizeof(float); batch -= sizeof(float)) { - const float va = *input_a++; - float vacc = vb / va; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - } -} diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u2.c b/src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u2.c deleted file mode 100644 index 78018d3c69a..00000000000 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u2.c +++ /dev/null @@ -1,61 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrdivc_minmax_ukernel__wasm_u2( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - input_a += 2; - - float vacc0 = vb / va0; - float vacc1 = vb / va1; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output += 2; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch == sizeof(float)); - const float va = *input_a; - float vacc = vb / va; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output = vacc; - } -} diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u4.c b/src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u4.c deleted file mode 100644 index 2629a27d753..00000000000 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u4.c +++ /dev/null @@ -1,73 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrdivc_minmax_ukernel__wasm_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - const float va2 = input_a[2]; - const float va3 = input_a[3]; - input_a += 4; - - float vacc0 = vb / va0; - float vacc1 = vb / va1; - float vacc2 = vb / va2; - float vacc3 = vb / va3; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - vacc2 = __builtin_wasm_max_f32(vacc2, voutput_min); - vacc3 = __builtin_wasm_max_f32(vacc3, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - vacc2 = __builtin_wasm_min_f32(vacc2, voutput_max); - vacc3 = __builtin_wasm_min_f32(vacc3, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - do { - const float va = *input_a++; - float vacc = vb / va; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - batch -= sizeof(float); - } while (batch != 0); - } -} diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u8.c b/src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u8.c deleted file mode 100644 index f6872efdb0e..00000000000 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-u8.c +++ /dev/null @@ -1,93 +0,0 @@ -// Auto-generated file. 
Do not edit! -// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrdivc_minmax_ukernel__wasm_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - const float va2 = input_a[2]; - const float va3 = input_a[3]; - const float va4 = input_a[4]; - const float va5 = input_a[5]; - const float va6 = input_a[6]; - const float va7 = input_a[7]; - input_a += 8; - - float vacc0 = vb / va0; - float vacc1 = vb / va1; - float vacc2 = vb / va2; - float vacc3 = vb / va3; - float vacc4 = vb / va4; - float vacc5 = vb / va5; - float vacc6 = vb / va6; - float vacc7 = vb / va7; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - vacc2 = __builtin_wasm_max_f32(vacc2, voutput_min); - vacc3 = __builtin_wasm_max_f32(vacc3, voutput_min); - vacc4 = __builtin_wasm_max_f32(vacc4, voutput_min); - vacc5 = __builtin_wasm_max_f32(vacc5, voutput_min); - vacc6 = __builtin_wasm_max_f32(vacc6, voutput_min); - vacc7 = __builtin_wasm_max_f32(vacc7, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - vacc2 = __builtin_wasm_min_f32(vacc2, voutput_max); - vacc3 = 
__builtin_wasm_min_f32(vacc3, voutput_max); - vacc4 = __builtin_wasm_min_f32(vacc4, voutput_max); - vacc5 = __builtin_wasm_min_f32(vacc5, voutput_max); - vacc6 = __builtin_wasm_min_f32(vacc6, voutput_max); - vacc7 = __builtin_wasm_min_f32(vacc7, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output[4] = vacc4; - output[5] = vacc5; - output[6] = vacc6; - output[7] = vacc7; - output += 8; - } - if XNN_UNLIKELY(batch != 0) { - do { - const float va = *input_a++; - float vacc = vb / va; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - batch -= sizeof(float); - } while (batch != 0); - } -} diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u16.c b/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u16.c deleted file mode 100644 index 16a6aa1c00f..00000000000 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u16.c +++ /dev/null @@ -1,95 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - const v128_t va2 = wasm_v128_load(input_a + 8); - const v128_t va3 = wasm_v128_load(input_a + 12); - input_a += 16; - - v128_t vy0 = wasm_f32x4_div(vb, va0); - v128_t vy1 = wasm_f32x4_div(vb, va1); - v128_t vy2 = wasm_f32x4_div(vb, va2); - v128_t vy3 = wasm_f32x4_div(vb, va3); - - - vy0 = wasm_f32x4_max(vy0, voutput_min); - vy1 = wasm_f32x4_max(vy1, voutput_min); - vy2 = wasm_f32x4_max(vy2, voutput_min); - vy3 = wasm_f32x4_max(vy3, voutput_min); - - vy0 = wasm_f32x4_min(vy0, voutput_max); - vy1 = wasm_f32x4_min(vy1, voutput_max); - vy2 = wasm_f32x4_min(vy2, voutput_max); - vy3 = wasm_f32x4_min(vy3, voutput_max); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - wasm_v128_store(output + 8, vy2); - wasm_v128_store(output + 12, vy3); - output += 16; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_div(vb, va); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - wasm_v128_store(output, vy); - output += 4; - } - if 
XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_div(vb, va); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u4.c b/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u4.c deleted file mode 100644 index 3a88aa67cba..00000000000 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u4.c +++ /dev/null @@ -1,66 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_div(vb, va); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - wasm_v128_store(output, vy); - 
output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_div(vb, va); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u8.c b/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u8.c deleted file mode 100644 index 9e7469ba829..00000000000 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-u8.c +++ /dev/null @@ -1,85 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - input_a += 8; - - v128_t vy0 = wasm_f32x4_div(vb, va0); - v128_t vy1 = wasm_f32x4_div(vb, va1); - - - vy0 = wasm_f32x4_max(vy0, voutput_min); - vy1 = wasm_f32x4_max(vy1, voutput_min); - - vy0 = wasm_f32x4_min(vy0, voutput_max); - vy1 = wasm_f32x4_min(vy1, voutput_max); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - output += 8; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_div(vb, va); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_div(vb, va); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git 
a/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u16.c b/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u16.c deleted file mode 100644 index c81759a532f..00000000000 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u16.c +++ /dev/null @@ -1,95 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - const v128_t va2 = wasm_v128_load(input_a + 8); - const v128_t va3 = wasm_v128_load(input_a + 12); - input_a += 16; - - v128_t vy0 = wasm_f32x4_div(vb, va0); - v128_t vy1 = wasm_f32x4_div(vb, va1); - v128_t vy2 = wasm_f32x4_div(vb, va2); - v128_t vy3 = wasm_f32x4_div(vb, va3); - - - vy0 = wasm_f32x4_pmax(voutput_min, vy0); - vy1 = wasm_f32x4_pmax(voutput_min, vy1); - vy2 = wasm_f32x4_pmax(voutput_min, vy2); - vy3 = wasm_f32x4_pmax(voutput_min, vy3); - - vy0 = wasm_f32x4_pmin(voutput_max, vy0); - vy1 = wasm_f32x4_pmin(voutput_max, vy1); - vy2 = 
wasm_f32x4_pmin(voutput_max, vy2); - vy3 = wasm_f32x4_pmin(voutput_max, vy3); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - wasm_v128_store(output + 8, vy2); - wasm_v128_store(output + 12, vy3); - output += 16; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_div(vb, va); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_div(vb, va); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u4.c b/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u4.c deleted file mode 100644 index fb4135a843c..00000000000 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u4.c +++ /dev/null @@ -1,66 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_div(vb, va); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_div(vb, va); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u8.c b/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u8.c deleted file mode 100644 index fe668e985d7..00000000000 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-u8.c +++ /dev/null @@ -1,85 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - input_a += 8; - - v128_t vy0 = wasm_f32x4_div(vb, va0); - v128_t vy1 = wasm_f32x4_div(vb, va1); - - - vy0 = wasm_f32x4_pmax(voutput_min, vy0); - vy1 = wasm_f32x4_pmax(voutput_min, vy1); - - vy0 = wasm_f32x4_pmin(voutput_max, vy0); - vy1 = wasm_f32x4_pmin(voutput_max, vy1); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - output += 8; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_div(vb, va); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_div(vb, va); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = 
wasm_f32x4_pmin(voutput_max, vy); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-rvv-u4v.c b/src/f32-vbinary/gen/f32-vrdivc-rvv-u4v.c similarity index 74% rename from src/f32-vbinary/gen/f32-vrdivc-minmax-rvv-u4v.c rename to src/f32-vbinary/gen/f32-vrdivc-rvv-u4v.c index f1fd82939b1..b9f3ca90203 100644 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-rvv-u4v.c +++ b/src/f32-vbinary/gen/f32-vrdivc-rvv-u4v.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrdivc_minmax_ukernel__rvv_u4v( +void xnn_f32_vrdivc_ukernel__rvv_u4v( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vrdivc_minmax_ukernel__rvv_u4v( assert(input_b != NULL); assert(output != NULL); - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; const float b = *input_b; size_t n = batch >> 2; @@ -39,8 +37,6 @@ void xnn_f32_vrdivc_minmax_ukernel__rvv_u4v( vfloat32m4_t va = __riscv_vle32_v_f32m4(input_a, vl); input_a += vl; vfloat32m4_t vacc = __riscv_vfrdiv_vf_f32m4(va, b, vl); - vacc = __riscv_vfmax_vf_f32m4(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m4(vacc, output_max, vl); __riscv_vse32_v_f32m4(output, vacc, vl); output += vl; } while (n > 0); diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-rvv-u8v.c b/src/f32-vbinary/gen/f32-vrdivc-rvv-u8v.c similarity index 74% rename from src/f32-vbinary/gen/f32-vrdivc-minmax-rvv-u8v.c rename to src/f32-vbinary/gen/f32-vrdivc-rvv-u8v.c index 3a41ede569a..793e5e2cf87 100644 --- 
a/src/f32-vbinary/gen/f32-vrdivc-minmax-rvv-u8v.c +++ b/src/f32-vbinary/gen/f32-vrdivc-rvv-u8v.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrdivc_minmax_ukernel__rvv_u8v( +void xnn_f32_vrdivc_ukernel__rvv_u8v( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vrdivc_minmax_ukernel__rvv_u8v( assert(input_b != NULL); assert(output != NULL); - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; const float b = *input_b; size_t n = batch >> 2; @@ -39,8 +37,6 @@ void xnn_f32_vrdivc_minmax_ukernel__rvv_u8v( vfloat32m8_t va = __riscv_vle32_v_f32m8(input_a, vl); input_a += vl; vfloat32m8_t vacc = __riscv_vfrdiv_vf_f32m8(va, b, vl); - vacc = __riscv_vfmax_vf_f32m8(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m8(vacc, output_max, vl); __riscv_vse32_v_f32m8(output, vacc, vl); output += vl; } while (n > 0); diff --git a/src/f32-vbinary/gen/f32-vrdivc-scalar-u2.c b/src/f32-vbinary/gen/f32-vrdivc-scalar-u2.c index 05ac9cb92c5..9cbbfdac6a3 100644 --- a/src/f32-vbinary/gen/f32-vrdivc-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vrdivc-scalar-u2.c @@ -38,7 +38,6 @@ void xnn_f32_vrdivc_ukernel__scalar_u2( float vacc1 = vb / va1; - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vrdivc-scalar-u4.c b/src/f32-vbinary/gen/f32-vrdivc-scalar-u4.c index 4a6d92af3b1..c9c4e8fbafc 100644 --- a/src/f32-vbinary/gen/f32-vrdivc-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vrdivc-scalar-u4.c @@ -42,7 +42,6 @@ void xnn_f32_vrdivc_ukernel__scalar_u4( float vacc3 = vb / va3; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vrdivc-scalar-u8.c b/src/f32-vbinary/gen/f32-vrdivc-scalar-u8.c index 
e9024a9fbdd..aa3e023d968 100644 --- a/src/f32-vbinary/gen/f32-vrdivc-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vrdivc-scalar-u8.c @@ -50,7 +50,6 @@ void xnn_f32_vrdivc_ukernel__scalar_u8( float vacc7 = vb / va7; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-sse-u4.c b/src/f32-vbinary/gen/f32-vrdivc-sse-u4.c similarity index 71% rename from src/f32-vbinary/gen/f32-vrdivc-minmax-sse-u4.c rename to src/f32-vbinary/gen/f32-vrdivc-sse-u4.c index 985cb36efb8..ae35cbc839d 100644 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-sse-u4.c +++ b/src/f32-vbinary/gen/f32-vrdivc-sse-u4.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrdivc_minmax_ukernel__sse_u4( +void xnn_f32_vrdivc_ukernel__sse_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,10 +29,6 @@ void xnn_f32_vrdivc_minmax_ukernel__sse_u4( assert(input_b != NULL); assert(output != NULL); - const __m128 voutput_min = _mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const __m128 vb = _mm_load1_ps(input_b); for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { @@ -40,8 +36,6 @@ void xnn_f32_vrdivc_minmax_ukernel__sse_u4( input_a += 4; __m128 vacc = _mm_div_ps(vb, va); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -50,8 +44,6 @@ void xnn_f32_vrdivc_minmax_ukernel__sse_u4( const __m128 va = _mm_loadu_ps(input_a); __m128 vacc = _mm_div_ps(vb, va); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { 
_mm_storel_pi((__m64*) output, vacc); vacc = _mm_movehl_ps(vacc, vacc); diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-sse-u8.c b/src/f32-vbinary/gen/f32-vrdivc-sse-u8.c similarity index 70% rename from src/f32-vbinary/gen/f32-vrdivc-minmax-sse-u8.c rename to src/f32-vbinary/gen/f32-vrdivc-sse-u8.c index 66b5025cb0d..b247b78135b 100644 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-sse-u8.c +++ b/src/f32-vbinary/gen/f32-vrdivc-sse-u8.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrdivc_minmax_ukernel__sse_u8( +void xnn_f32_vrdivc_ukernel__sse_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,10 +29,6 @@ void xnn_f32_vrdivc_minmax_ukernel__sse_u8( assert(input_b != NULL); assert(output != NULL); - const __m128 voutput_min = _mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const __m128 vb = _mm_load1_ps(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -44,12 +40,6 @@ void xnn_f32_vrdivc_minmax_ukernel__sse_u8( __m128 vacc1 = _mm_div_ps(vb, va1); - vacc0 = _mm_max_ps(vacc0, voutput_min); - vacc1 = _mm_max_ps(vacc1, voutput_min); - - vacc0 = _mm_min_ps(vacc0, voutput_max); - vacc1 = _mm_min_ps(vacc1, voutput_max); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; @@ -59,8 +49,6 @@ void xnn_f32_vrdivc_minmax_ukernel__sse_u8( input_a += 4; __m128 vacc = _mm_div_ps(vb, va); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -69,8 +57,6 @@ void xnn_f32_vrdivc_minmax_ukernel__sse_u8( const __m128 va = _mm_loadu_ps(input_a); __m128 vacc = 
_mm_div_ps(vb, va); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); vacc = _mm_movehl_ps(vacc, vacc); diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u1.c b/src/f32-vbinary/gen/f32-vrdivc-wasm-u1.c similarity index 72% rename from src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u1.c rename to src/f32-vbinary/gen/f32-vrdivc-wasm-u1.c index 0d589fe4a1a..75b486d8788 100644 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u1.c +++ b/src/f32-vbinary/gen/f32-vrdivc-wasm-u1.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrdivc_minmax_ukernel__scalar_u1( +void xnn_f32_vrdivc_ukernel__wasm_u1( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,15 +27,11 @@ void xnn_f32_vrdivc_minmax_ukernel__scalar_u1( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; float vacc = vb / va; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; } } diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u2.c b/src/f32-vbinary/gen/f32-vrdivc-wasm-u2.c similarity index 68% rename from src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u2.c rename to src/f32-vbinary/gen/f32-vrdivc-wasm-u2.c index 3af2d5fe7a6..ae716f0de17 100644 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vrdivc-wasm-u2.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrdivc_minmax_ukernel__scalar_u2( +void xnn_f32_vrdivc_ukernel__wasm_u2( size_t batch, const 
float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,8 +27,6 @@ void xnn_f32_vrdivc_minmax_ukernel__scalar_u2( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { @@ -40,12 +38,6 @@ void xnn_f32_vrdivc_minmax_ukernel__scalar_u2( float vacc1 = vb / va1; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - output[0] = vacc0; output[1] = vacc1; output += 2; @@ -54,8 +46,6 @@ void xnn_f32_vrdivc_minmax_ukernel__scalar_u2( assert(batch == sizeof(float)); const float va = *input_a; float vacc = vb / va; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output = vacc; } } diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u4.c b/src/f32-vbinary/gen/f32-vrdivc-wasm-u4.c similarity index 65% rename from src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u4.c rename to src/f32-vbinary/gen/f32-vrdivc-wasm-u4.c index c859cce9b9c..0a9ddd6d49e 100644 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vrdivc-wasm-u4.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrdivc_minmax_ukernel__scalar_u4( +void xnn_f32_vrdivc_ukernel__wasm_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,8 +27,6 @@ void xnn_f32_vrdivc_minmax_ukernel__scalar_u4( 
assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { @@ -44,16 +42,6 @@ void xnn_f32_vrdivc_minmax_ukernel__scalar_u4( float vacc3 = vb / va3; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - vacc2 = math_max_f32(vacc2, voutput_min); - vacc3 = math_max_f32(vacc3, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - vacc2 = math_min_f32(vacc2, voutput_max); - vacc3 = math_min_f32(vacc3, voutput_max); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; @@ -64,8 +52,6 @@ void xnn_f32_vrdivc_minmax_ukernel__scalar_u4( do { const float va = *input_a++; float vacc = vb / va; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } while (batch != 0); diff --git a/src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u8.c b/src/f32-vbinary/gen/f32-vrdivc-wasm-u8.c similarity index 60% rename from src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u8.c rename to src/f32-vbinary/gen/f32-vrdivc-wasm-u8.c index 324849b70b8..d9afb785bd2 100644 --- a/src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vrdivc-wasm-u8.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrdivc_minmax_ukernel__scalar_u8( +void xnn_f32_vrdivc_ukernel__wasm_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,8 +27,6 @@ void xnn_f32_vrdivc_minmax_ukernel__scalar_u8( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = 
params->scalar.max; const float vb = *input_b; for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -52,24 +50,6 @@ void xnn_f32_vrdivc_minmax_ukernel__scalar_u8( float vacc7 = vb / va7; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - vacc2 = math_max_f32(vacc2, voutput_min); - vacc3 = math_max_f32(vacc3, voutput_min); - vacc4 = math_max_f32(vacc4, voutput_min); - vacc5 = math_max_f32(vacc5, voutput_min); - vacc6 = math_max_f32(vacc6, voutput_min); - vacc7 = math_max_f32(vacc7, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - vacc2 = math_min_f32(vacc2, voutput_max); - vacc3 = math_min_f32(vacc3, voutput_max); - vacc4 = math_min_f32(vacc4, voutput_max); - vacc5 = math_min_f32(vacc5, voutput_max); - vacc6 = math_min_f32(vacc6, voutput_max); - vacc7 = math_min_f32(vacc7, voutput_max); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; @@ -84,8 +64,6 @@ void xnn_f32_vrdivc_minmax_ukernel__scalar_u8( do { const float va = *input_a++; float vacc = vb / va; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } while (batch != 0); diff --git a/src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u16.c b/src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u16.c index 4f39e626fb2..3d841288248 100644 --- a/src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u16.c +++ b/src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u16.c @@ -43,7 +43,6 @@ void xnn_f32_vrdivc_ukernel__wasmsimd_u16( v128_t vy3 = wasm_f32x4_div(vb, va3); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); @@ -56,7 +55,6 @@ void xnn_f32_vrdivc_ukernel__wasmsimd_u16( v128_t vy = wasm_f32x4_div(vb, va); - wasm_v128_store(output, vy); output += 4; } @@ -65,7 +63,6 @@ void xnn_f32_vrdivc_ukernel__wasmsimd_u16( v128_t vy = wasm_f32x4_div(vb, va); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); 
vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u4.c b/src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u4.c index 3136b78bee0..ab47e6e0c8c 100644 --- a/src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u4.c +++ b/src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u4.c @@ -36,7 +36,6 @@ void xnn_f32_vrdivc_ukernel__wasmsimd_u4( v128_t vy = wasm_f32x4_div(vb, va); - wasm_v128_store(output, vy); output += 4; } @@ -45,7 +44,6 @@ void xnn_f32_vrdivc_ukernel__wasmsimd_u4( v128_t vy = wasm_f32x4_div(vb, va); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u8.c b/src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u8.c index 98d1d83a51a..e1dd0974f60 100644 --- a/src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u8.c +++ b/src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u8.c @@ -39,7 +39,6 @@ void xnn_f32_vrdivc_ukernel__wasmsimd_u8( v128_t vy1 = wasm_f32x4_div(vb, va1); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); output += 8; @@ -50,7 +49,6 @@ void xnn_f32_vrdivc_ukernel__wasmsimd_u8( v128_t vy = wasm_f32x4_div(vb, va); - wasm_v128_store(output, vy); output += 4; } @@ -59,7 +57,6 @@ void xnn_f32_vrdivc_ukernel__wasmsimd_u8( v128_t vy = wasm_f32x4_div(vb, va); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vrpreluc-avx-u16.c b/src/f32-vbinary/gen/f32-vrpreluc-avx-u16.c index e878c2f1880..8ab8581cf23 100644 --- a/src/f32-vbinary/gen/f32-vrpreluc-avx-u16.c +++ b/src/f32-vbinary/gen/f32-vrpreluc-avx-u16.c @@ -43,7 +43,6 @@ void xnn_f32_vrpreluc_ukernel__avx_u16( vacc0 = _mm256_blendv_ps(vb, vacc0, vb); vacc1 = _mm256_blendv_ps(vb, vacc1, vb); - _mm256_storeu_ps(output, vacc0); _mm256_storeu_ps(output + 8, vacc1); output += 16; diff --git a/src/f32-vbinary/gen/f32-vrpreluc-avx512f-u16.c b/src/f32-vbinary/gen/f32-vrpreluc-avx512f-u16.c index 
3df98899970..371bef274ea 100644 --- a/src/f32-vbinary/gen/f32-vrpreluc-avx512f-u16.c +++ b/src/f32-vbinary/gen/f32-vrpreluc-avx512f-u16.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. + #include #include @@ -39,7 +40,6 @@ void xnn_f32_vrpreluc_ukernel__avx512f_u16( __m512 vacc0 = _mm512_mask_mul_ps(vb, vsign, va0, vb); - _mm512_storeu_ps(output, vacc0); output += 16; } diff --git a/src/f32-vbinary/gen/f32-vrpreluc-avx512f-u32.c b/src/f32-vbinary/gen/f32-vrpreluc-avx512f-u32.c index feaf2adb40f..bcd65d0b266 100644 --- a/src/f32-vbinary/gen/f32-vrpreluc-avx512f-u32.c +++ b/src/f32-vbinary/gen/f32-vrpreluc-avx512f-u32.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. + #include #include @@ -41,7 +42,6 @@ void xnn_f32_vrpreluc_ukernel__avx512f_u32( __m512 vacc0 = _mm512_mask_mul_ps(vb, vsign, va0, vb); __m512 vacc1 = _mm512_mask_mul_ps(vb, vsign, va1, vb); - _mm512_storeu_ps(output, vacc0); _mm512_storeu_ps(output + 16, vacc1); output += 32; @@ -52,7 +52,6 @@ void xnn_f32_vrpreluc_ukernel__avx512f_u32( __m512 vacc = _mm512_mask_mul_ps(vb, vsign, va, vb); - _mm512_storeu_ps(output, vacc); output += 16; } diff --git a/src/f32-vbinary/gen/f32-vrpreluc-neon-u4.c b/src/f32-vbinary/gen/f32-vrpreluc-neon-u4.c index 785004c0465..4dcde472d27 100644 --- a/src/f32-vbinary/gen/f32-vrpreluc-neon-u4.c +++ b/src/f32-vbinary/gen/f32-vrpreluc-neon-u4.c @@ -37,7 +37,6 @@ void xnn_f32_vrpreluc_ukernel__neon_u4( float32x4_t vacc = vmulq_f32(va, vb); vacc = vbslq_f32(vm, vacc, vb); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -46,7 +45,6 @@ void xnn_f32_vrpreluc_ukernel__neon_u4( float32x4_t vacc = vmulq_f32(va, vb); vacc = vbslq_f32(vm, vacc, vb); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 
2; diff --git a/src/f32-vbinary/gen/f32-vrpreluc-neon-u8.c b/src/f32-vbinary/gen/f32-vrpreluc-neon-u8.c index 76b32f10782..87131592c99 100644 --- a/src/f32-vbinary/gen/f32-vrpreluc-neon-u8.c +++ b/src/f32-vbinary/gen/f32-vrpreluc-neon-u8.c @@ -41,7 +41,6 @@ void xnn_f32_vrpreluc_ukernel__neon_u8( vacc0 = vbslq_f32(vm, vacc0, vb); vacc1 = vbslq_f32(vm, vacc1, vb); - vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } @@ -51,7 +50,6 @@ void xnn_f32_vrpreluc_ukernel__neon_u8( float32x4_t vacc = vmulq_f32(va, vb); vacc = vbslq_f32(vm, vacc, vb); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -60,7 +58,6 @@ void xnn_f32_vrpreluc_ukernel__neon_u8( float32x4_t vacc = vmulq_f32(va, vb); vacc = vbslq_f32(vm, vacc, vb); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vrpreluc-scalar-u2.c b/src/f32-vbinary/gen/f32-vrpreluc-scalar-u2.c index 92c894a483e..f9e640977f1 100644 --- a/src/f32-vbinary/gen/f32-vrpreluc-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vrpreluc-scalar-u2.c @@ -40,7 +40,6 @@ void xnn_f32_vrpreluc_ukernel__scalar_u2( vacc0 = vb < 0.0f ? vacc0 : vb; vacc1 = vb < 0.0f ? vacc1 : vb; - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vrpreluc-scalar-u4.c b/src/f32-vbinary/gen/f32-vrpreluc-scalar-u4.c index ad998c96926..b341c5a6589 100644 --- a/src/f32-vbinary/gen/f32-vrpreluc-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vrpreluc-scalar-u4.c @@ -46,7 +46,6 @@ void xnn_f32_vrpreluc_ukernel__scalar_u4( vacc2 = vb < 0.0f ? vacc2 : vb; vacc3 = vb < 0.0f ? 
vacc3 : vb; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vrpreluc-scalar-u8.c b/src/f32-vbinary/gen/f32-vrpreluc-scalar-u8.c index 7046e09bc0e..11a69249b2e 100644 --- a/src/f32-vbinary/gen/f32-vrpreluc-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vrpreluc-scalar-u8.c @@ -58,7 +58,6 @@ void xnn_f32_vrpreluc_ukernel__scalar_u8( vacc6 = vb < 0.0f ? vacc6 : vb; vacc7 = vb < 0.0f ? vacc7 : vb; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vrpreluc-sse2-u8.c b/src/f32-vbinary/gen/f32-vrpreluc-sse2-u8.c index 3d8328533d5..3a7dc94f3d1 100644 --- a/src/f32-vbinary/gen/f32-vrpreluc-sse2-u8.c +++ b/src/f32-vbinary/gen/f32-vrpreluc-sse2-u8.c @@ -43,7 +43,6 @@ void xnn_f32_vrpreluc_ukernel__sse2_u8( vacc0 = _mm_or_ps(_mm_and_ps(vacc0, vmask), _mm_andnot_ps(vmask, vb)); vacc1 = _mm_or_ps(_mm_and_ps(vacc1, vmask), _mm_andnot_ps(vmask, vb)); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; diff --git a/src/f32-vbinary/gen/f32-vrpreluc-sse41-u8.c b/src/f32-vbinary/gen/f32-vrpreluc-sse41-u8.c index 38f0926e876..08d4e39c135 100644 --- a/src/f32-vbinary/gen/f32-vrpreluc-sse41-u8.c +++ b/src/f32-vbinary/gen/f32-vrpreluc-sse41-u8.c @@ -42,7 +42,6 @@ void xnn_f32_vrpreluc_ukernel__sse41_u8( vacc0 = _mm_blendv_ps(vb, vacc0, vb); vacc1 = _mm_blendv_ps(vb, vacc1, vb); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; diff --git a/src/f32-vbinary/gen/f32-vrpreluc-wasm-u2.c b/src/f32-vbinary/gen/f32-vrpreluc-wasm-u2.c index ea274eefea1..14d478929ef 100644 --- a/src/f32-vbinary/gen/f32-vrpreluc-wasm-u2.c +++ b/src/f32-vbinary/gen/f32-vrpreluc-wasm-u2.c @@ -40,7 +40,6 @@ void xnn_f32_vrpreluc_ukernel__wasm_u2( vacc0 = vb < 0.0f ? vacc0 : vb; vacc1 = vb < 0.0f ? 
vacc1 : vb; - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vrpreluc-wasm-u4.c b/src/f32-vbinary/gen/f32-vrpreluc-wasm-u4.c index 0a2d946d7a2..dff91582870 100644 --- a/src/f32-vbinary/gen/f32-vrpreluc-wasm-u4.c +++ b/src/f32-vbinary/gen/f32-vrpreluc-wasm-u4.c @@ -46,7 +46,6 @@ void xnn_f32_vrpreluc_ukernel__wasm_u4( vacc2 = vb < 0.0f ? vacc2 : vb; vacc3 = vb < 0.0f ? vacc3 : vb; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vrpreluc-wasm-u8.c b/src/f32-vbinary/gen/f32-vrpreluc-wasm-u8.c index 0b7a727a570..0f079bad2b0 100644 --- a/src/f32-vbinary/gen/f32-vrpreluc-wasm-u8.c +++ b/src/f32-vbinary/gen/f32-vrpreluc-wasm-u8.c @@ -58,7 +58,6 @@ void xnn_f32_vrpreluc_ukernel__wasm_u8( vacc6 = vb < 0.0f ? vacc6 : vb; vacc7 = vb < 0.0f ? vacc7 : vb; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vrpreluc-wasmrelaxedsimd-u16.c b/src/f32-vbinary/gen/f32-vrpreluc-wasmrelaxedsimd-u16.c index 3e9d79a6082..b955c9c7555 100644 --- a/src/f32-vbinary/gen/f32-vrpreluc-wasmrelaxedsimd-u16.c +++ b/src/f32-vbinary/gen/f32-vrpreluc-wasmrelaxedsimd-u16.c @@ -48,7 +48,6 @@ void xnn_f32_vrpreluc_ukernel__wasmrelaxedsimd_u16( vy2 = wasm_i32x4_relaxed_laneselect(vy2, vb, vmask); vy3 = wasm_i32x4_relaxed_laneselect(vy3, vb, vmask); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); @@ -62,7 +61,6 @@ void xnn_f32_vrpreluc_ukernel__wasmrelaxedsimd_u16( v128_t vy = wasm_f32x4_mul(va, vb); vy = wasm_i32x4_relaxed_laneselect(vy, vb, vmask); - wasm_v128_store(output, vy); output += 4; } @@ -72,7 +70,6 @@ void xnn_f32_vrpreluc_ukernel__wasmrelaxedsimd_u16( v128_t vy = wasm_f32x4_mul(va, vb); vy = wasm_i32x4_relaxed_laneselect(vy, vb, vmask); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git 
a/src/f32-vbinary/gen/f32-vrpreluc-wasmrelaxedsimd-u4.c b/src/f32-vbinary/gen/f32-vrpreluc-wasmrelaxedsimd-u4.c index 925ca6a6a06..bfe44342306 100644 --- a/src/f32-vbinary/gen/f32-vrpreluc-wasmrelaxedsimd-u4.c +++ b/src/f32-vbinary/gen/f32-vrpreluc-wasmrelaxedsimd-u4.c @@ -38,7 +38,6 @@ void xnn_f32_vrpreluc_ukernel__wasmrelaxedsimd_u4( v128_t vy = wasm_f32x4_mul(va, vb); vy = wasm_i32x4_relaxed_laneselect(vy, vb, vmask); - wasm_v128_store(output, vy); output += 4; } @@ -48,7 +47,6 @@ void xnn_f32_vrpreluc_ukernel__wasmrelaxedsimd_u4( v128_t vy = wasm_f32x4_mul(va, vb); vy = wasm_i32x4_relaxed_laneselect(vy, vb, vmask); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vrpreluc-wasmrelaxedsimd-u8.c b/src/f32-vbinary/gen/f32-vrpreluc-wasmrelaxedsimd-u8.c index 2551d50a8d3..5807e3e2d07 100644 --- a/src/f32-vbinary/gen/f32-vrpreluc-wasmrelaxedsimd-u8.c +++ b/src/f32-vbinary/gen/f32-vrpreluc-wasmrelaxedsimd-u8.c @@ -42,7 +42,6 @@ void xnn_f32_vrpreluc_ukernel__wasmrelaxedsimd_u8( vy0 = wasm_i32x4_relaxed_laneselect(vy0, vb, vmask); vy1 = wasm_i32x4_relaxed_laneselect(vy1, vb, vmask); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); output += 8; @@ -54,7 +53,6 @@ void xnn_f32_vrpreluc_ukernel__wasmrelaxedsimd_u8( v128_t vy = wasm_f32x4_mul(va, vb); vy = wasm_i32x4_relaxed_laneselect(vy, vb, vmask); - wasm_v128_store(output, vy); output += 4; } @@ -64,7 +62,6 @@ void xnn_f32_vrpreluc_ukernel__wasmrelaxedsimd_u8( v128_t vy = wasm_f32x4_mul(va, vb); vy = wasm_i32x4_relaxed_laneselect(vy, vb, vmask); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u16.c b/src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u16.c index 54105b1fbc7..848d939b28e 100644 --- a/src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u16.c +++ 
b/src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u16.c @@ -48,7 +48,6 @@ void xnn_f32_vrpreluc_ukernel__wasmsimd_u16( vy2 = wasm_v128_bitselect(vy2, vb, vmask); vy3 = wasm_v128_bitselect(vy3, vb, vmask); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); @@ -62,7 +61,6 @@ void xnn_f32_vrpreluc_ukernel__wasmsimd_u16( v128_t vy = wasm_f32x4_mul(va, vb); vy = wasm_v128_bitselect(vy, vb, vmask); - wasm_v128_store(output, vy); output += 4; } @@ -72,7 +70,6 @@ void xnn_f32_vrpreluc_ukernel__wasmsimd_u16( v128_t vy = wasm_f32x4_mul(va, vb); vy = wasm_v128_bitselect(vy, vb, vmask); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u4.c b/src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u4.c index 307e111592c..2d3c3d99e20 100644 --- a/src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u4.c +++ b/src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u4.c @@ -38,7 +38,6 @@ void xnn_f32_vrpreluc_ukernel__wasmsimd_u4( v128_t vy = wasm_f32x4_mul(va, vb); vy = wasm_v128_bitselect(vy, vb, vmask); - wasm_v128_store(output, vy); output += 4; } @@ -48,7 +47,6 @@ void xnn_f32_vrpreluc_ukernel__wasmsimd_u4( v128_t vy = wasm_f32x4_mul(va, vb); vy = wasm_v128_bitselect(vy, vb, vmask); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u8.c b/src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u8.c index 7211a4373a2..348af18e994 100644 --- a/src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u8.c +++ b/src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u8.c @@ -42,7 +42,6 @@ void xnn_f32_vrpreluc_ukernel__wasmsimd_u8( vy0 = wasm_v128_bitselect(vy0, vb, vmask); vy1 = wasm_v128_bitselect(vy1, vb, vmask); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); output += 8; @@ -54,7 +53,6 @@ void xnn_f32_vrpreluc_ukernel__wasmsimd_u8( v128_t vy = 
wasm_f32x4_mul(va, vb); vy = wasm_v128_bitselect(vy, vb, vmask); - wasm_v128_store(output, vy); output += 4; } @@ -64,7 +62,6 @@ void xnn_f32_vrpreluc_ukernel__wasmsimd_u8( v128_t vy = wasm_f32x4_mul(va, vb); vy = wasm_v128_bitselect(vy, vb, vmask); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-avx-u8.c b/src/f32-vbinary/gen/f32-vrsubc-avx-u16.c similarity index 77% rename from src/f32-vbinary/gen/f32-vrsubc-minmax-avx-u8.c rename to src/f32-vbinary/gen/f32-vrsubc-avx-u16.c index 16106fb3b97..d5790d59446 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-avx-u8.c +++ b/src/f32-vbinary/gen/f32-vrsubc-avx-u16.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrsubc_minmax_ukernel__avx_u8( +void xnn_f32_vrsubc_ukernel__avx_u16( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -30,19 +30,26 @@ void xnn_f32_vrsubc_minmax_ukernel__avx_u8( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const __m256 vb = _mm256_broadcast_ss(input_b); + for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { + const __m256 va0 = _mm256_loadu_ps(input_a); + const __m256 va1 = _mm256_loadu_ps(input_a + 8); + input_a += 16; + + __m256 vacc0 = _mm256_sub_ps(vb, va0); + __m256 vacc1 = _mm256_sub_ps(vb, va1); + + + _mm256_storeu_ps(output, vacc0); + _mm256_storeu_ps(output + 8, vacc1); + output += 16; + } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const 
__m256 va = _mm256_loadu_ps(input_a); input_a += 8; __m256 vacc = _mm256_sub_ps(vb, va); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); _mm256_storeu_ps(output, vacc); output += 8; } @@ -54,8 +61,6 @@ void xnn_f32_vrsubc_minmax_ukernel__avx_u8( __m256 va = _mm256_maskload_ps(input_a, vmask); __m256 vacc = _mm256_sub_ps(vb, va); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); __m128 vacc_lo = _mm256_castps256_ps128(vacc); if (batch & (4 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/f32-vrsubc-avx-u8.c b/src/f32-vbinary/gen/f32-vrsubc-avx-u8.c new file mode 100644 index 00000000000..80eea30b11b --- /dev/null +++ b/src/f32-vbinary/gen/f32-vrsubc-avx-u8.c @@ -0,0 +1,67 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-vbinary/vopc-avx.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/vbinary.h" + + +void xnn_f32_vrsubc_ukernel__avx_u8( + size_t batch, + const float* input_a, + const float* input_b, + float* output, + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + + const __m256 vb = _mm256_broadcast_ss(input_b); + + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + const __m256 va = _mm256_loadu_ps(input_a); + input_a += 8; + + __m256 vacc = _mm256_sub_ps(vb, va); + _mm256_storeu_ps(output, vacc); + output += 8; + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 7 * sizeof(float)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); + + __m256 va = _mm256_maskload_ps(input_a, vmask); + + __m256 vacc = _mm256_sub_ps(vb, va); + + __m128 vacc_lo = _mm256_castps256_ps128(vacc); + if (batch & (4 * sizeof(float))) { + _mm_storeu_ps(output, vacc_lo); + vacc_lo = _mm256_extractf128_ps(vacc, 1); + output += 4; + } + if (batch & (2 * sizeof(float))) { + _mm_storel_pi((__m64*) output, vacc_lo); + vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo); + output += 2; + } + if (batch & (1 * sizeof(float))) { + _mm_store_ss(output, vacc_lo); + } + } +} diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-avx512f-u16.c b/src/f32-vbinary/gen/f32-vrsubc-avx512f-u16.c similarity index 75% rename from src/f32-vbinary/gen/f32-vrsubc-minmax-avx512f-u16.c rename to src/f32-vbinary/gen/f32-vrsubc-avx512f-u16.c index 09baffd6c6c..832cc8e21a3 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-avx512f-u16.c +++ b/src/f32-vbinary/gen/f32-vrsubc-avx512f-u16.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // 
LICENSE file in the root directory of this source tree. + #include #include @@ -16,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrsubc_minmax_ukernel__avx512f_u16( +void xnn_f32_vrsubc_ukernel__avx512f_u16( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,8 +30,6 @@ void xnn_f32_vrsubc_minmax_ukernel__avx512f_u16( assert(input_b != NULL); assert(output != NULL); - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); const __m512 vb = _mm512_set1_ps(*input_b); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { @@ -40,10 +39,6 @@ void xnn_f32_vrsubc_minmax_ukernel__avx512f_u16( __m512 vacc0 = _mm512_sub_ps(vb, va0); - vacc0 = _mm512_max_ps(voutput_min, vacc0); - - vacc0 = _mm512_min_ps(voutput_max, vacc0); - _mm512_storeu_ps(output, vacc0); output += 16; } @@ -57,8 +52,6 @@ void xnn_f32_vrsubc_minmax_ukernel__avx512f_u16( __m512 va = _mm512_maskz_loadu_ps(vmask, input_a); __m512 vacc = _mm512_sub_ps(vb, va); - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-avx512f-u32.c b/src/f32-vbinary/gen/f32-vrsubc-avx512f-u32.c similarity index 73% rename from src/f32-vbinary/gen/f32-vrsubc-minmax-avx512f-u32.c rename to src/f32-vbinary/gen/f32-vrsubc-avx512f-u32.c index 7f2bf4a0be9..ea08265a196 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-avx512f-u32.c +++ b/src/f32-vbinary/gen/f32-vrsubc-avx512f-u32.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
+ #include #include @@ -16,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrsubc_minmax_ukernel__avx512f_u32( +void xnn_f32_vrsubc_ukernel__avx512f_u32( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,8 +30,6 @@ void xnn_f32_vrsubc_minmax_ukernel__avx512f_u32( assert(input_b != NULL); assert(output != NULL); - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); const __m512 vb = _mm512_set1_ps(*input_b); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { @@ -42,12 +41,6 @@ void xnn_f32_vrsubc_minmax_ukernel__avx512f_u32( __m512 vacc1 = _mm512_sub_ps(vb, va1); - vacc0 = _mm512_max_ps(voutput_min, vacc0); - vacc1 = _mm512_max_ps(voutput_min, vacc1); - - vacc0 = _mm512_min_ps(voutput_max, vacc0); - vacc1 = _mm512_min_ps(voutput_max, vacc1); - _mm512_storeu_ps(output, vacc0); _mm512_storeu_ps(output + 16, vacc1); output += 32; @@ -58,9 +51,6 @@ void xnn_f32_vrsubc_minmax_ukernel__avx512f_u32( __m512 vacc = _mm512_sub_ps(vb, va); - vacc = _mm512_max_ps(voutput_min, vacc); - vacc = _mm512_min_ps(voutput_max, vacc); - _mm512_storeu_ps(output, vacc); output += 16; } @@ -74,8 +64,6 @@ void xnn_f32_vrsubc_minmax_ukernel__avx512f_u32( __m512 va = _mm512_maskz_loadu_ps(vmask, input_a); __m512 vacc = _mm512_sub_ps(vb, va); - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-hvx-u128.c b/src/f32-vbinary/gen/f32-vrsubc-hvx-u128.c similarity index 65% rename from src/f32-vbinary/gen/f32-vrsubc-minmax-hvx-u128.c rename to src/f32-vbinary/gen/f32-vrsubc-hvx-u128.c index 
b7a1c787721..401d1fa8b26 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-hvx-u128.c +++ b/src/f32-vbinary/gen/f32-vrsubc-hvx-u128.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vrsubc_minmax_ukernel__hvx_u128( +void xnn_f32_vrsubc_ukernel__hvx_u128( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,8 +23,6 @@ void xnn_f32_vrsubc_minmax_ukernel__hvx_u128( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); HVX_Vector vb = xnn_set1_f32(*input_b); for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) { @@ -40,16 +38,6 @@ void xnn_f32_vrsubc_minmax_ukernel__hvx_u128( HVX_Vector vacc3 = xnn_sub_f32(vb, va3); - vacc0 = xnn_max_f32(vacc0, voutput_min); - vacc1 = xnn_max_f32(vacc1, voutput_min); - vacc2 = xnn_max_f32(vacc2, voutput_min); - vacc3 = xnn_max_f32(vacc3, voutput_min); - - vacc0 = xnn_min_f32(vacc0, voutput_max); - vacc1 = xnn_min_f32(vacc1, voutput_max); - vacc2 = xnn_min_f32(vacc2, voutput_max); - vacc3 = xnn_min_f32(vacc3, voutput_max); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); xnn_storeu_f32(output + 64, vacc2); @@ -61,8 +49,6 @@ void xnn_f32_vrsubc_minmax_ukernel__hvx_u128( input_a += 32; HVX_Vector vacc = xnn_sub_f32(vb, va); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output+= 32; @@ -71,8 +57,6 @@ void xnn_f32_vrsubc_minmax_ukernel__hvx_u128( HVX_Vector va = xnn_loadu_f32(input_a); HVX_Vector vacc = xnn_sub_f32(vb, va); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); 
Q6_V_vstu_variable(output, batch, vacc); } diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-hvx-u32.c b/src/f32-vbinary/gen/f32-vrsubc-hvx-u32.c similarity index 67% rename from src/f32-vbinary/gen/f32-vrsubc-minmax-hvx-u32.c rename to src/f32-vbinary/gen/f32-vrsubc-hvx-u32.c index 6d409ba99e1..91c5c81c801 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-hvx-u32.c +++ b/src/f32-vbinary/gen/f32-vrsubc-hvx-u32.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vrsubc_minmax_ukernel__hvx_u32( +void xnn_f32_vrsubc_ukernel__hvx_u32( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,8 +23,6 @@ void xnn_f32_vrsubc_minmax_ukernel__hvx_u32( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); HVX_Vector vb = xnn_set1_f32(*input_b); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { @@ -32,8 +30,6 @@ void xnn_f32_vrsubc_minmax_ukernel__hvx_u32( input_a += 32; HVX_Vector vacc = xnn_sub_f32(vb, va); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output+= 32; @@ -42,8 +38,6 @@ void xnn_f32_vrsubc_minmax_ukernel__hvx_u32( HVX_Vector va = xnn_loadu_f32(input_a); HVX_Vector vacc = xnn_sub_f32(vb, va); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); Q6_V_vstu_variable(output, batch, vacc); } diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-hvx-u64.c b/src/f32-vbinary/gen/f32-vrsubc-hvx-u64.c similarity index 67% rename from src/f32-vbinary/gen/f32-vrsubc-minmax-hvx-u64.c rename to src/f32-vbinary/gen/f32-vrsubc-hvx-u64.c index 
325514d27e3..25a3896fa01 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-hvx-u64.c +++ b/src/f32-vbinary/gen/f32-vrsubc-hvx-u64.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vrsubc_minmax_ukernel__hvx_u64( +void xnn_f32_vrsubc_ukernel__hvx_u64( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,8 +23,6 @@ void xnn_f32_vrsubc_minmax_ukernel__hvx_u64( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); HVX_Vector vb = xnn_set1_f32(*input_b); for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { @@ -36,12 +34,6 @@ void xnn_f32_vrsubc_minmax_ukernel__hvx_u64( HVX_Vector vacc1 = xnn_sub_f32(vb, va1); - vacc0 = xnn_max_f32(vacc0, voutput_min); - vacc1 = xnn_max_f32(vacc1, voutput_min); - - vacc0 = xnn_min_f32(vacc0, voutput_max); - vacc1 = xnn_min_f32(vacc1, voutput_max); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); output += 64; @@ -51,8 +43,6 @@ void xnn_f32_vrsubc_minmax_ukernel__hvx_u64( input_a += 32; HVX_Vector vacc = xnn_sub_f32(vb, va); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output+= 32; @@ -61,8 +51,6 @@ void xnn_f32_vrsubc_minmax_ukernel__hvx_u64( HVX_Vector va = xnn_loadu_f32(input_a); HVX_Vector vacc = xnn_sub_f32(vb, va); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); Q6_V_vstu_variable(output, batch, vacc); } diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-avx-u16.c b/src/f32-vbinary/gen/f32-vrsubc-minmax-avx-u16.c deleted file mode 100644 index 64848ea10f8..00000000000 --- 
a/src/f32-vbinary/gen/f32-vrsubc-minmax-avx-u16.c +++ /dev/null @@ -1,94 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-avx.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrsubc_minmax_ukernel__avx_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const __m256 vb = _mm256_broadcast_ss(input_b); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const __m256 va0 = _mm256_loadu_ps(input_a); - const __m256 va1 = _mm256_loadu_ps(input_a + 8); - input_a += 16; - - __m256 vacc0 = _mm256_sub_ps(vb, va0); - __m256 vacc1 = _mm256_sub_ps(vb, va1); - - - vacc0 = _mm256_max_ps(voutput_min, vacc0); - vacc1 = _mm256_max_ps(voutput_min, vacc1); - - vacc0 = _mm256_min_ps(voutput_max, vacc0); - vacc1 = _mm256_min_ps(voutput_max, vacc1); - - _mm256_storeu_ps(output, vacc0); - _mm256_storeu_ps(output + 8, vacc1); - output += 16; - } - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const __m256 va = _mm256_loadu_ps(input_a); - input_a += 8; - - __m256 vacc = _mm256_sub_ps(vb, va); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); - _mm256_storeu_ps(output, vacc); - output += 
8; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); - - __m256 va = _mm256_maskload_ps(input_a, vmask); - - __m256 vacc = _mm256_sub_ps(vb, va); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); - - __m128 vacc_lo = _mm256_castps256_ps128(vacc); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vacc_lo); - vacc_lo = _mm256_extractf128_ps(vacc, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vacc_lo); - vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vacc_lo); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u1.c b/src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u1.c deleted file mode 100644 index 73e9490fe56..00000000000 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u1.c +++ /dev/null @@ -1,41 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrsubc_minmax_ukernel__wasm_u1( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= sizeof(float); batch -= sizeof(float)) { - const float va = *input_a++; - float vacc = vb - va; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - } -} diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u2.c b/src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u2.c deleted file mode 100644 index 2c5b97a0771..00000000000 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u2.c +++ /dev/null @@ -1,61 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrsubc_minmax_ukernel__wasm_u2( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - input_a += 2; - - float vacc0 = vb - va0; - float vacc1 = vb - va1; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output += 2; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch == sizeof(float)); - const float va = *input_a; - float vacc = vb - va; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output = vacc; - } -} diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u4.c b/src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u4.c deleted file mode 100644 index 4988bb148a4..00000000000 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u4.c +++ /dev/null @@ -1,73 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrsubc_minmax_ukernel__wasm_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - const float va2 = input_a[2]; - const float va3 = input_a[3]; - input_a += 4; - - float vacc0 = vb - va0; - float vacc1 = vb - va1; - float vacc2 = vb - va2; - float vacc3 = vb - va3; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - vacc2 = __builtin_wasm_max_f32(vacc2, voutput_min); - vacc3 = __builtin_wasm_max_f32(vacc3, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - vacc2 = __builtin_wasm_min_f32(vacc2, voutput_max); - vacc3 = __builtin_wasm_min_f32(vacc3, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - do { - const float va = *input_a++; - float vacc = vb - va; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - batch -= sizeof(float); - } while (batch != 0); - } -} diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u8.c b/src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u8.c deleted file mode 100644 index 9ae1209cf32..00000000000 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-u8.c +++ /dev/null @@ -1,93 +0,0 @@ -// Auto-generated file. 
Do not edit! -// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrsubc_minmax_ukernel__wasm_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - const float va2 = input_a[2]; - const float va3 = input_a[3]; - const float va4 = input_a[4]; - const float va5 = input_a[5]; - const float va6 = input_a[6]; - const float va7 = input_a[7]; - input_a += 8; - - float vacc0 = vb - va0; - float vacc1 = vb - va1; - float vacc2 = vb - va2; - float vacc3 = vb - va3; - float vacc4 = vb - va4; - float vacc5 = vb - va5; - float vacc6 = vb - va6; - float vacc7 = vb - va7; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - vacc2 = __builtin_wasm_max_f32(vacc2, voutput_min); - vacc3 = __builtin_wasm_max_f32(vacc3, voutput_min); - vacc4 = __builtin_wasm_max_f32(vacc4, voutput_min); - vacc5 = __builtin_wasm_max_f32(vacc5, voutput_min); - vacc6 = __builtin_wasm_max_f32(vacc6, voutput_min); - vacc7 = __builtin_wasm_max_f32(vacc7, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - vacc2 = __builtin_wasm_min_f32(vacc2, voutput_max); - vacc3 = 
__builtin_wasm_min_f32(vacc3, voutput_max); - vacc4 = __builtin_wasm_min_f32(vacc4, voutput_max); - vacc5 = __builtin_wasm_min_f32(vacc5, voutput_max); - vacc6 = __builtin_wasm_min_f32(vacc6, voutput_max); - vacc7 = __builtin_wasm_min_f32(vacc7, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output[4] = vacc4; - output[5] = vacc5; - output[6] = vacc6; - output[7] = vacc7; - output += 8; - } - if XNN_UNLIKELY(batch != 0) { - do { - const float va = *input_a++; - float vacc = vb - va; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - batch -= sizeof(float); - } while (batch != 0); - } -} diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u16.c b/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u16.c deleted file mode 100644 index 4d2ff7cab19..00000000000 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u16.c +++ /dev/null @@ -1,95 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - const v128_t va2 = wasm_v128_load(input_a + 8); - const v128_t va3 = wasm_v128_load(input_a + 12); - input_a += 16; - - v128_t vy0 = wasm_f32x4_sub(vb, va0); - v128_t vy1 = wasm_f32x4_sub(vb, va1); - v128_t vy2 = wasm_f32x4_sub(vb, va2); - v128_t vy3 = wasm_f32x4_sub(vb, va3); - - - vy0 = wasm_f32x4_max(vy0, voutput_min); - vy1 = wasm_f32x4_max(vy1, voutput_min); - vy2 = wasm_f32x4_max(vy2, voutput_min); - vy3 = wasm_f32x4_max(vy3, voutput_min); - - vy0 = wasm_f32x4_min(vy0, voutput_max); - vy1 = wasm_f32x4_min(vy1, voutput_max); - vy2 = wasm_f32x4_min(vy2, voutput_max); - vy3 = wasm_f32x4_min(vy3, voutput_max); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - wasm_v128_store(output + 8, vy2); - wasm_v128_store(output + 12, vy3); - output += 16; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_sub(vb, va); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - wasm_v128_store(output, vy); - output += 4; - } - if 
XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_sub(vb, va); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u4.c b/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u4.c deleted file mode 100644 index 7224780fb3f..00000000000 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u4.c +++ /dev/null @@ -1,66 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_sub(vb, va); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - wasm_v128_store(output, vy); - 
output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_sub(vb, va); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u8.c b/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u8.c deleted file mode 100644 index 27a007318fa..00000000000 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-u8.c +++ /dev/null @@ -1,85 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - input_a += 8; - - v128_t vy0 = wasm_f32x4_sub(vb, va0); - v128_t vy1 = wasm_f32x4_sub(vb, va1); - - - vy0 = wasm_f32x4_max(vy0, voutput_min); - vy1 = wasm_f32x4_max(vy1, voutput_min); - - vy0 = wasm_f32x4_min(vy0, voutput_max); - vy1 = wasm_f32x4_min(vy1, voutput_max); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - output += 8; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_sub(vb, va); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_sub(vb, va); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git 
a/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u16.c b/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u16.c deleted file mode 100644 index 1076054b0bb..00000000000 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u16.c +++ /dev/null @@ -1,95 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - const v128_t va2 = wasm_v128_load(input_a + 8); - const v128_t va3 = wasm_v128_load(input_a + 12); - input_a += 16; - - v128_t vy0 = wasm_f32x4_sub(vb, va0); - v128_t vy1 = wasm_f32x4_sub(vb, va1); - v128_t vy2 = wasm_f32x4_sub(vb, va2); - v128_t vy3 = wasm_f32x4_sub(vb, va3); - - - vy0 = wasm_f32x4_pmax(voutput_min, vy0); - vy1 = wasm_f32x4_pmax(voutput_min, vy1); - vy2 = wasm_f32x4_pmax(voutput_min, vy2); - vy3 = wasm_f32x4_pmax(voutput_min, vy3); - - vy0 = wasm_f32x4_pmin(voutput_max, vy0); - vy1 = wasm_f32x4_pmin(voutput_max, vy1); - vy2 = 
wasm_f32x4_pmin(voutput_max, vy2); - vy3 = wasm_f32x4_pmin(voutput_max, vy3); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - wasm_v128_store(output + 8, vy2); - wasm_v128_store(output + 12, vy3); - output += 16; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_sub(vb, va); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_sub(vb, va); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u4.c b/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u4.c deleted file mode 100644 index 58df75f8914..00000000000 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u4.c +++ /dev/null @@ -1,66 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_sub(vb, va); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_sub(vb, va); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u8.c b/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u8.c deleted file mode 100644 index 6b2df590c48..00000000000 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-u8.c +++ /dev/null @@ -1,85 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - input_a += 8; - - v128_t vy0 = wasm_f32x4_sub(vb, va0); - v128_t vy1 = wasm_f32x4_sub(vb, va1); - - - vy0 = wasm_f32x4_pmax(voutput_min, vy0); - vy1 = wasm_f32x4_pmax(voutput_min, vy1); - - vy0 = wasm_f32x4_pmin(voutput_max, vy0); - vy1 = wasm_f32x4_pmin(voutput_max, vy1); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - output += 8; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_sub(vb, va); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_sub(vb, va); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = 
wasm_f32x4_pmin(voutput_max, vy); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-neon-u4.c b/src/f32-vbinary/gen/f32-vrsubc-neon-u4.c similarity index 74% rename from src/f32-vbinary/gen/f32-vrsubc-minmax-neon-u4.c rename to src/f32-vbinary/gen/f32-vrsubc-neon-u4.c index 2171b3d751c..06629b76a3e 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-neon-u4.c +++ b/src/f32-vbinary/gen/f32-vrsubc-neon-u4.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrsubc_minmax_ukernel__neon_u4( +void xnn_f32_vrsubc_ukernel__neon_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vrsubc_minmax_ukernel__neon_u4( assert(input_b != NULL); assert(output != NULL); - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { @@ -37,9 +35,6 @@ void xnn_f32_vrsubc_minmax_ukernel__neon_u4( float32x4_t vacc = vsubq_f32(vb, va); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -47,9 +42,6 @@ void xnn_f32_vrsubc_minmax_ukernel__neon_u4( float32x4_t vacc = vsubq_f32(vb, va); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git 
a/src/f32-vbinary/gen/f32-vrsubc-minmax-neon-u8.c b/src/f32-vbinary/gen/f32-vrsubc-neon-u8.c similarity index 72% rename from src/f32-vbinary/gen/f32-vrsubc-minmax-neon-u8.c rename to src/f32-vbinary/gen/f32-vrsubc-neon-u8.c index d31323b1124..953ea3ab546 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-neon-u8.c +++ b/src/f32-vbinary/gen/f32-vrsubc-neon-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrsubc_minmax_ukernel__neon_u8( +void xnn_f32_vrsubc_ukernel__neon_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vrsubc_minmax_ukernel__neon_u8( assert(input_b != NULL); assert(output != NULL); - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -40,12 +38,6 @@ void xnn_f32_vrsubc_minmax_ukernel__neon_u8( float32x4_t vacc1 = vsubq_f32(vb, va1); - vacc0 = vmaxq_f32(vacc0, voutput_min); - vacc1 = vmaxq_f32(vacc1, voutput_min); - - vacc0 = vminq_f32(vacc0, voutput_max); - vacc1 = vminq_f32(vacc1, voutput_max); - vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } @@ -54,9 +46,6 @@ void xnn_f32_vrsubc_minmax_ukernel__neon_u8( float32x4_t vacc = vsubq_f32(vb, va); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -64,9 +53,6 @@ void xnn_f32_vrsubc_minmax_ukernel__neon_u8( float32x4_t vacc = vsubq_f32(vb, va); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch 
& (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-rvv-u4v.c b/src/f32-vbinary/gen/f32-vrsubc-rvv-u4v.c similarity index 74% rename from src/f32-vbinary/gen/f32-vrsubc-minmax-rvv-u4v.c rename to src/f32-vbinary/gen/f32-vrsubc-rvv-u4v.c index 310262cd31d..3f3a8b084d1 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-rvv-u4v.c +++ b/src/f32-vbinary/gen/f32-vrsubc-rvv-u4v.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrsubc_minmax_ukernel__rvv_u4v( +void xnn_f32_vrsubc_ukernel__rvv_u4v( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vrsubc_minmax_ukernel__rvv_u4v( assert(input_b != NULL); assert(output != NULL); - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; const float b = *input_b; size_t n = batch >> 2; @@ -39,8 +37,6 @@ void xnn_f32_vrsubc_minmax_ukernel__rvv_u4v( vfloat32m4_t va = __riscv_vle32_v_f32m4(input_a, vl); input_a += vl; vfloat32m4_t vacc = __riscv_vfrsub_vf_f32m4(va, b, vl); - vacc = __riscv_vfmax_vf_f32m4(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m4(vacc, output_max, vl); __riscv_vse32_v_f32m4(output, vacc, vl); output += vl; } while (n > 0); diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-rvv-u8v.c b/src/f32-vbinary/gen/f32-vrsubc-rvv-u8v.c similarity index 74% rename from src/f32-vbinary/gen/f32-vrsubc-minmax-rvv-u8v.c rename to src/f32-vbinary/gen/f32-vrsubc-rvv-u8v.c index 721b1ae7a0a..694aad4cc99 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-rvv-u8v.c +++ b/src/f32-vbinary/gen/f32-vrsubc-rvv-u8v.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrsubc_minmax_ukernel__rvv_u8v( +void xnn_f32_vrsubc_ukernel__rvv_u8v( size_t 
batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vrsubc_minmax_ukernel__rvv_u8v( assert(input_b != NULL); assert(output != NULL); - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; const float b = *input_b; size_t n = batch >> 2; @@ -39,8 +37,6 @@ void xnn_f32_vrsubc_minmax_ukernel__rvv_u8v( vfloat32m8_t va = __riscv_vle32_v_f32m8(input_a, vl); input_a += vl; vfloat32m8_t vacc = __riscv_vfrsub_vf_f32m8(va, b, vl); - vacc = __riscv_vfmax_vf_f32m8(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m8(vacc, output_max, vl); __riscv_vse32_v_f32m8(output, vacc, vl); output += vl; } while (n > 0); diff --git a/src/f32-vbinary/gen/f32-vrsubc-scalar-u2.c b/src/f32-vbinary/gen/f32-vrsubc-scalar-u2.c index 5910c2c44c9..6a10fd40308 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vrsubc-scalar-u2.c @@ -38,7 +38,6 @@ void xnn_f32_vrsubc_ukernel__scalar_u2( float vacc1 = vb - va1; - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vrsubc-scalar-u4.c b/src/f32-vbinary/gen/f32-vrsubc-scalar-u4.c index 2f1b2d848b7..bba8ab22f86 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vrsubc-scalar-u4.c @@ -42,7 +42,6 @@ void xnn_f32_vrsubc_ukernel__scalar_u4( float vacc3 = vb - va3; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vrsubc-scalar-u8.c b/src/f32-vbinary/gen/f32-vrsubc-scalar-u8.c index d6f0531d36d..f0f94336040 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vrsubc-scalar-u8.c @@ -50,7 +50,6 @@ void xnn_f32_vrsubc_ukernel__scalar_u8( float vacc7 = vb - va7; - output[0] = vacc0; output[1] 
= vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-sse-u4.c b/src/f32-vbinary/gen/f32-vrsubc-sse-u4.c similarity index 71% rename from src/f32-vbinary/gen/f32-vrsubc-minmax-sse-u4.c rename to src/f32-vbinary/gen/f32-vrsubc-sse-u4.c index 191d2c9b0a4..86875b5a93f 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-sse-u4.c +++ b/src/f32-vbinary/gen/f32-vrsubc-sse-u4.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrsubc_minmax_ukernel__sse_u4( +void xnn_f32_vrsubc_ukernel__sse_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,10 +29,6 @@ void xnn_f32_vrsubc_minmax_ukernel__sse_u4( assert(input_b != NULL); assert(output != NULL); - const __m128 voutput_min = _mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const __m128 vb = _mm_load1_ps(input_b); for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { @@ -40,8 +36,6 @@ void xnn_f32_vrsubc_minmax_ukernel__sse_u4( input_a += 4; __m128 vacc = _mm_sub_ps(vb, va); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -50,8 +44,6 @@ void xnn_f32_vrsubc_minmax_ukernel__sse_u4( const __m128 va = _mm_loadu_ps(input_a); __m128 vacc = _mm_sub_ps(vb, va); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); vacc = _mm_movehl_ps(vacc, vacc); diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-sse-u8.c b/src/f32-vbinary/gen/f32-vrsubc-sse-u8.c similarity index 70% rename from src/f32-vbinary/gen/f32-vrsubc-minmax-sse-u8.c 
rename to src/f32-vbinary/gen/f32-vrsubc-sse-u8.c index 82c2e26a7cb..d88a70ec286 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-sse-u8.c +++ b/src/f32-vbinary/gen/f32-vrsubc-sse-u8.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrsubc_minmax_ukernel__sse_u8( +void xnn_f32_vrsubc_ukernel__sse_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,10 +29,6 @@ void xnn_f32_vrsubc_minmax_ukernel__sse_u8( assert(input_b != NULL); assert(output != NULL); - const __m128 voutput_min = _mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const __m128 vb = _mm_load1_ps(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -44,12 +40,6 @@ void xnn_f32_vrsubc_minmax_ukernel__sse_u8( __m128 vacc1 = _mm_sub_ps(vb, va1); - vacc0 = _mm_max_ps(vacc0, voutput_min); - vacc1 = _mm_max_ps(vacc1, voutput_min); - - vacc0 = _mm_min_ps(vacc0, voutput_max); - vacc1 = _mm_min_ps(vacc1, voutput_max); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; @@ -59,8 +49,6 @@ void xnn_f32_vrsubc_minmax_ukernel__sse_u8( input_a += 4; __m128 vacc = _mm_sub_ps(vb, va); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -69,8 +57,6 @@ void xnn_f32_vrsubc_minmax_ukernel__sse_u8( const __m128 va = _mm_loadu_ps(input_a); __m128 vacc = _mm_sub_ps(vb, va); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); vacc = _mm_movehl_ps(vacc, vacc); diff --git 
a/src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u1.c b/src/f32-vbinary/gen/f32-vrsubc-wasm-u1.c similarity index 72% rename from src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u1.c rename to src/f32-vbinary/gen/f32-vrsubc-wasm-u1.c index 2ed66184a86..c847364de31 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u1.c +++ b/src/f32-vbinary/gen/f32-vrsubc-wasm-u1.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrsubc_minmax_ukernel__scalar_u1( +void xnn_f32_vrsubc_ukernel__wasm_u1( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,15 +27,11 @@ void xnn_f32_vrsubc_minmax_ukernel__scalar_u1( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; float vacc = vb - va; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; } } diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u2.c b/src/f32-vbinary/gen/f32-vrsubc-wasm-u2.c similarity index 68% rename from src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u2.c rename to src/f32-vbinary/gen/f32-vrsubc-wasm-u2.c index 2353cec9e7b..056a3cdab6f 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vrsubc-wasm-u2.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrsubc_minmax_ukernel__scalar_u2( +void xnn_f32_vrsubc_ukernel__wasm_u2( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 
0); assert(batch % sizeof(float) == 0); @@ -27,8 +27,6 @@ void xnn_f32_vrsubc_minmax_ukernel__scalar_u2( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { @@ -40,12 +38,6 @@ void xnn_f32_vrsubc_minmax_ukernel__scalar_u2( float vacc1 = vb - va1; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - output[0] = vacc0; output[1] = vacc1; output += 2; @@ -54,8 +46,6 @@ void xnn_f32_vrsubc_minmax_ukernel__scalar_u2( assert(batch == sizeof(float)); const float va = *input_a; float vacc = vb - va; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output = vacc; } } diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u4.c b/src/f32-vbinary/gen/f32-vrsubc-wasm-u4.c similarity index 65% rename from src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u4.c rename to src/f32-vbinary/gen/f32-vrsubc-wasm-u4.c index 25a82f19d64..6a843d4c367 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vrsubc-wasm-u4.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrsubc_minmax_ukernel__scalar_u4( +void xnn_f32_vrsubc_ukernel__wasm_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,8 +27,6 @@ void xnn_f32_vrsubc_minmax_ukernel__scalar_u4( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= 4 * sizeof(float); batch -= 4 * 
sizeof(float)) { @@ -44,16 +42,6 @@ void xnn_f32_vrsubc_minmax_ukernel__scalar_u4( float vacc3 = vb - va3; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - vacc2 = math_max_f32(vacc2, voutput_min); - vacc3 = math_max_f32(vacc3, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - vacc2 = math_min_f32(vacc2, voutput_max); - vacc3 = math_min_f32(vacc3, voutput_max); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; @@ -64,8 +52,6 @@ void xnn_f32_vrsubc_minmax_ukernel__scalar_u4( do { const float va = *input_a++; float vacc = vb - va; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } while (batch != 0); diff --git a/src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u8.c b/src/f32-vbinary/gen/f32-vrsubc-wasm-u8.c similarity index 60% rename from src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u8.c rename to src/f32-vbinary/gen/f32-vrsubc-wasm-u8.c index 5eb25ef8f45..6e8e9d3c334 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vrsubc-wasm-u8.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vrsubc_minmax_ukernel__scalar_u8( +void xnn_f32_vrsubc_ukernel__wasm_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,8 +27,6 @@ void xnn_f32_vrsubc_minmax_ukernel__scalar_u8( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -52,24 +50,6 @@ void xnn_f32_vrsubc_minmax_ukernel__scalar_u8( float vacc7 = vb - va7; - vacc0 = 
math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - vacc2 = math_max_f32(vacc2, voutput_min); - vacc3 = math_max_f32(vacc3, voutput_min); - vacc4 = math_max_f32(vacc4, voutput_min); - vacc5 = math_max_f32(vacc5, voutput_min); - vacc6 = math_max_f32(vacc6, voutput_min); - vacc7 = math_max_f32(vacc7, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - vacc2 = math_min_f32(vacc2, voutput_max); - vacc3 = math_min_f32(vacc3, voutput_max); - vacc4 = math_min_f32(vacc4, voutput_max); - vacc5 = math_min_f32(vacc5, voutput_max); - vacc6 = math_min_f32(vacc6, voutput_max); - vacc7 = math_min_f32(vacc7, voutput_max); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; @@ -84,8 +64,6 @@ void xnn_f32_vrsubc_minmax_ukernel__scalar_u8( do { const float va = *input_a++; float vacc = vb - va; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } while (batch != 0); diff --git a/src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u16.c b/src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u16.c index 7536afe9463..97d50a64982 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u16.c +++ b/src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u16.c @@ -43,7 +43,6 @@ void xnn_f32_vrsubc_ukernel__wasmsimd_u16( v128_t vy3 = wasm_f32x4_sub(vb, va3); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); @@ -56,7 +55,6 @@ void xnn_f32_vrsubc_ukernel__wasmsimd_u16( v128_t vy = wasm_f32x4_sub(vb, va); - wasm_v128_store(output, vy); output += 4; } @@ -65,7 +63,6 @@ void xnn_f32_vrsubc_ukernel__wasmsimd_u16( v128_t vy = wasm_f32x4_sub(vb, va); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u4.c b/src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u4.c index 083f20fb83d..11f77f32b86 100644 --- 
a/src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u4.c +++ b/src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u4.c @@ -36,7 +36,6 @@ void xnn_f32_vrsubc_ukernel__wasmsimd_u4( v128_t vy = wasm_f32x4_sub(vb, va); - wasm_v128_store(output, vy); output += 4; } @@ -45,7 +44,6 @@ void xnn_f32_vrsubc_ukernel__wasmsimd_u4( v128_t vy = wasm_f32x4_sub(vb, va); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u8.c b/src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u8.c index 6ac856c22c3..de7784f20b1 100644 --- a/src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u8.c +++ b/src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u8.c @@ -39,7 +39,6 @@ void xnn_f32_vrsubc_ukernel__wasmsimd_u8( v128_t vy1 = wasm_f32x4_sub(vb, va1); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); output += 8; @@ -50,7 +49,6 @@ void xnn_f32_vrsubc_ukernel__wasmsimd_u8( v128_t vy = wasm_f32x4_sub(vb, va); - wasm_v128_store(output, vy); output += 4; } @@ -59,7 +57,6 @@ void xnn_f32_vrsubc_ukernel__wasmsimd_u8( v128_t vy = wasm_f32x4_sub(vb, va); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vsqrdiff-avx-u16.c b/src/f32-vbinary/gen/f32-vsqrdiff-avx-u16.c index 303065268bd..6473517bdd4 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiff-avx-u16.c +++ b/src/f32-vbinary/gen/f32-vsqrdiff-avx-u16.c @@ -30,7 +30,6 @@ void xnn_f32_vsqrdiff_ukernel__avx_u16( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m256 va0 = _mm256_loadu_ps(input_a); const __m256 va1 = _mm256_loadu_ps(input_a + 8); @@ -43,7 +42,6 @@ void xnn_f32_vsqrdiff_ukernel__avx_u16( vacc0 = _mm256_mul_ps(vacc0, vacc0); vacc1 = _mm256_mul_ps(vacc1, vacc1); - _mm256_storeu_ps(output, vacc0); _mm256_storeu_ps(output + 8, vacc1); output += 16; diff 
--git a/src/f32-vbinary/gen/f32-vsqrdiff-avx-u8.c b/src/f32-vbinary/gen/f32-vsqrdiff-avx-u8.c index 3ca99e60c30..59167539f3b 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiff-avx-u8.c +++ b/src/f32-vbinary/gen/f32-vsqrdiff-avx-u8.c @@ -30,7 +30,6 @@ void xnn_f32_vsqrdiff_ukernel__avx_u8( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m256 va = _mm256_loadu_ps(input_a); input_a += 8; diff --git a/src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u16.c b/src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u16.c index 70fbf6b02c1..70aaf1c9f31 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u16.c +++ b/src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u16.c @@ -30,7 +30,6 @@ void xnn_f32_vsqrdiff_ukernel__avx512f_u16( assert(output != NULL); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m512 va = _mm512_loadu_ps(input_a); input_a += 16; @@ -40,7 +39,6 @@ void xnn_f32_vsqrdiff_ukernel__avx512f_u16( vacc = _mm512_mul_ps(vacc, vacc); - _mm512_storeu_ps(output, vacc); output += 16; } diff --git a/src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u32.c b/src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u32.c index 6abac034c23..b0e3de9132c 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u32.c +++ b/src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u32.c @@ -30,7 +30,6 @@ void xnn_f32_vsqrdiff_ukernel__avx512f_u32( assert(output != NULL); - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { const __m512 va0 = _mm512_loadu_ps(input_a); const __m512 va1 = _mm512_loadu_ps(input_a + 16); @@ -43,7 +42,6 @@ void xnn_f32_vsqrdiff_ukernel__avx512f_u32( vacc0 = _mm512_mul_ps(vacc0, vacc0); vacc1 = _mm512_mul_ps(vacc1, vacc1); - _mm512_storeu_ps(output, vacc0); _mm512_storeu_ps(output + 16, vacc1); output += 32; @@ -57,7 +55,6 @@ void xnn_f32_vsqrdiff_ukernel__avx512f_u32( vacc = _mm512_mul_ps(vacc, vacc); - _mm512_storeu_ps(output, vacc); output += 16; } diff --git 
a/src/f32-vbinary/gen/f32-vsqrdiff-hvx-u128.c b/src/f32-vbinary/gen/f32-vsqrdiff-hvx-u128.c index 4262c90d69a..4b85d4abe01 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiff-hvx-u128.c +++ b/src/f32-vbinary/gen/f32-vsqrdiff-hvx-u128.c @@ -23,7 +23,6 @@ void xnn_f32_vsqrdiff_ukernel__hvx_u128( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) { HVX_Vector va0 = xnn_loadu_f32(input_a); HVX_Vector va1 = xnn_loadu_f32(input_a + 32); @@ -46,7 +45,6 @@ void xnn_f32_vsqrdiff_ukernel__hvx_u128( vacc2 = xnn_mul_f32(vacc2, vacc2); vacc3 = xnn_mul_f32(vacc3, vacc3); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); xnn_storeu_f32(output + 64, vacc2); @@ -71,7 +69,7 @@ void xnn_f32_vsqrdiff_ukernel__hvx_u128( HVX_Vector vacc = xnn_sub_f32(va, vb); vacc = xnn_mul_f32(vacc, vacc); - + Q6_V_vstu_variable(output, batch, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vsqrdiff-hvx-u32.c b/src/f32-vbinary/gen/f32-vsqrdiff-hvx-u32.c index ed7948ffa80..e0821ff4a1d 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiff-hvx-u32.c +++ b/src/f32-vbinary/gen/f32-vsqrdiff-hvx-u32.c @@ -23,7 +23,6 @@ void xnn_f32_vsqrdiff_ukernel__hvx_u32( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { HVX_Vector va = xnn_loadu_f32(input_a); HVX_Vector vb = xnn_loadu_f32(input_b); @@ -42,7 +41,7 @@ void xnn_f32_vsqrdiff_ukernel__hvx_u32( HVX_Vector vacc = xnn_sub_f32(va, vb); vacc = xnn_mul_f32(vacc, vacc); - + Q6_V_vstu_variable(output, batch, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vsqrdiff-hvx-u64.c b/src/f32-vbinary/gen/f32-vsqrdiff-hvx-u64.c index 12651bbc197..e7a79820cd5 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiff-hvx-u64.c +++ b/src/f32-vbinary/gen/f32-vsqrdiff-hvx-u64.c @@ -23,7 +23,6 @@ void xnn_f32_vsqrdiff_ukernel__hvx_u64( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { 
HVX_Vector va0 = xnn_loadu_f32(input_a); HVX_Vector va1 = xnn_loadu_f32(input_a + 32); @@ -38,7 +37,6 @@ void xnn_f32_vsqrdiff_ukernel__hvx_u64( vacc0 = xnn_mul_f32(vacc0, vacc0); vacc1 = xnn_mul_f32(vacc1, vacc1); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); output += 64; @@ -61,7 +59,7 @@ void xnn_f32_vsqrdiff_ukernel__hvx_u64( HVX_Vector vacc = xnn_sub_f32(va, vb); vacc = xnn_mul_f32(vacc, vacc); - + Q6_V_vstu_variable(output, batch, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vsqrdiff-neon-u4.c b/src/f32-vbinary/gen/f32-vsqrdiff-neon-u4.c index 5a0f8c168f3..27ec5e20d7c 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiff-neon-u4.c +++ b/src/f32-vbinary/gen/f32-vsqrdiff-neon-u4.c @@ -28,7 +28,6 @@ void xnn_f32_vsqrdiff_ukernel__neon_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; const float32x4_t vb = vld1q_f32(input_b); input_b += 4; @@ -36,7 +35,6 @@ void xnn_f32_vsqrdiff_ukernel__neon_u4( float32x4_t vacc = vsubq_f32(va, vb); vacc = vmulq_f32(vacc, vacc); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -46,7 +44,6 @@ void xnn_f32_vsqrdiff_ukernel__neon_u4( float32x4_t vacc = vsubq_f32(va, vb); vacc = vmulq_f32(vacc, vacc); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vsqrdiff-neon-u8.c b/src/f32-vbinary/gen/f32-vsqrdiff-neon-u8.c index 62ceed84766..8f754c2164c 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiff-neon-u8.c +++ b/src/f32-vbinary/gen/f32-vsqrdiff-neon-u8.c @@ -28,7 +28,6 @@ void xnn_f32_vsqrdiff_ukernel__neon_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb0 = vld1q_f32(input_b); input_b += 4; @@ -41,7 +40,6 @@ void 
xnn_f32_vsqrdiff_ukernel__neon_u8( vacc0 = vmulq_f32(vacc0, vacc0); vacc1 = vmulq_f32(vacc1, vacc1); - vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } @@ -52,7 +50,6 @@ void xnn_f32_vsqrdiff_ukernel__neon_u8( float32x4_t vacc = vsubq_f32(va, vb); vacc = vmulq_f32(vacc, vacc); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -62,7 +59,6 @@ void xnn_f32_vsqrdiff_ukernel__neon_u8( float32x4_t vacc = vsubq_f32(va, vb); vacc = vmulq_f32(vacc, vacc); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vsqrdiff-scalar-u1.c b/src/f32-vbinary/gen/f32-vsqrdiff-scalar-u1.c index 0e90b54ab51..857ed6c4ea0 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiff-scalar-u1.c +++ b/src/f32-vbinary/gen/f32-vsqrdiff-scalar-u1.c @@ -27,7 +27,6 @@ void xnn_f32_vsqrdiff_ukernel__scalar_u1( assert(input_b != NULL); assert(output != NULL); - for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; const float vb = *input_b++; diff --git a/src/f32-vbinary/gen/f32-vsqrdiff-scalar-u2.c b/src/f32-vbinary/gen/f32-vsqrdiff-scalar-u2.c index 7e77a485d48..a66279affb6 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiff-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vsqrdiff-scalar-u2.c @@ -27,7 +27,6 @@ void xnn_f32_vsqrdiff_ukernel__scalar_u2( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -43,7 +42,6 @@ void xnn_f32_vsqrdiff_ukernel__scalar_u2( vacc0 = vacc0 * vacc0; vacc1 = vacc1 * vacc1; - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vsqrdiff-scalar-u4.c b/src/f32-vbinary/gen/f32-vsqrdiff-scalar-u4.c index 18184c8ca65..248853ac424 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiff-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vsqrdiff-scalar-u4.c @@ -27,7 +27,6 @@ 
void xnn_f32_vsqrdiff_ukernel__scalar_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -51,7 +50,6 @@ void xnn_f32_vsqrdiff_ukernel__scalar_u4( vacc2 = vacc2 * vacc2; vacc3 = vacc3 * vacc3; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vsqrdiff-scalar-u8.c b/src/f32-vbinary/gen/f32-vsqrdiff-scalar-u8.c index cea104f067a..4807a9084be 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiff-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vsqrdiff-scalar-u8.c @@ -27,7 +27,6 @@ void xnn_f32_vsqrdiff_ukernel__scalar_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -67,7 +66,6 @@ void xnn_f32_vsqrdiff_ukernel__scalar_u8( vacc6 = vacc6 * vacc6; vacc7 = vacc7 * vacc7; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vsqrdiff-sse-u4.c b/src/f32-vbinary/gen/f32-vsqrdiff-sse-u4.c index d4dec55b21d..31bb8131622 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiff-sse-u4.c +++ b/src/f32-vbinary/gen/f32-vsqrdiff-sse-u4.c @@ -29,7 +29,6 @@ void xnn_f32_vsqrdiff_ukernel__sse_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const __m128 va = _mm_loadu_ps(input_a); input_a += 4; diff --git a/src/f32-vbinary/gen/f32-vsqrdiff-sse-u8.c b/src/f32-vbinary/gen/f32-vsqrdiff-sse-u8.c index 8e6e426ebd2..78fa9ccab69 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiff-sse-u8.c +++ b/src/f32-vbinary/gen/f32-vsqrdiff-sse-u8.c @@ -29,7 +29,6 @@ void xnn_f32_vsqrdiff_ukernel__sse_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m128 va0 = _mm_loadu_ps(input_a); const __m128 va1 = _mm_loadu_ps(input_a + 4); @@ -45,7 +44,6 
@@ void xnn_f32_vsqrdiff_ukernel__sse_u8( vacc0 = _mm_mul_ps(vacc0, vacc0); vacc1 = _mm_mul_ps(vacc1, vacc1); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; diff --git a/src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u16.c b/src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u16.c index 5c22e01943e..2e630ce7e15 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u16.c +++ b/src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u16.c @@ -28,7 +28,6 @@ void xnn_f32_vsqrdiff_ukernel__wasmsimd_u16( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); @@ -68,7 +67,6 @@ void xnn_f32_vsqrdiff_ukernel__wasmsimd_u16( v128_t vacc = wasm_f32x4_sub(va, vb); vacc = wasm_f32x4_mul(vacc, vacc); - wasm_v128_store(output, vacc); output += 4; } @@ -79,7 +77,6 @@ void xnn_f32_vsqrdiff_ukernel__wasmsimd_u16( v128_t vacc = wasm_f32x4_sub(va, vb); vacc = wasm_f32x4_mul(vacc, vacc); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u4.c b/src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u4.c index 310cad2341e..33e3da0180a 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u4.c +++ b/src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u4.c @@ -28,7 +28,6 @@ void xnn_f32_vsqrdiff_ukernel__wasmsimd_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; @@ -39,7 +38,6 @@ void xnn_f32_vsqrdiff_ukernel__wasmsimd_u4( v128_t vacc = wasm_f32x4_sub(va, vb); vacc = wasm_f32x4_mul(vacc, vacc); - wasm_v128_store(output, vacc); output += 4; } @@ -50,7 +48,6 @@ void xnn_f32_vsqrdiff_ukernel__wasmsimd_u4( v128_t vacc = wasm_f32x4_sub(va, vb); vacc = wasm_f32x4_mul(vacc, vacc); - if (batch & (2 * 
sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u8.c b/src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u8.c index 000678d7ce7..1d5f8119788 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u8.c +++ b/src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u8.c @@ -28,7 +28,6 @@ void xnn_f32_vsqrdiff_ukernel__wasmsimd_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); @@ -58,7 +57,6 @@ void xnn_f32_vsqrdiff_ukernel__wasmsimd_u8( v128_t vacc = wasm_f32x4_sub(va, vb); vacc = wasm_f32x4_mul(vacc, vacc); - wasm_v128_store(output, vacc); output += 4; } @@ -69,7 +67,6 @@ void xnn_f32_vsqrdiff_ukernel__wasmsimd_u8( v128_t vacc = wasm_f32x4_sub(va, vb); vacc = wasm_f32x4_mul(vacc, vacc); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vsqrdiffc-avx-u16.c b/src/f32-vbinary/gen/f32-vsqrdiffc-avx-u16.c index 3b151bd05ac..ea696386506 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiffc-avx-u16.c +++ b/src/f32-vbinary/gen/f32-vsqrdiffc-avx-u16.c @@ -43,7 +43,6 @@ void xnn_f32_vsqrdiffc_ukernel__avx_u16( vacc0 = _mm256_mul_ps(vacc0, vacc0); vacc1 = _mm256_mul_ps(vacc1, vacc1); - _mm256_storeu_ps(output, vacc0); _mm256_storeu_ps(output + 8, vacc1); output += 16; diff --git a/src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u16.c b/src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u16.c index 5df28a3fc6b..a8ecdc8e2c3 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u16.c +++ b/src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u16.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
+ #include #include @@ -39,7 +40,6 @@ void xnn_f32_vsqrdiffc_ukernel__avx512f_u16( vacc0 = _mm512_mul_ps(vacc0, vacc0); - _mm512_storeu_ps(output, vacc0); output += 16; } diff --git a/src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u32.c b/src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u32.c index be2f4000958..63809791d10 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u32.c +++ b/src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u32.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. + #include #include @@ -42,7 +43,6 @@ void xnn_f32_vsqrdiffc_ukernel__avx512f_u32( vacc0 = _mm512_mul_ps(vacc0, vacc0); vacc1 = _mm512_mul_ps(vacc1, vacc1); - _mm512_storeu_ps(output, vacc0); _mm512_storeu_ps(output + 16, vacc1); output += 32; @@ -54,7 +54,6 @@ void xnn_f32_vsqrdiffc_ukernel__avx512f_u32( __m512 vacc = _mm512_sub_ps(va, vb); vacc = _mm512_mul_ps(vacc, vacc); - _mm512_storeu_ps(output, vacc); output += 16; } diff --git a/src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u128.c b/src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u128.c index 2596b04a246..2f9187dc4a4 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u128.c +++ b/src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u128.c @@ -42,7 +42,6 @@ void xnn_f32_vsqrdiffc_ukernel__hvx_u128( vacc2 = xnn_mul_f32(vacc2, vacc2); vacc3 = xnn_mul_f32(vacc3, vacc3); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); xnn_storeu_f32(output + 64, vacc2); diff --git a/src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u64.c b/src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u64.c index 52abfec03c4..0d800405e92 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u64.c +++ b/src/f32-vbinary/gen/f32-vsqrdiffc-hvx-u64.c @@ -36,7 +36,6 @@ void xnn_f32_vsqrdiffc_ukernel__hvx_u64( vacc0 = xnn_mul_f32(vacc0, vacc0); vacc1 = xnn_mul_f32(vacc1, vacc1); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); output += 64; diff --git a/src/f32-vbinary/gen/f32-vsqrdiffc-neon-u4.c 
b/src/f32-vbinary/gen/f32-vsqrdiffc-neon-u4.c index 31da38aff1f..943b32a88ce 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiffc-neon-u4.c +++ b/src/f32-vbinary/gen/f32-vsqrdiffc-neon-u4.c @@ -36,7 +36,6 @@ void xnn_f32_vsqrdiffc_ukernel__neon_u4( float32x4_t vacc = vsubq_f32(va, vb); vacc = vmulq_f32(vacc, vacc); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -45,7 +44,6 @@ void xnn_f32_vsqrdiffc_ukernel__neon_u4( float32x4_t vacc = vsubq_f32(va, vb); vacc = vmulq_f32(vacc, vacc); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vsqrdiffc-neon-u8.c b/src/f32-vbinary/gen/f32-vsqrdiffc-neon-u8.c index e22ab3f4a64..7cd0bec61f8 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiffc-neon-u8.c +++ b/src/f32-vbinary/gen/f32-vsqrdiffc-neon-u8.c @@ -40,7 +40,6 @@ void xnn_f32_vsqrdiffc_ukernel__neon_u8( vacc0 = vmulq_f32(vacc0, vacc0); vacc1 = vmulq_f32(vacc1, vacc1); - vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } @@ -50,7 +49,6 @@ void xnn_f32_vsqrdiffc_ukernel__neon_u8( float32x4_t vacc = vsubq_f32(va, vb); vacc = vmulq_f32(vacc, vacc); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -59,7 +57,6 @@ void xnn_f32_vsqrdiffc_ukernel__neon_u8( float32x4_t vacc = vsubq_f32(va, vb); vacc = vmulq_f32(vacc, vacc); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u2.c b/src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u2.c index 028b7d1c43e..280542bb752 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u2.c @@ -40,7 +40,6 @@ void xnn_f32_vsqrdiffc_ukernel__scalar_u2( vacc0 = vacc0 * vacc0; vacc1 = vacc1 * vacc1; - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u4.c 
b/src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u4.c index ac1030aeb97..fc037f6205d 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u4.c @@ -46,7 +46,6 @@ void xnn_f32_vsqrdiffc_ukernel__scalar_u4( vacc2 = vacc2 * vacc2; vacc3 = vacc3 * vacc3; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u8.c b/src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u8.c index 0ffe5c7fcdb..b283b77c885 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u8.c @@ -58,7 +58,6 @@ void xnn_f32_vsqrdiffc_ukernel__scalar_u8( vacc6 = vacc6 * vacc6; vacc7 = vacc7 * vacc7; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vsqrdiffc-sse-u8.c b/src/f32-vbinary/gen/f32-vsqrdiffc-sse-u8.c index d707477f630..0ff2dcdcc58 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiffc-sse-u8.c +++ b/src/f32-vbinary/gen/f32-vsqrdiffc-sse-u8.c @@ -42,7 +42,6 @@ void xnn_f32_vsqrdiffc_ukernel__sse_u8( vacc0 = _mm_mul_ps(vacc0, vacc0); vacc1 = _mm_mul_ps(vacc1, vacc1); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; diff --git a/src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u16.c b/src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u16.c index fd67b9519ef..8ecd3ba925d 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u16.c +++ b/src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u16.c @@ -47,7 +47,6 @@ void xnn_f32_vsqrdiffc_ukernel__wasmsimd_u16( vy2 = wasm_f32x4_mul(vy2, vy2); vy3 = wasm_f32x4_mul(vy3, vy3); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); @@ -61,7 +60,6 @@ void xnn_f32_vsqrdiffc_ukernel__wasmsimd_u16( v128_t vy = wasm_f32x4_sub(va, vb); vy = wasm_f32x4_mul(vy, vy); - wasm_v128_store(output, vy); output += 4; } @@ -71,7 +69,6 @@ void xnn_f32_vsqrdiffc_ukernel__wasmsimd_u16( v128_t vy = wasm_f32x4_sub(va, vb); vy = 
wasm_f32x4_mul(vy, vy); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u4.c b/src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u4.c index 3dd17a40f12..96573fc3199 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u4.c +++ b/src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u4.c @@ -37,7 +37,6 @@ void xnn_f32_vsqrdiffc_ukernel__wasmsimd_u4( v128_t vy = wasm_f32x4_sub(va, vb); vy = wasm_f32x4_mul(vy, vy); - wasm_v128_store(output, vy); output += 4; } @@ -47,7 +46,6 @@ void xnn_f32_vsqrdiffc_ukernel__wasmsimd_u4( v128_t vy = wasm_f32x4_sub(va, vb); vy = wasm_f32x4_mul(vy, vy); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u8.c b/src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u8.c index f3f661b3440..010cdda8d5d 100644 --- a/src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u8.c +++ b/src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u8.c @@ -41,7 +41,6 @@ void xnn_f32_vsqrdiffc_ukernel__wasmsimd_u8( vy0 = wasm_f32x4_mul(vy0, vy0); vy1 = wasm_f32x4_mul(vy1, vy1); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); output += 8; @@ -53,7 +52,6 @@ void xnn_f32_vsqrdiffc_ukernel__wasmsimd_u8( v128_t vy = wasm_f32x4_sub(va, vb); vy = wasm_f32x4_mul(vy, vy); - wasm_v128_store(output, vy); output += 4; } @@ -63,7 +61,6 @@ void xnn_f32_vsqrdiffc_ukernel__wasmsimd_u8( v128_t vy = wasm_f32x4_sub(va, vb); vy = wasm_f32x4_mul(vy, vy); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-avx-u16.c b/src/f32-vbinary/gen/f32-vsub-avx-u16.c similarity index 76% rename from src/f32-vbinary/gen/f32-vsub-minmax-avx-u16.c rename to src/f32-vbinary/gen/f32-vsub-avx-u16.c index d745748b161..10bd811db99 100644 --- 
a/src/f32-vbinary/gen/f32-vsub-minmax-avx-u16.c +++ b/src/f32-vbinary/gen/f32-vsub-avx-u16.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsub_minmax_ukernel__avx_u16( +void xnn_f32_vsub_ukernel__avx_u16( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -30,11 +30,6 @@ void xnn_f32_vsub_minmax_ukernel__avx_u16( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m256 va0 = _mm256_loadu_ps(input_a); const __m256 va1 = _mm256_loadu_ps(input_a + 8); @@ -45,12 +40,6 @@ void xnn_f32_vsub_minmax_ukernel__avx_u16( input_b += 16; - vacc0 = _mm256_max_ps(voutput_min, vacc0); - vacc1 = _mm256_max_ps(voutput_min, vacc1); - - vacc0 = _mm256_min_ps(voutput_max, vacc0); - vacc1 = _mm256_min_ps(voutput_max, vacc1); - _mm256_storeu_ps(output, vacc0); _mm256_storeu_ps(output + 8, vacc1); output += 16; @@ -61,8 +50,6 @@ void xnn_f32_vsub_minmax_ukernel__avx_u16( __m256 vacc = _mm256_sub_ps(va, _mm256_loadu_ps(input_b)); input_b += 8; - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); _mm256_storeu_ps(output, vacc); output += 8; } @@ -75,8 +62,6 @@ void xnn_f32_vsub_minmax_ukernel__avx_u16( const __m256 vb = _mm256_maskload_ps(input_b, vmask); __m256 vacc = _mm256_sub_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); __m128 vacc_lo = _mm256_castps256_ps128(vacc); if (batch & (4 * sizeof(float))) { diff --git 
a/src/f32-vbinary/gen/f32-vsub-minmax-avx-u8.c b/src/f32-vbinary/gen/f32-vsub-avx-u8.c similarity index 78% rename from src/f32-vbinary/gen/f32-vsub-minmax-avx-u8.c rename to src/f32-vbinary/gen/f32-vsub-avx-u8.c index fbd7a641985..593762276fc 100644 --- a/src/f32-vbinary/gen/f32-vsub-minmax-avx-u8.c +++ b/src/f32-vbinary/gen/f32-vsub-avx-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsub_minmax_ukernel__avx_u8( +void xnn_f32_vsub_ukernel__avx_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -30,19 +30,12 @@ void xnn_f32_vsub_minmax_ukernel__avx_u8( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m256 va = _mm256_loadu_ps(input_a); input_a += 8; __m256 vacc = _mm256_sub_ps(va, _mm256_loadu_ps(input_b)); input_b += 8; - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); _mm256_storeu_ps(output, vacc); output += 8; } @@ -55,8 +48,6 @@ void xnn_f32_vsub_minmax_ukernel__avx_u8( const __m256 vb = _mm256_maskload_ps(input_b, vmask); __m256 vacc = _mm256_sub_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); __m128 vacc_lo = _mm256_castps256_ps128(vacc); if (batch & (4 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-avx512f-u16.c b/src/f32-vbinary/gen/f32-vsub-avx512f-u16.c similarity index 75% rename from src/f32-vbinary/gen/f32-vsub-minmax-avx512f-u16.c rename to 
src/f32-vbinary/gen/f32-vsub-avx512f-u16.c index fb5f2e45557..a037c66fe97 100644 --- a/src/f32-vbinary/gen/f32-vsub-minmax-avx512f-u16.c +++ b/src/f32-vbinary/gen/f32-vsub-avx512f-u16.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsub_minmax_ukernel__avx512f_u16( +void xnn_f32_vsub_ukernel__avx512f_u16( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,9 +29,6 @@ void xnn_f32_vsub_minmax_ukernel__avx512f_u16( assert(input_b != NULL); assert(output != NULL); - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m512 va = _mm512_loadu_ps(input_a); @@ -41,9 +38,6 @@ void xnn_f32_vsub_minmax_ukernel__avx512f_u16( input_b += 16; - vacc = _mm512_max_ps(voutput_min, vacc); - vacc = _mm512_min_ps(voutput_max, vacc); - _mm512_storeu_ps(output, vacc); output += 16; } @@ -57,8 +51,6 @@ void xnn_f32_vsub_minmax_ukernel__avx512f_u16( const __m512 va = _mm512_maskz_loadu_ps(vmask, input_a); __m512 vacc = _mm512_maskz_sub_ps(vmask, va, _mm512_maskz_loadu_ps(vmask, input_b)); - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-avx512f-u32.c b/src/f32-vbinary/gen/f32-vsub-avx512f-u32.c similarity index 74% rename from src/f32-vbinary/gen/f32-vsub-minmax-avx512f-u32.c rename to src/f32-vbinary/gen/f32-vsub-avx512f-u32.c index cfa533e03a5..7b6d03f4b51 100644 --- a/src/f32-vbinary/gen/f32-vsub-minmax-avx512f-u32.c +++ b/src/f32-vbinary/gen/f32-vsub-avx512f-u32.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void 
xnn_f32_vsub_minmax_ukernel__avx512f_u32( +void xnn_f32_vsub_ukernel__avx512f_u32( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,9 +29,6 @@ void xnn_f32_vsub_minmax_ukernel__avx512f_u32( assert(input_b != NULL); assert(output != NULL); - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { const __m512 va0 = _mm512_loadu_ps(input_a); @@ -43,12 +40,6 @@ void xnn_f32_vsub_minmax_ukernel__avx512f_u32( input_b += 32; - vacc0 = _mm512_max_ps(voutput_min, vacc0); - vacc1 = _mm512_max_ps(voutput_min, vacc1); - - vacc0 = _mm512_min_ps(voutput_max, vacc0); - vacc1 = _mm512_min_ps(voutput_max, vacc1); - _mm512_storeu_ps(output, vacc0); _mm512_storeu_ps(output + 16, vacc1); output += 32; @@ -61,9 +52,6 @@ void xnn_f32_vsub_minmax_ukernel__avx512f_u32( input_b += 16; - vacc = _mm512_max_ps(voutput_min, vacc); - vacc = _mm512_min_ps(voutput_max, vacc); - _mm512_storeu_ps(output, vacc); output += 16; } @@ -77,8 +65,6 @@ void xnn_f32_vsub_minmax_ukernel__avx512f_u32( const __m512 va = _mm512_maskz_loadu_ps(vmask, input_a); __m512 vacc = _mm512_maskz_sub_ps(vmask, va, _mm512_maskz_loadu_ps(vmask, input_b)); - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-hvx-u128.c b/src/f32-vbinary/gen/f32-vsub-hvx-u128.c similarity index 69% rename from src/f32-vbinary/gen/f32-vsub-minmax-hvx-u128.c rename to src/f32-vbinary/gen/f32-vsub-hvx-u128.c index b97434203a8..15a26e349c6 100644 --- a/src/f32-vbinary/gen/f32-vsub-minmax-hvx-u128.c +++ 
b/src/f32-vbinary/gen/f32-vsub-hvx-u128.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vsub_minmax_ukernel__hvx_u128( +void xnn_f32_vsub_ukernel__hvx_u128( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,9 +23,6 @@ void xnn_f32_vsub_minmax_ukernel__hvx_u128( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); - for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) { HVX_Vector va0 = xnn_loadu_f32(input_a); HVX_Vector va1 = xnn_loadu_f32(input_a + 32); @@ -44,16 +41,6 @@ void xnn_f32_vsub_minmax_ukernel__hvx_u128( HVX_Vector vacc3 = xnn_sub_f32(va3, vb3); - vacc0 = xnn_max_f32(vacc0, voutput_min); - vacc1 = xnn_max_f32(vacc1, voutput_min); - vacc2 = xnn_max_f32(vacc2, voutput_min); - vacc3 = xnn_max_f32(vacc3, voutput_min); - - vacc0 = xnn_min_f32(vacc0, voutput_max); - vacc1 = xnn_min_f32(vacc1, voutput_max); - vacc2 = xnn_min_f32(vacc2, voutput_max); - vacc3 = xnn_min_f32(vacc3, voutput_max); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); xnn_storeu_f32(output + 64, vacc2); @@ -67,8 +54,6 @@ void xnn_f32_vsub_minmax_ukernel__hvx_u128( input_b += 32; HVX_Vector vacc = xnn_sub_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output += 32; @@ -78,9 +63,7 @@ void xnn_f32_vsub_minmax_ukernel__hvx_u128( HVX_Vector vb = xnn_loadu_f32(input_b); HVX_Vector vacc = xnn_sub_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); - + Q6_V_vstu_variable(output, batch, vacc); } } diff --git 
a/src/f32-vbinary/gen/f32-vsub-minmax-hvx-u32.c b/src/f32-vbinary/gen/f32-vsub-hvx-u32.c similarity index 68% rename from src/f32-vbinary/gen/f32-vsub-minmax-hvx-u32.c rename to src/f32-vbinary/gen/f32-vsub-hvx-u32.c index c4f2ec0eda4..7597cba521e 100644 --- a/src/f32-vbinary/gen/f32-vsub-minmax-hvx-u32.c +++ b/src/f32-vbinary/gen/f32-vsub-hvx-u32.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vsub_minmax_ukernel__hvx_u32( +void xnn_f32_vsub_ukernel__hvx_u32( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,9 +23,6 @@ void xnn_f32_vsub_minmax_ukernel__hvx_u32( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { HVX_Vector va = xnn_loadu_f32(input_a); HVX_Vector vb = xnn_loadu_f32(input_b); @@ -33,8 +30,6 @@ void xnn_f32_vsub_minmax_ukernel__hvx_u32( input_b += 32; HVX_Vector vacc = xnn_sub_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output += 32; @@ -44,9 +39,7 @@ void xnn_f32_vsub_minmax_ukernel__hvx_u32( HVX_Vector vb = xnn_loadu_f32(input_b); HVX_Vector vacc = xnn_sub_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); - + Q6_V_vstu_variable(output, batch, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-hvx-u64.c b/src/f32-vbinary/gen/f32-vsub-hvx-u64.c similarity index 69% rename from src/f32-vbinary/gen/f32-vsub-minmax-hvx-u64.c rename to src/f32-vbinary/gen/f32-vsub-hvx-u64.c index c831236a856..17d1fdf7f16 100644 --- 
a/src/f32-vbinary/gen/f32-vsub-minmax-hvx-u64.c +++ b/src/f32-vbinary/gen/f32-vsub-hvx-u64.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vsub_minmax_ukernel__hvx_u64( +void xnn_f32_vsub_ukernel__hvx_u64( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,9 +23,6 @@ void xnn_f32_vsub_minmax_ukernel__hvx_u64( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); - for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { HVX_Vector va0 = xnn_loadu_f32(input_a); HVX_Vector va1 = xnn_loadu_f32(input_a + 32); @@ -38,12 +35,6 @@ void xnn_f32_vsub_minmax_ukernel__hvx_u64( HVX_Vector vacc1 = xnn_sub_f32(va1, vb1); - vacc0 = xnn_max_f32(vacc0, voutput_min); - vacc1 = xnn_max_f32(vacc1, voutput_min); - - vacc0 = xnn_min_f32(vacc0, voutput_max); - vacc1 = xnn_min_f32(vacc1, voutput_max); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); output += 64; @@ -55,8 +46,6 @@ void xnn_f32_vsub_minmax_ukernel__hvx_u64( input_b += 32; HVX_Vector vacc = xnn_sub_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output += 32; @@ -66,9 +55,7 @@ void xnn_f32_vsub_minmax_ukernel__hvx_u64( HVX_Vector vb = xnn_loadu_f32(input_b); HVX_Vector vacc = xnn_sub_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); - + Q6_V_vstu_variable(output, batch, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-wasm-u1.c b/src/f32-vbinary/gen/f32-vsub-minmax-wasm-u1.c deleted file mode 100644 index 08c5749c968..00000000000 --- 
a/src/f32-vbinary/gen/f32-vsub-minmax-wasm-u1.c +++ /dev/null @@ -1,41 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsub_minmax_ukernel__wasm_u1( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - - for (; batch >= sizeof(float); batch -= sizeof(float)) { - const float va = *input_a++; - const float vb = *input_b++; - float vacc = va - vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - } -} diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-wasm-u2.c b/src/f32-vbinary/gen/f32-vsub-minmax-wasm-u2.c deleted file mode 100644 index 9910ce59d43..00000000000 --- a/src/f32-vbinary/gen/f32-vsub-minmax-wasm-u2.c +++ /dev/null @@ -1,65 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsub_minmax_ukernel__wasm_u2( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - input_a += 2; - - const float vb0 = input_b[0]; - const float vb1 = input_b[1]; - input_b += 2; - - float vacc0 = va0 - vb0; - float vacc1 = va1 - vb1; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output += 2; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch == sizeof(float)); - const float va = *input_a; - const float vb = *input_b; - float vacc = va - vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output = vacc; - } -} diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-wasm-u4.c b/src/f32-vbinary/gen/f32-vsub-minmax-wasm-u4.c deleted file mode 100644 index 705539fc797..00000000000 --- a/src/f32-vbinary/gen/f32-vsub-minmax-wasm-u4.c +++ /dev/null @@ -1,79 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsub_minmax_ukernel__wasm_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - const float va2 = input_a[2]; - const float va3 = input_a[3]; - input_a += 4; - - const float vb0 = input_b[0]; - const float vb1 = input_b[1]; - const float vb2 = input_b[2]; - const float vb3 = input_b[3]; - input_b += 4; - - float vacc0 = va0 - vb0; - float vacc1 = va1 - vb1; - float vacc2 = va2 - vb2; - float vacc3 = va3 - vb3; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - vacc2 = __builtin_wasm_max_f32(vacc2, voutput_min); - vacc3 = __builtin_wasm_max_f32(vacc3, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - vacc2 = __builtin_wasm_min_f32(vacc2, voutput_max); - vacc3 = __builtin_wasm_min_f32(vacc3, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - do { - const float va = *input_a++; - const float vb = *input_b++; - float vacc = va - vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - batch -= sizeof(float); - } while (batch != 0); - } -} diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-wasm-u8.c b/src/f32-vbinary/gen/f32-vsub-minmax-wasm-u8.c deleted file mode 100644 index 
b726fb00ead..00000000000 --- a/src/f32-vbinary/gen/f32-vsub-minmax-wasm-u8.c +++ /dev/null @@ -1,103 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsub_minmax_ukernel__wasm_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - const float va2 = input_a[2]; - const float va3 = input_a[3]; - const float va4 = input_a[4]; - const float va5 = input_a[5]; - const float va6 = input_a[6]; - const float va7 = input_a[7]; - input_a += 8; - - const float vb0 = input_b[0]; - const float vb1 = input_b[1]; - const float vb2 = input_b[2]; - const float vb3 = input_b[3]; - const float vb4 = input_b[4]; - const float vb5 = input_b[5]; - const float vb6 = input_b[6]; - const float vb7 = input_b[7]; - input_b += 8; - - float vacc0 = va0 - vb0; - float vacc1 = va1 - vb1; - float vacc2 = va2 - vb2; - float vacc3 = va3 - vb3; - float vacc4 = va4 - vb4; - float vacc5 = va5 - vb5; - float vacc6 = va6 - vb6; - float vacc7 = va7 - vb7; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - vacc2 = __builtin_wasm_max_f32(vacc2, voutput_min); - vacc3 = __builtin_wasm_max_f32(vacc3, voutput_min); - vacc4 = 
__builtin_wasm_max_f32(vacc4, voutput_min); - vacc5 = __builtin_wasm_max_f32(vacc5, voutput_min); - vacc6 = __builtin_wasm_max_f32(vacc6, voutput_min); - vacc7 = __builtin_wasm_max_f32(vacc7, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - vacc2 = __builtin_wasm_min_f32(vacc2, voutput_max); - vacc3 = __builtin_wasm_min_f32(vacc3, voutput_max); - vacc4 = __builtin_wasm_min_f32(vacc4, voutput_max); - vacc5 = __builtin_wasm_min_f32(vacc5, voutput_max); - vacc6 = __builtin_wasm_min_f32(vacc6, voutput_max); - vacc7 = __builtin_wasm_min_f32(vacc7, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output[4] = vacc4; - output[5] = vacc5; - output[6] = vacc6; - output[7] = vacc7; - output += 8; - } - if XNN_UNLIKELY(batch != 0) { - do { - const float va = *input_a++; - const float vb = *input_b++; - float vacc = va - vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - batch -= sizeof(float); - } while (batch != 0); - } -} diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u16.c b/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u16.c deleted file mode 100644 index fe2a6dad0a4..00000000000 --- a/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u16.c +++ /dev/null @@ -1,103 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - const v128_t va2 = wasm_v128_load(input_a + 8); - const v128_t va3 = wasm_v128_load(input_a + 12); - input_a += 16; - - const v128_t vb0 = wasm_v128_load(input_b); - const v128_t vb1 = wasm_v128_load(input_b + 4); - const v128_t vb2 = wasm_v128_load(input_b + 8); - const v128_t vb3 = wasm_v128_load(input_b + 12); - input_b += 16; - - v128_t vacc0 = wasm_f32x4_sub(va0, vb0); - v128_t vacc1 = wasm_f32x4_sub(va1, vb1); - v128_t vacc2 = wasm_f32x4_sub(va2, vb2); - v128_t vacc3 = wasm_f32x4_sub(va3, vb3); - - vacc0 = wasm_f32x4_max(vacc0, voutput_min); - vacc1 = wasm_f32x4_max(vacc1, voutput_min); - vacc2 = wasm_f32x4_max(vacc2, voutput_min); - vacc3 = wasm_f32x4_max(vacc3, voutput_min); - - vacc0 = wasm_f32x4_min(vacc0, voutput_max); - vacc1 = wasm_f32x4_min(vacc1, voutput_max); - vacc2 = wasm_f32x4_min(vacc2, voutput_max); - vacc3 = wasm_f32x4_min(vacc3, voutput_max); - - wasm_v128_store(output, vacc0); - wasm_v128_store(output + 4, vacc1); - wasm_v128_store(output + 8, vacc2); - wasm_v128_store(output + 12, vacc3); - output += 16; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = 
wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_sub(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_sub(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u4.c b/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u4.c deleted file mode 100644 index b1ac7cf73f3..00000000000 --- a/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u4.c +++ /dev/null @@ -1,69 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_sub(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_sub(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u8.c b/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u8.c deleted file mode 100644 index 6ccfcbffa64..00000000000 --- a/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-u8.c +++ /dev/null @@ -1,91 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - input_a += 8; - - const v128_t vb0 = wasm_v128_load(input_b); - const v128_t vb1 = wasm_v128_load(input_b + 4); - input_b += 8; - - v128_t vacc0 = wasm_f32x4_sub(va0, vb0); - v128_t vacc1 = wasm_f32x4_sub(va1, vb1); - - vacc0 = wasm_f32x4_max(vacc0, voutput_min); - vacc1 = wasm_f32x4_max(vacc1, voutput_min); - - vacc0 = wasm_f32x4_min(vacc0, voutput_max); - vacc1 = wasm_f32x4_min(vacc1, voutput_max); - - wasm_v128_store(output, vacc0); - wasm_v128_store(output + 4, vacc1); - output += 8; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_sub(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const 
v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_sub(va, vb); - - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u16.c b/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u16.c deleted file mode 100644 index a796ec42e4e..00000000000 --- a/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u16.c +++ /dev/null @@ -1,103 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - const v128_t va2 = wasm_v128_load(input_a + 8); - const v128_t va3 = wasm_v128_load(input_a + 12); - input_a += 16; - - const v128_t vb0 = wasm_v128_load(input_b); - const v128_t vb1 = wasm_v128_load(input_b + 4); - const v128_t vb2 = wasm_v128_load(input_b + 8); - const v128_t vb3 = wasm_v128_load(input_b + 12); - input_b += 16; - - v128_t vacc0 = wasm_f32x4_sub(va0, vb0); - v128_t vacc1 = wasm_f32x4_sub(va1, vb1); - v128_t vacc2 = wasm_f32x4_sub(va2, vb2); - v128_t vacc3 = wasm_f32x4_sub(va3, vb3); - - vacc0 = wasm_f32x4_pmax(voutput_min, vacc0); - vacc1 = wasm_f32x4_pmax(voutput_min, vacc1); - vacc2 = wasm_f32x4_pmax(voutput_min, vacc2); - vacc3 = wasm_f32x4_pmax(voutput_min, vacc3); - - vacc0 = wasm_f32x4_pmin(voutput_max, vacc0); - vacc1 = wasm_f32x4_pmin(voutput_max, vacc1); - vacc2 = wasm_f32x4_pmin(voutput_max, vacc2); - vacc3 = wasm_f32x4_pmin(voutput_max, vacc3); - - wasm_v128_store(output, vacc0); - wasm_v128_store(output + 4, vacc1); - wasm_v128_store(output + 8, vacc2); - wasm_v128_store(output + 12, vacc3); - output += 16; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = 
wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_sub(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_sub(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u4.c b/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u4.c deleted file mode 100644 index b34be08ec1a..00000000000 --- a/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u4.c +++ /dev/null @@ -1,69 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_sub(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_sub(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u8.c b/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u8.c deleted file mode 100644 index 70adb4e3936..00000000000 --- a/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-u8.c +++ /dev/null @@ -1,91 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-vbinary/vop-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - input_a += 8; - - const v128_t vb0 = wasm_v128_load(input_b); - const v128_t vb1 = wasm_v128_load(input_b + 4); - input_b += 8; - - v128_t vacc0 = wasm_f32x4_sub(va0, vb0); - v128_t vacc1 = wasm_f32x4_sub(va1, vb1); - - vacc0 = wasm_f32x4_pmax(voutput_min, vacc0); - vacc1 = wasm_f32x4_pmax(voutput_min, vacc1); - - vacc0 = wasm_f32x4_pmin(voutput_max, vacc0); - vacc1 = wasm_f32x4_pmin(voutput_max, vacc1); - - wasm_v128_store(output, vacc0); - wasm_v128_store(output + 4, vacc1); - output += 8; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - const v128_t vb = wasm_v128_load(input_b); - input_b += 4; - - v128_t vacc = wasm_f32x4_sub(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - wasm_v128_store(output, vacc); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - 
const v128_t va = wasm_v128_load(input_a); - const v128_t vb = wasm_v128_load(input_b); - - v128_t vacc = wasm_f32x4_sub(va, vb); - - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vacc, 0); - vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vacc, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-neon-u4.c b/src/f32-vbinary/gen/f32-vsub-neon-u4.c similarity index 75% rename from src/f32-vbinary/gen/f32-vsub-minmax-neon-u4.c rename to src/f32-vbinary/gen/f32-vsub-neon-u4.c index 7d374227711..e00646a38c8 100644 --- a/src/f32-vbinary/gen/f32-vsub-minmax-neon-u4.c +++ b/src/f32-vbinary/gen/f32-vsub-neon-u4.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsub_minmax_ukernel__neon_u4( +void xnn_f32_vsub_ukernel__neon_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,18 +28,12 @@ void xnn_f32_vsub_minmax_ukernel__neon_u4( assert(input_b != NULL); assert(output != NULL); - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; const float32x4_t vb = vld1q_f32(input_b); input_b += 4; float32x4_t vacc = vsubq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -48,9 +42,6 @@ void xnn_f32_vsub_minmax_ukernel__neon_u4( float32x4_t vacc = vsubq_f32(va, vb); - vacc = vmaxq_f32(vacc, 
voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-neon-u8.c b/src/f32-vbinary/gen/f32-vsub-neon-u8.c similarity index 74% rename from src/f32-vbinary/gen/f32-vsub-minmax-neon-u8.c rename to src/f32-vbinary/gen/f32-vsub-neon-u8.c index 427d2dcf8f0..e2078ddaf91 100644 --- a/src/f32-vbinary/gen/f32-vsub-minmax-neon-u8.c +++ b/src/f32-vbinary/gen/f32-vsub-neon-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsub_minmax_ukernel__neon_u8( +void xnn_f32_vsub_ukernel__neon_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,9 +28,6 @@ void xnn_f32_vsub_minmax_ukernel__neon_u8( assert(input_b != NULL); assert(output != NULL); - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb0 = vld1q_f32(input_b); input_b += 4; @@ -41,12 +38,6 @@ void xnn_f32_vsub_minmax_ukernel__neon_u8( float32x4_t vacc1 = vsubq_f32(va1, vb1); - vacc0 = vmaxq_f32(vacc0, voutput_min); - vacc1 = vmaxq_f32(vacc1, voutput_min); - - vacc0 = vminq_f32(vacc0, voutput_max); - vacc1 = vminq_f32(vacc1, voutput_max); - vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } @@ -56,9 +47,6 @@ void xnn_f32_vsub_minmax_ukernel__neon_u8( float32x4_t vacc = vsubq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { 
@@ -67,9 +55,6 @@ void xnn_f32_vsub_minmax_ukernel__neon_u8( float32x4_t vacc = vsubq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-rvv-u4v.c b/src/f32-vbinary/gen/f32-vsub-rvv-u4v.c similarity index 75% rename from src/f32-vbinary/gen/f32-vsub-minmax-rvv-u4v.c rename to src/f32-vbinary/gen/f32-vsub-rvv-u4v.c index 96e3794e482..3296bd8d2af 100644 --- a/src/f32-vbinary/gen/f32-vsub-minmax-rvv-u4v.c +++ b/src/f32-vbinary/gen/f32-vsub-rvv-u4v.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsub_minmax_ukernel__rvv_u4v( +void xnn_f32_vsub_ukernel__rvv_u4v( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vsub_minmax_ukernel__rvv_u4v( assert(input_b != NULL); assert(output != NULL); - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; size_t n = batch >> 2; do { @@ -40,8 +38,6 @@ void xnn_f32_vsub_minmax_ukernel__rvv_u4v( vfloat32m4_t vb = __riscv_vle32_v_f32m4(input_b, vl); input_b += vl; vfloat32m4_t vacc = __riscv_vfsub_vv_f32m4(va, vb, vl); - vacc = __riscv_vfmax_vf_f32m4(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m4(vacc, output_max, vl); __riscv_vse32_v_f32m4(output, vacc, vl); output += vl; } while (n > 0); diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-rvv-u8v.c b/src/f32-vbinary/gen/f32-vsub-rvv-u8v.c similarity index 75% rename from src/f32-vbinary/gen/f32-vsub-minmax-rvv-u8v.c rename to src/f32-vbinary/gen/f32-vsub-rvv-u8v.c index ee62a94d84e..a1d8231fb1c 100644 --- a/src/f32-vbinary/gen/f32-vsub-minmax-rvv-u8v.c +++ 
b/src/f32-vbinary/gen/f32-vsub-rvv-u8v.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsub_minmax_ukernel__rvv_u8v( +void xnn_f32_vsub_ukernel__rvv_u8v( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vsub_minmax_ukernel__rvv_u8v( assert(input_b != NULL); assert(output != NULL); - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; size_t n = batch >> 2; do { @@ -40,8 +38,6 @@ void xnn_f32_vsub_minmax_ukernel__rvv_u8v( vfloat32m8_t vb = __riscv_vle32_v_f32m8(input_b, vl); input_b += vl; vfloat32m8_t vacc = __riscv_vfsub_vv_f32m8(va, vb, vl); - vacc = __riscv_vfmax_vf_f32m8(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m8(vacc, output_max, vl); __riscv_vse32_v_f32m8(output, vacc, vl); output += vl; } while (n > 0); diff --git a/src/f32-vbinary/gen/f32-vsub-scalar-u1.c b/src/f32-vbinary/gen/f32-vsub-scalar-u1.c index cf14e9bc058..24be5bf8503 100644 --- a/src/f32-vbinary/gen/f32-vsub-scalar-u1.c +++ b/src/f32-vbinary/gen/f32-vsub-scalar-u1.c @@ -27,7 +27,6 @@ void xnn_f32_vsub_ukernel__scalar_u1( assert(input_b != NULL); assert(output != NULL); - for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; const float vb = *input_b++; diff --git a/src/f32-vbinary/gen/f32-vsub-scalar-u2.c b/src/f32-vbinary/gen/f32-vsub-scalar-u2.c index c165c1e5e24..890c3a540dc 100644 --- a/src/f32-vbinary/gen/f32-vsub-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vsub-scalar-u2.c @@ -27,7 +27,6 @@ void xnn_f32_vsub_ukernel__scalar_u2( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -41,7 +40,6 @@ void 
xnn_f32_vsub_ukernel__scalar_u2( float vacc1 = va1 - vb1; - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vsub-scalar-u4.c b/src/f32-vbinary/gen/f32-vsub-scalar-u4.c index 3462335d6ec..81d935c5eac 100644 --- a/src/f32-vbinary/gen/f32-vsub-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vsub-scalar-u4.c @@ -27,7 +27,6 @@ void xnn_f32_vsub_ukernel__scalar_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -47,7 +46,6 @@ void xnn_f32_vsub_ukernel__scalar_u4( float vacc3 = va3 - vb3; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vsub-scalar-u8.c b/src/f32-vbinary/gen/f32-vsub-scalar-u8.c index fe33000ade7..2d6c2c10be0 100644 --- a/src/f32-vbinary/gen/f32-vsub-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vsub-scalar-u8.c @@ -27,7 +27,6 @@ void xnn_f32_vsub_ukernel__scalar_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -59,7 +58,6 @@ void xnn_f32_vsub_ukernel__scalar_u8( float vacc7 = va7 - vb7; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-sse-u4.c b/src/f32-vbinary/gen/f32-vsub-sse-u4.c similarity index 72% rename from src/f32-vbinary/gen/f32-vsub-minmax-sse-u4.c rename to src/f32-vbinary/gen/f32-vsub-sse-u4.c index b32ce6da794..54ecee198e3 100644 --- a/src/f32-vbinary/gen/f32-vsub-minmax-sse-u4.c +++ b/src/f32-vbinary/gen/f32-vsub-sse-u4.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsub_minmax_ukernel__sse_u4( +void xnn_f32_vsub_ukernel__sse_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params 
params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,11 +29,6 @@ void xnn_f32_vsub_minmax_ukernel__sse_u4( assert(input_b != NULL); assert(output != NULL); - const __m128 voutput_min = _mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const __m128 va = _mm_loadu_ps(input_a); input_a += 4; @@ -42,8 +37,6 @@ void xnn_f32_vsub_minmax_ukernel__sse_u4( input_b += 4; __m128 vacc = _mm_sub_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -53,8 +46,6 @@ void xnn_f32_vsub_minmax_ukernel__sse_u4( const __m128 vb = _mm_loadu_ps(input_b); __m128 vacc = _mm_sub_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-sse-u8.c b/src/f32-vbinary/gen/f32-vsub-sse-u8.c similarity index 72% rename from src/f32-vbinary/gen/f32-vsub-minmax-sse-u8.c rename to src/f32-vbinary/gen/f32-vsub-sse-u8.c index f2d2cc651a4..4386fedeedf 100644 --- a/src/f32-vbinary/gen/f32-vsub-minmax-sse-u8.c +++ b/src/f32-vbinary/gen/f32-vsub-sse-u8.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsub_minmax_ukernel__sse_u8( +void xnn_f32_vsub_ukernel__sse_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,11 +29,6 @@ void xnn_f32_vsub_minmax_ukernel__sse_u8( assert(input_b != NULL); assert(output != NULL); - const __m128 voutput_min = 
_mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m128 va0 = _mm_loadu_ps(input_a); const __m128 va1 = _mm_loadu_ps(input_a + 4); @@ -47,12 +42,6 @@ void xnn_f32_vsub_minmax_ukernel__sse_u8( __m128 vacc1 = _mm_sub_ps(va1, vb1); - vacc0 = _mm_max_ps(vacc0, voutput_min); - vacc1 = _mm_max_ps(vacc1, voutput_min); - - vacc0 = _mm_min_ps(vacc0, voutput_max); - vacc1 = _mm_min_ps(vacc1, voutput_max); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; @@ -65,8 +54,6 @@ void xnn_f32_vsub_minmax_ukernel__sse_u8( input_b += 4; __m128 vacc = _mm_sub_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -76,8 +63,6 @@ void xnn_f32_vsub_minmax_ukernel__sse_u8( const __m128 vb = _mm_loadu_ps(input_b); __m128 vacc = _mm_sub_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-scalar-u1.c b/src/f32-vbinary/gen/f32-vsub-wasm-u1.c similarity index 72% rename from src/f32-vbinary/gen/f32-vsub-minmax-scalar-u1.c rename to src/f32-vbinary/gen/f32-vsub-wasm-u1.c index 2f1e1a1c0a7..6b419b50811 100644 --- a/src/f32-vbinary/gen/f32-vsub-minmax-scalar-u1.c +++ b/src/f32-vbinary/gen/f32-vsub-wasm-u1.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsub_minmax_ukernel__scalar_u1( +void xnn_f32_vsub_ukernel__wasm_u1( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,15 +27,10 @@ void 
xnn_f32_vsub_minmax_ukernel__scalar_u1( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; const float vb = *input_b++; float vacc = va - vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; } } diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-scalar-u2.c b/src/f32-vbinary/gen/f32-vsub-wasm-u2.c similarity index 70% rename from src/f32-vbinary/gen/f32-vsub-minmax-scalar-u2.c rename to src/f32-vbinary/gen/f32-vsub-wasm-u2.c index 06f57468cea..289cf69f958 100644 --- a/src/f32-vbinary/gen/f32-vsub-minmax-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vsub-wasm-u2.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsub_minmax_ukernel__scalar_u2( +void xnn_f32_vsub_ukernel__wasm_u2( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,9 +27,6 @@ void xnn_f32_vsub_minmax_ukernel__scalar_u2( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -43,12 +40,6 @@ void xnn_f32_vsub_minmax_ukernel__scalar_u2( float vacc1 = va1 - vb1; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - output[0] = vacc0; output[1] = vacc1; output += 2; @@ -58,8 +49,6 @@ void xnn_f32_vsub_minmax_ukernel__scalar_u2( const float va = *input_a; const float vb = *input_b; float vacc = va 
- vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output = vacc; } } diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-scalar-u4.c b/src/f32-vbinary/gen/f32-vsub-wasm-u4.c similarity index 68% rename from src/f32-vbinary/gen/f32-vsub-minmax-scalar-u4.c rename to src/f32-vbinary/gen/f32-vsub-wasm-u4.c index 95e8cfbf8e6..5da9962c4bb 100644 --- a/src/f32-vbinary/gen/f32-vsub-minmax-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vsub-wasm-u4.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsub_minmax_ukernel__scalar_u4( +void xnn_f32_vsub_ukernel__wasm_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,9 +27,6 @@ void xnn_f32_vsub_minmax_ukernel__scalar_u4( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -49,16 +46,6 @@ void xnn_f32_vsub_minmax_ukernel__scalar_u4( float vacc3 = va3 - vb3; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - vacc2 = math_max_f32(vacc2, voutput_min); - vacc3 = math_max_f32(vacc3, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - vacc2 = math_min_f32(vacc2, voutput_max); - vacc3 = math_min_f32(vacc3, voutput_max); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; @@ -70,8 +57,6 @@ void xnn_f32_vsub_minmax_ukernel__scalar_u4( const float va = *input_a++; const float vb = *input_b++; float vacc = va - vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } 
while (batch != 0); diff --git a/src/f32-vbinary/gen/f32-vsub-minmax-scalar-u8.c b/src/f32-vbinary/gen/f32-vsub-wasm-u8.c similarity index 64% rename from src/f32-vbinary/gen/f32-vsub-minmax-scalar-u8.c rename to src/f32-vbinary/gen/f32-vsub-wasm-u8.c index b9c1a442a5b..c9847f800f9 100644 --- a/src/f32-vbinary/gen/f32-vsub-minmax-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vsub-wasm-u8.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsub_minmax_ukernel__scalar_u8( +void xnn_f32_vsub_ukernel__wasm_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,9 +27,6 @@ void xnn_f32_vsub_minmax_ukernel__scalar_u8( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float va0 = input_a[0]; const float va1 = input_a[1]; @@ -61,24 +58,6 @@ void xnn_f32_vsub_minmax_ukernel__scalar_u8( float vacc7 = va7 - vb7; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - vacc2 = math_max_f32(vacc2, voutput_min); - vacc3 = math_max_f32(vacc3, voutput_min); - vacc4 = math_max_f32(vacc4, voutput_min); - vacc5 = math_max_f32(vacc5, voutput_min); - vacc6 = math_max_f32(vacc6, voutput_min); - vacc7 = math_max_f32(vacc7, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - vacc2 = math_min_f32(vacc2, voutput_max); - vacc3 = math_min_f32(vacc3, voutput_max); - vacc4 = math_min_f32(vacc4, voutput_max); - vacc5 = math_min_f32(vacc5, voutput_max); - vacc6 = math_min_f32(vacc6, voutput_max); - vacc7 = math_min_f32(vacc7, voutput_max); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; @@ -94,8 
+73,6 @@ void xnn_f32_vsub_minmax_ukernel__scalar_u8( const float va = *input_a++; const float vb = *input_b++; float vacc = va - vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } while (batch != 0); diff --git a/src/f32-vbinary/gen/f32-vsub-wasmsimd-u16.c b/src/f32-vbinary/gen/f32-vsub-wasmsimd-u16.c index 5458f789ea0..4bdbc4af7d4 100644 --- a/src/f32-vbinary/gen/f32-vsub-wasmsimd-u16.c +++ b/src/f32-vbinary/gen/f32-vsub-wasmsimd-u16.c @@ -28,7 +28,6 @@ void xnn_f32_vsub_ukernel__wasmsimd_u16( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); @@ -63,7 +62,6 @@ void xnn_f32_vsub_ukernel__wasmsimd_u16( v128_t vacc = wasm_f32x4_sub(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -73,7 +71,6 @@ void xnn_f32_vsub_ukernel__wasmsimd_u16( v128_t vacc = wasm_f32x4_sub(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vsub-wasmsimd-u4.c b/src/f32-vbinary/gen/f32-vsub-wasmsimd-u4.c index 29eba865a8a..4de2e638e6e 100644 --- a/src/f32-vbinary/gen/f32-vsub-wasmsimd-u4.c +++ b/src/f32-vbinary/gen/f32-vsub-wasmsimd-u4.c @@ -28,7 +28,6 @@ void xnn_f32_vsub_ukernel__wasmsimd_u4( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; @@ -38,7 +37,6 @@ void xnn_f32_vsub_ukernel__wasmsimd_u4( v128_t vacc = wasm_f32x4_sub(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -48,7 +46,6 @@ void xnn_f32_vsub_ukernel__wasmsimd_u4( v128_t vacc = wasm_f32x4_sub(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git 
a/src/f32-vbinary/gen/f32-vsub-wasmsimd-u8.c b/src/f32-vbinary/gen/f32-vsub-wasmsimd-u8.c index 3658040263f..246c347440e 100644 --- a/src/f32-vbinary/gen/f32-vsub-wasmsimd-u8.c +++ b/src/f32-vbinary/gen/f32-vsub-wasmsimd-u8.c @@ -28,7 +28,6 @@ void xnn_f32_vsub_ukernel__wasmsimd_u8( assert(input_b != NULL); assert(output != NULL); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); @@ -55,7 +54,6 @@ void xnn_f32_vsub_ukernel__wasmsimd_u8( v128_t vacc = wasm_f32x4_sub(va, vb); - wasm_v128_store(output, vacc); output += 4; } @@ -65,7 +63,6 @@ void xnn_f32_vsub_ukernel__wasmsimd_u8( v128_t vacc = wasm_f32x4_sub(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-avx-u8.c b/src/f32-vbinary/gen/f32-vsubc-avx-u16.c similarity index 77% rename from src/f32-vbinary/gen/f32-vsubc-minmax-avx-u8.c rename to src/f32-vbinary/gen/f32-vsubc-avx-u16.c index bd3a4804089..48176c460a7 100644 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-avx-u8.c +++ b/src/f32-vbinary/gen/f32-vsubc-avx-u16.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsubc_minmax_ukernel__avx_u8( +void xnn_f32_vsubc_ukernel__avx_u16( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -30,19 +30,26 @@ void xnn_f32_vsubc_minmax_ukernel__avx_u8( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const 
__m256 vb = _mm256_broadcast_ss(input_b); + for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { + const __m256 va0 = _mm256_loadu_ps(input_a); + const __m256 va1 = _mm256_loadu_ps(input_a + 8); + input_a += 16; + + __m256 vacc0 = _mm256_sub_ps(va0, vb); + __m256 vacc1 = _mm256_sub_ps(va1, vb); + + + _mm256_storeu_ps(output, vacc0); + _mm256_storeu_ps(output + 8, vacc1); + output += 16; + } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m256 va = _mm256_loadu_ps(input_a); input_a += 8; __m256 vacc = _mm256_sub_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); _mm256_storeu_ps(output, vacc); output += 8; } @@ -54,8 +61,6 @@ void xnn_f32_vsubc_minmax_ukernel__avx_u8( __m256 va = _mm256_maskload_ps(input_a, vmask); __m256 vacc = _mm256_sub_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); __m128 vacc_lo = _mm256_castps256_ps128(vacc); if (batch & (4 * sizeof(float))) { diff --git a/src/f32-vbinary/gen/f32-vsubc-avx-u8.c b/src/f32-vbinary/gen/f32-vsubc-avx-u8.c new file mode 100644 index 00000000000..fc953b7d5e4 --- /dev/null +++ b/src/f32-vbinary/gen/f32-vsubc-avx-u8.c @@ -0,0 +1,67 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-vbinary/vopc-avx.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/vbinary.h" + + +void xnn_f32_vsubc_ukernel__avx_u8( + size_t batch, + const float* input_a, + const float* input_b, + float* output, + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + + const __m256 vb = _mm256_broadcast_ss(input_b); + + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + const __m256 va = _mm256_loadu_ps(input_a); + input_a += 8; + + __m256 vacc = _mm256_sub_ps(va, vb); + _mm256_storeu_ps(output, vacc); + output += 8; + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 7 * sizeof(float)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); + + __m256 va = _mm256_maskload_ps(input_a, vmask); + + __m256 vacc = _mm256_sub_ps(va, vb); + + __m128 vacc_lo = _mm256_castps256_ps128(vacc); + if (batch & (4 * sizeof(float))) { + _mm_storeu_ps(output, vacc_lo); + vacc_lo = _mm256_extractf128_ps(vacc, 1); + output += 4; + } + if (batch & (2 * sizeof(float))) { + _mm_storel_pi((__m64*) output, vacc_lo); + vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo); + output += 2; + } + if (batch & (1 * sizeof(float))) { + _mm_store_ss(output, vacc_lo); + } + } +} diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-avx512f-u16.c b/src/f32-vbinary/gen/f32-vsubc-avx512f-u16.c similarity index 75% rename from src/f32-vbinary/gen/f32-vsubc-minmax-avx512f-u16.c rename to src/f32-vbinary/gen/f32-vsubc-avx512f-u16.c index 21903bed36a..e63f627a895 100644 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-avx512f-u16.c +++ b/src/f32-vbinary/gen/f32-vsubc-avx512f-u16.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE 
file in the root directory of this source tree. + #include #include @@ -16,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsubc_minmax_ukernel__avx512f_u16( +void xnn_f32_vsubc_ukernel__avx512f_u16( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,8 +30,6 @@ void xnn_f32_vsubc_minmax_ukernel__avx512f_u16( assert(input_b != NULL); assert(output != NULL); - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); const __m512 vb = _mm512_set1_ps(*input_b); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { @@ -40,10 +39,6 @@ void xnn_f32_vsubc_minmax_ukernel__avx512f_u16( __m512 vacc0 = _mm512_sub_ps(va0, vb); - vacc0 = _mm512_max_ps(voutput_min, vacc0); - - vacc0 = _mm512_min_ps(voutput_max, vacc0); - _mm512_storeu_ps(output, vacc0); output += 16; } @@ -57,8 +52,6 @@ void xnn_f32_vsubc_minmax_ukernel__avx512f_u16( __m512 va = _mm512_maskz_loadu_ps(vmask, input_a); __m512 vacc = _mm512_sub_ps(va, vb); - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-avx512f-u32.c b/src/f32-vbinary/gen/f32-vsubc-avx512f-u32.c similarity index 73% rename from src/f32-vbinary/gen/f32-vsubc-minmax-avx512f-u32.c rename to src/f32-vbinary/gen/f32-vsubc-avx512f-u32.c index 2b694f0751c..5d2a07b0f84 100644 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-avx512f-u32.c +++ b/src/f32-vbinary/gen/f32-vsubc-avx512f-u32.c @@ -7,6 +7,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
+ #include #include @@ -16,12 +17,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsubc_minmax_ukernel__avx512f_u32( +void xnn_f32_vsubc_ukernel__avx512f_u32( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,8 +30,6 @@ void xnn_f32_vsubc_minmax_ukernel__avx512f_u32( assert(input_b != NULL); assert(output != NULL); - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); const __m512 vb = _mm512_set1_ps(*input_b); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { @@ -42,12 +41,6 @@ void xnn_f32_vsubc_minmax_ukernel__avx512f_u32( __m512 vacc1 = _mm512_sub_ps(va1, vb); - vacc0 = _mm512_max_ps(voutput_min, vacc0); - vacc1 = _mm512_max_ps(voutput_min, vacc1); - - vacc0 = _mm512_min_ps(voutput_max, vacc0); - vacc1 = _mm512_min_ps(voutput_max, vacc1); - _mm512_storeu_ps(output, vacc0); _mm512_storeu_ps(output + 16, vacc1); output += 32; @@ -58,9 +51,6 @@ void xnn_f32_vsubc_minmax_ukernel__avx512f_u32( __m512 vacc = _mm512_sub_ps(va, vb); - vacc = _mm512_max_ps(voutput_min, vacc); - vacc = _mm512_min_ps(voutput_max, vacc); - _mm512_storeu_ps(output, vacc); output += 16; } @@ -74,8 +64,6 @@ void xnn_f32_vsubc_minmax_ukernel__avx512f_u32( __m512 va = _mm512_maskz_loadu_ps(vmask, input_a); __m512 vacc = _mm512_sub_ps(va, vb); - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-hvx-u128.c b/src/f32-vbinary/gen/f32-vsubc-hvx-u128.c similarity index 65% rename from src/f32-vbinary/gen/f32-vsubc-minmax-hvx-u128.c rename to src/f32-vbinary/gen/f32-vsubc-hvx-u128.c index 3560bea98f2..25a2d2e4afd 100644 
--- a/src/f32-vbinary/gen/f32-vsubc-minmax-hvx-u128.c +++ b/src/f32-vbinary/gen/f32-vsubc-hvx-u128.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vsubc_minmax_ukernel__hvx_u128( +void xnn_f32_vsubc_ukernel__hvx_u128( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,8 +23,6 @@ void xnn_f32_vsubc_minmax_ukernel__hvx_u128( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); HVX_Vector vb = xnn_set1_f32(*input_b); for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) { @@ -40,16 +38,6 @@ void xnn_f32_vsubc_minmax_ukernel__hvx_u128( HVX_Vector vacc3 = xnn_sub_f32(va3, vb); - vacc0 = xnn_max_f32(vacc0, voutput_min); - vacc1 = xnn_max_f32(vacc1, voutput_min); - vacc2 = xnn_max_f32(vacc2, voutput_min); - vacc3 = xnn_max_f32(vacc3, voutput_min); - - vacc0 = xnn_min_f32(vacc0, voutput_max); - vacc1 = xnn_min_f32(vacc1, voutput_max); - vacc2 = xnn_min_f32(vacc2, voutput_max); - vacc3 = xnn_min_f32(vacc3, voutput_max); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); xnn_storeu_f32(output + 64, vacc2); @@ -61,8 +49,6 @@ void xnn_f32_vsubc_minmax_ukernel__hvx_u128( input_a += 32; HVX_Vector vacc = xnn_sub_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output+= 32; @@ -71,8 +57,6 @@ void xnn_f32_vsubc_minmax_ukernel__hvx_u128( HVX_Vector va = xnn_loadu_f32(input_a); HVX_Vector vacc = xnn_sub_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); Q6_V_vstu_variable(output, batch, vacc); } diff 
--git a/src/f32-vbinary/gen/f32-vsubc-minmax-hvx-u32.c b/src/f32-vbinary/gen/f32-vsubc-hvx-u32.c similarity index 67% rename from src/f32-vbinary/gen/f32-vsubc-minmax-hvx-u32.c rename to src/f32-vbinary/gen/f32-vsubc-hvx-u32.c index 967e397b1e7..c72f0ee8001 100644 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-hvx-u32.c +++ b/src/f32-vbinary/gen/f32-vsubc-hvx-u32.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vsubc_minmax_ukernel__hvx_u32( +void xnn_f32_vsubc_ukernel__hvx_u32( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,8 +23,6 @@ void xnn_f32_vsubc_minmax_ukernel__hvx_u32( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); HVX_Vector vb = xnn_set1_f32(*input_b); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { @@ -32,8 +30,6 @@ void xnn_f32_vsubc_minmax_ukernel__hvx_u32( input_a += 32; HVX_Vector vacc = xnn_sub_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output+= 32; @@ -42,8 +38,6 @@ void xnn_f32_vsubc_minmax_ukernel__hvx_u32( HVX_Vector va = xnn_loadu_f32(input_a); HVX_Vector vacc = xnn_sub_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); Q6_V_vstu_variable(output, batch, vacc); } diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-hvx-u64.c b/src/f32-vbinary/gen/f32-vsubc-hvx-u64.c similarity index 67% rename from src/f32-vbinary/gen/f32-vsubc-minmax-hvx-u64.c rename to src/f32-vbinary/gen/f32-vsubc-hvx-u64.c index 7e4d39387f4..9c3f966d393 100644 --- 
a/src/f32-vbinary/gen/f32-vsubc-minmax-hvx-u64.c +++ b/src/f32-vbinary/gen/f32-vsubc-hvx-u64.c @@ -10,12 +10,12 @@ #include "xnnpack/math.h" #include "xnnpack/vbinary.h" -void xnn_f32_vsubc_minmax_ukernel__hvx_u64( +void xnn_f32_vsubc_ukernel__hvx_u64( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -23,8 +23,6 @@ void xnn_f32_vsubc_minmax_ukernel__hvx_u64( assert(input_b != NULL); assert(output != NULL); - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); HVX_Vector vb = xnn_set1_f32(*input_b); for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { @@ -36,12 +34,6 @@ void xnn_f32_vsubc_minmax_ukernel__hvx_u64( HVX_Vector vacc1 = xnn_sub_f32(va1, vb); - vacc0 = xnn_max_f32(vacc0, voutput_min); - vacc1 = xnn_max_f32(vacc1, voutput_min); - - vacc0 = xnn_min_f32(vacc0, voutput_max); - vacc1 = xnn_min_f32(vacc1, voutput_max); - xnn_storeu_f32(output, vacc0); xnn_storeu_f32(output + 32, vacc1); output += 64; @@ -51,8 +43,6 @@ void xnn_f32_vsubc_minmax_ukernel__hvx_u64( input_a += 32; HVX_Vector vacc = xnn_sub_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output+= 32; @@ -61,8 +51,6 @@ void xnn_f32_vsubc_minmax_ukernel__hvx_u64( HVX_Vector va = xnn_loadu_f32(input_a); HVX_Vector vacc = xnn_sub_f32(va, vb); - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); Q6_V_vstu_variable(output, batch, vacc); } diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-avx-u16.c b/src/f32-vbinary/gen/f32-vsubc-minmax-avx-u16.c deleted file mode 100644 index 9c3bda64167..00000000000 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-avx-u16.c 
+++ /dev/null @@ -1,94 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-avx.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsubc_minmax_ukernel__avx_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const __m256 vb = _mm256_broadcast_ss(input_b); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const __m256 va0 = _mm256_loadu_ps(input_a); - const __m256 va1 = _mm256_loadu_ps(input_a + 8); - input_a += 16; - - __m256 vacc0 = _mm256_sub_ps(va0, vb); - __m256 vacc1 = _mm256_sub_ps(va1, vb); - - - vacc0 = _mm256_max_ps(voutput_min, vacc0); - vacc1 = _mm256_max_ps(voutput_min, vacc1); - - vacc0 = _mm256_min_ps(voutput_max, vacc0); - vacc1 = _mm256_min_ps(voutput_max, vacc1); - - _mm256_storeu_ps(output, vacc0); - _mm256_storeu_ps(output + 8, vacc1); - output += 16; - } - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const __m256 va = _mm256_loadu_ps(input_a); - input_a += 8; - - __m256 vacc = _mm256_sub_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); - _mm256_storeu_ps(output, vacc); - output += 8; - } - if XNN_UNLIKELY(batch != 0) { - 
assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); - - __m256 va = _mm256_maskload_ps(input_a, vmask); - - __m256 vacc = _mm256_sub_ps(va, vb); - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); - - __m128 vacc_lo = _mm256_castps256_ps128(vacc); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vacc_lo); - vacc_lo = _mm256_extractf128_ps(vacc, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vacc_lo); - vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vacc_lo); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u1.c b/src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u1.c deleted file mode 100644 index c1272a78717..00000000000 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u1.c +++ /dev/null @@ -1,41 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsubc_minmax_ukernel__wasm_u1( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= sizeof(float); batch -= sizeof(float)) { - const float va = *input_a++; - float vacc = va - vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - } -} diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u2.c b/src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u2.c deleted file mode 100644 index b13d153e264..00000000000 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u2.c +++ /dev/null @@ -1,61 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsubc_minmax_ukernel__wasm_u2( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - input_a += 2; - - float vacc0 = va0 - vb; - float vacc1 = va1 - vb; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output += 2; - } - if XNN_UNLIKELY(batch != 0) { - assert(batch == sizeof(float)); - const float va = *input_a; - float vacc = va - vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output = vacc; - } -} diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u4.c b/src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u4.c deleted file mode 100644 index 591d8780bc2..00000000000 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u4.c +++ /dev/null @@ -1,73 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsubc_minmax_ukernel__wasm_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - const float va2 = input_a[2]; - const float va3 = input_a[3]; - input_a += 4; - - float vacc0 = va0 - vb; - float vacc1 = va1 - vb; - float vacc2 = va2 - vb; - float vacc3 = va3 - vb; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - vacc2 = __builtin_wasm_max_f32(vacc2, voutput_min); - vacc3 = __builtin_wasm_max_f32(vacc3, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - vacc2 = __builtin_wasm_min_f32(vacc2, voutput_max); - vacc3 = __builtin_wasm_min_f32(vacc3, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - do { - const float va = *input_a++; - float vacc = va - vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - batch -= sizeof(float); - } while (batch != 0); - } -} diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u8.c b/src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u8.c deleted file mode 100644 index c220589a283..00000000000 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-wasm-u8.c +++ /dev/null @@ -1,93 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-vbinary/vopc-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsubc_minmax_ukernel__wasm_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - const float vb = *input_b; - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const float va0 = input_a[0]; - const float va1 = input_a[1]; - const float va2 = input_a[2]; - const float va3 = input_a[3]; - const float va4 = input_a[4]; - const float va5 = input_a[5]; - const float va6 = input_a[6]; - const float va7 = input_a[7]; - input_a += 8; - - float vacc0 = va0 - vb; - float vacc1 = va1 - vb; - float vacc2 = va2 - vb; - float vacc3 = va3 - vb; - float vacc4 = va4 - vb; - float vacc5 = va5 - vb; - float vacc6 = va6 - vb; - float vacc7 = va7 - vb; - - - vacc0 = __builtin_wasm_max_f32(vacc0, voutput_min); - vacc1 = __builtin_wasm_max_f32(vacc1, voutput_min); - vacc2 = __builtin_wasm_max_f32(vacc2, voutput_min); - vacc3 = __builtin_wasm_max_f32(vacc3, voutput_min); - vacc4 = __builtin_wasm_max_f32(vacc4, voutput_min); - vacc5 = __builtin_wasm_max_f32(vacc5, voutput_min); - vacc6 = __builtin_wasm_max_f32(vacc6, voutput_min); - vacc7 = __builtin_wasm_max_f32(vacc7, voutput_min); - - vacc0 = __builtin_wasm_min_f32(vacc0, voutput_max); - vacc1 = __builtin_wasm_min_f32(vacc1, voutput_max); - vacc2 = __builtin_wasm_min_f32(vacc2, voutput_max); - vacc3 = 
__builtin_wasm_min_f32(vacc3, voutput_max); - vacc4 = __builtin_wasm_min_f32(vacc4, voutput_max); - vacc5 = __builtin_wasm_min_f32(vacc5, voutput_max); - vacc6 = __builtin_wasm_min_f32(vacc6, voutput_max); - vacc7 = __builtin_wasm_min_f32(vacc7, voutput_max); - - output[0] = vacc0; - output[1] = vacc1; - output[2] = vacc2; - output[3] = vacc3; - output[4] = vacc4; - output[5] = vacc5; - output[6] = vacc6; - output[7] = vacc7; - output += 8; - } - if XNN_UNLIKELY(batch != 0) { - do { - const float va = *input_a++; - float vacc = va - vb; - vacc = __builtin_wasm_max_f32(vacc, voutput_min); - vacc = __builtin_wasm_min_f32(vacc, voutput_max); - *output++ = vacc; - batch -= sizeof(float); - } while (batch != 0); - } -} diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u16.c b/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u16.c deleted file mode 100644 index 7ae13f61db8..00000000000 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u16.c +++ /dev/null @@ -1,95 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - const v128_t va2 = wasm_v128_load(input_a + 8); - const v128_t va3 = wasm_v128_load(input_a + 12); - input_a += 16; - - v128_t vy0 = wasm_f32x4_sub(va0, vb); - v128_t vy1 = wasm_f32x4_sub(va1, vb); - v128_t vy2 = wasm_f32x4_sub(va2, vb); - v128_t vy3 = wasm_f32x4_sub(va3, vb); - - - vy0 = wasm_f32x4_max(vy0, voutput_min); - vy1 = wasm_f32x4_max(vy1, voutput_min); - vy2 = wasm_f32x4_max(vy2, voutput_min); - vy3 = wasm_f32x4_max(vy3, voutput_min); - - vy0 = wasm_f32x4_min(vy0, voutput_max); - vy1 = wasm_f32x4_min(vy1, voutput_max); - vy2 = wasm_f32x4_min(vy2, voutput_max); - vy3 = wasm_f32x4_min(vy3, voutput_max); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - wasm_v128_store(output + 8, vy2); - wasm_v128_store(output + 12, vy3); - output += 16; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_sub(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - wasm_v128_store(output, vy); - output += 4; - } - if 
XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_sub(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u4.c b/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u4.c deleted file mode 100644 index fe9dcde4d23..00000000000 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u4.c +++ /dev/null @@ -1,66 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_sub(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - wasm_v128_store(output, vy); - 
output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_sub(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u8.c b/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u8.c deleted file mode 100644 index ed26330aca4..00000000000 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-u8.c +++ /dev/null @@ -1,85 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - input_a += 8; - - v128_t vy0 = wasm_f32x4_sub(va0, vb); - v128_t vy1 = wasm_f32x4_sub(va1, vb); - - - vy0 = wasm_f32x4_max(vy0, voutput_min); - vy1 = wasm_f32x4_max(vy1, voutput_min); - - vy0 = wasm_f32x4_min(vy0, voutput_max); - vy1 = wasm_f32x4_min(vy1, voutput_max); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - output += 8; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_sub(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_sub(va, vb); - - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git 
a/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u16.c b/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u16.c deleted file mode 100644 index b324e2a2aba..00000000000 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u16.c +++ /dev/null @@ -1,95 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_u16( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - const v128_t va2 = wasm_v128_load(input_a + 8); - const v128_t va3 = wasm_v128_load(input_a + 12); - input_a += 16; - - v128_t vy0 = wasm_f32x4_sub(va0, vb); - v128_t vy1 = wasm_f32x4_sub(va1, vb); - v128_t vy2 = wasm_f32x4_sub(va2, vb); - v128_t vy3 = wasm_f32x4_sub(va3, vb); - - - vy0 = wasm_f32x4_pmax(voutput_min, vy0); - vy1 = wasm_f32x4_pmax(voutput_min, vy1); - vy2 = wasm_f32x4_pmax(voutput_min, vy2); - vy3 = wasm_f32x4_pmax(voutput_min, vy3); - - vy0 = wasm_f32x4_pmin(voutput_max, vy0); - vy1 = wasm_f32x4_pmin(voutput_max, vy1); - vy2 = 
wasm_f32x4_pmin(voutput_max, vy2); - vy3 = wasm_f32x4_pmin(voutput_max, vy3); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - wasm_v128_store(output + 8, vy2); - wasm_v128_store(output + 12, vy3); - output += 16; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_sub(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_sub(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u4.c b/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u4.c deleted file mode 100644 index a43333c7cb7..00000000000 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u4.c +++ /dev/null @@ -1,66 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_u4( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_sub(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_sub(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u8.c b/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u8.c deleted file mode 100644 index 6841b31f9c2..00000000000 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-u8.c +++ /dev/null @@ -1,85 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-vbinary/vopc-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/vbinary.h" - - -void xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_u8( - size_t batch, - const float* input_a, - const float* input_b, - float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input_a != NULL); - assert(input_b != NULL); - assert(output != NULL); - - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - const v128_t vb = wasm_v128_load32_splat(input_b); - - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const v128_t va0 = wasm_v128_load(input_a); - const v128_t va1 = wasm_v128_load(input_a + 4); - input_a += 8; - - v128_t vy0 = wasm_f32x4_sub(va0, vb); - v128_t vy1 = wasm_f32x4_sub(va1, vb); - - - vy0 = wasm_f32x4_pmax(voutput_min, vy0); - vy1 = wasm_f32x4_pmax(voutput_min, vy1); - - vy0 = wasm_f32x4_pmin(voutput_max, vy0); - vy1 = wasm_f32x4_pmin(voutput_max, vy1); - - wasm_v128_store(output, vy0); - wasm_v128_store(output + 4, vy1); - output += 8; - } - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t va = wasm_v128_load(input_a); - input_a += 4; - - v128_t vy = wasm_f32x4_sub(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - - wasm_v128_store(output, vy); - output += 4; - } - if XNN_UNLIKELY(batch != 0) { - const v128_t va = wasm_v128_load(input_a); - - v128_t vy = wasm_f32x4_sub(va, vb); - - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = 
wasm_f32x4_pmin(voutput_max, vy); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vy, 0); - vy = wasm_v64x2_shuffle(vy, vy, 1, 1); - output += 2; - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vy, 0); - } - } -} diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-neon-u4.c b/src/f32-vbinary/gen/f32-vsubc-neon-u4.c similarity index 74% rename from src/f32-vbinary/gen/f32-vsubc-minmax-neon-u4.c rename to src/f32-vbinary/gen/f32-vsubc-neon-u4.c index b6a39b00b4f..1443fa86bfa 100644 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-neon-u4.c +++ b/src/f32-vbinary/gen/f32-vsubc-neon-u4.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsubc_minmax_ukernel__neon_u4( +void xnn_f32_vsubc_ukernel__neon_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vsubc_minmax_ukernel__neon_u4( assert(input_b != NULL); assert(output != NULL); - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { @@ -37,9 +35,6 @@ void xnn_f32_vsubc_minmax_ukernel__neon_u4( float32x4_t vacc = vsubq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -47,9 +42,6 @@ void xnn_f32_vsubc_minmax_ukernel__neon_u4( float32x4_t vacc = vsubq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git 
a/src/f32-vbinary/gen/f32-vsubc-minmax-neon-u8.c b/src/f32-vbinary/gen/f32-vsubc-neon-u8.c similarity index 72% rename from src/f32-vbinary/gen/f32-vsubc-minmax-neon-u8.c rename to src/f32-vbinary/gen/f32-vsubc-neon-u8.c index 36d2ad91543..80e9055fe74 100644 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-neon-u8.c +++ b/src/f32-vbinary/gen/f32-vsubc-neon-u8.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsubc_minmax_ukernel__neon_u8( +void xnn_f32_vsubc_ukernel__neon_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vsubc_minmax_ukernel__neon_u8( assert(input_b != NULL); assert(output != NULL); - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -40,12 +38,6 @@ void xnn_f32_vsubc_minmax_ukernel__neon_u8( float32x4_t vacc1 = vsubq_f32(va1, vb); - vacc0 = vmaxq_f32(vacc0, voutput_min); - vacc1 = vmaxq_f32(vacc1, voutput_min); - - vacc0 = vminq_f32(vacc0, voutput_max); - vacc1 = vminq_f32(vacc1, voutput_max); - vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } @@ -54,9 +46,6 @@ void xnn_f32_vsubc_minmax_ukernel__neon_u8( float32x4_t vacc = vsubq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -64,9 +53,6 @@ void xnn_f32_vsubc_minmax_ukernel__neon_u8( float32x4_t vacc = vsubq_f32(va, vb); - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * 
sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-rvv-u4v.c b/src/f32-vbinary/gen/f32-vsubc-rvv-u4v.c similarity index 74% rename from src/f32-vbinary/gen/f32-vsubc-minmax-rvv-u4v.c rename to src/f32-vbinary/gen/f32-vsubc-rvv-u4v.c index ebbf1095939..13b87a9d0ed 100644 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-rvv-u4v.c +++ b/src/f32-vbinary/gen/f32-vsubc-rvv-u4v.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsubc_minmax_ukernel__rvv_u4v( +void xnn_f32_vsubc_ukernel__rvv_u4v( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vsubc_minmax_ukernel__rvv_u4v( assert(input_b != NULL); assert(output != NULL); - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; const float b = *input_b; size_t n = batch >> 2; @@ -39,8 +37,6 @@ void xnn_f32_vsubc_minmax_ukernel__rvv_u4v( vfloat32m4_t va = __riscv_vle32_v_f32m4(input_a, vl); input_a += vl; vfloat32m4_t vacc = __riscv_vfsub_vf_f32m4(va, b, vl); - vacc = __riscv_vfmax_vf_f32m4(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m4(vacc, output_max, vl); __riscv_vse32_v_f32m4(output, vacc, vl); output += vl; } while (n > 0); diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-rvv-u8v.c b/src/f32-vbinary/gen/f32-vsubc-rvv-u8v.c similarity index 74% rename from src/f32-vbinary/gen/f32-vsubc-minmax-rvv-u8v.c rename to src/f32-vbinary/gen/f32-vsubc-rvv-u8v.c index a031c562c4a..1cdb0b0891c 100644 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-rvv-u8v.c +++ b/src/f32-vbinary/gen/f32-vsubc-rvv-u8v.c @@ -15,12 +15,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsubc_minmax_ukernel__rvv_u8v( +void xnn_f32_vsubc_ukernel__rvv_u8v( size_t batch, const float* input_a, 
const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -28,8 +28,6 @@ void xnn_f32_vsubc_minmax_ukernel__rvv_u8v( assert(input_b != NULL); assert(output != NULL); - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; const float b = *input_b; size_t n = batch >> 2; @@ -39,8 +37,6 @@ void xnn_f32_vsubc_minmax_ukernel__rvv_u8v( vfloat32m8_t va = __riscv_vle32_v_f32m8(input_a, vl); input_a += vl; vfloat32m8_t vacc = __riscv_vfsub_vf_f32m8(va, b, vl); - vacc = __riscv_vfmax_vf_f32m8(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m8(vacc, output_max, vl); __riscv_vse32_v_f32m8(output, vacc, vl); output += vl; } while (n > 0); diff --git a/src/f32-vbinary/gen/f32-vsubc-scalar-u2.c b/src/f32-vbinary/gen/f32-vsubc-scalar-u2.c index 5b91bf90e31..484b6172826 100644 --- a/src/f32-vbinary/gen/f32-vsubc-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vsubc-scalar-u2.c @@ -38,7 +38,6 @@ void xnn_f32_vsubc_ukernel__scalar_u2( float vacc1 = va1 - vb; - output[0] = vacc0; output[1] = vacc1; output += 2; diff --git a/src/f32-vbinary/gen/f32-vsubc-scalar-u4.c b/src/f32-vbinary/gen/f32-vsubc-scalar-u4.c index 5ccbd868218..362bc8dfddf 100644 --- a/src/f32-vbinary/gen/f32-vsubc-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vsubc-scalar-u4.c @@ -42,7 +42,6 @@ void xnn_f32_vsubc_ukernel__scalar_u4( float vacc3 = va3 - vb; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git a/src/f32-vbinary/gen/f32-vsubc-scalar-u8.c b/src/f32-vbinary/gen/f32-vsubc-scalar-u8.c index 8ccd0017c1f..f75bb7ea86e 100644 --- a/src/f32-vbinary/gen/f32-vsubc-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vsubc-scalar-u8.c @@ -50,7 +50,6 @@ void xnn_f32_vsubc_ukernel__scalar_u8( float vacc7 = va7 - vb; - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; diff --git 
a/src/f32-vbinary/gen/f32-vsubc-minmax-sse-u4.c b/src/f32-vbinary/gen/f32-vsubc-sse-u4.c similarity index 71% rename from src/f32-vbinary/gen/f32-vsubc-minmax-sse-u4.c rename to src/f32-vbinary/gen/f32-vsubc-sse-u4.c index ddad523ed41..794a98b705c 100644 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-sse-u4.c +++ b/src/f32-vbinary/gen/f32-vsubc-sse-u4.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsubc_minmax_ukernel__sse_u4( +void xnn_f32_vsubc_ukernel__sse_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,10 +29,6 @@ void xnn_f32_vsubc_minmax_ukernel__sse_u4( assert(input_b != NULL); assert(output != NULL); - const __m128 voutput_min = _mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const __m128 vb = _mm_load1_ps(input_b); for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { @@ -40,8 +36,6 @@ void xnn_f32_vsubc_minmax_ukernel__sse_u4( input_a += 4; __m128 vacc = _mm_sub_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -50,8 +44,6 @@ void xnn_f32_vsubc_minmax_ukernel__sse_u4( const __m128 va = _mm_loadu_ps(input_a); __m128 vacc = _mm_sub_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); vacc = _mm_movehl_ps(vacc, vacc); diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-sse-u8.c b/src/f32-vbinary/gen/f32-vsubc-sse-u8.c similarity index 70% rename from src/f32-vbinary/gen/f32-vsubc-minmax-sse-u8.c rename to src/f32-vbinary/gen/f32-vsubc-sse-u8.c index 
7b6b890d067..fede4a5f245 100644 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-sse-u8.c +++ b/src/f32-vbinary/gen/f32-vsubc-sse-u8.c @@ -16,12 +16,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsubc_minmax_ukernel__sse_u8( +void xnn_f32_vsubc_ukernel__sse_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -29,10 +29,6 @@ void xnn_f32_vsubc_minmax_ukernel__sse_u8( assert(input_b != NULL); assert(output != NULL); - const __m128 voutput_min = _mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const __m128 vb = _mm_load1_ps(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -44,12 +40,6 @@ void xnn_f32_vsubc_minmax_ukernel__sse_u8( __m128 vacc1 = _mm_sub_ps(va1, vb); - vacc0 = _mm_max_ps(vacc0, voutput_min); - vacc1 = _mm_max_ps(vacc1, voutput_min); - - vacc0 = _mm_min_ps(vacc0, voutput_max); - vacc1 = _mm_min_ps(vacc1, voutput_max); - _mm_storeu_ps(output, vacc0); _mm_storeu_ps(output + 4, vacc1); output += 8; @@ -59,8 +49,6 @@ void xnn_f32_vsubc_minmax_ukernel__sse_u8( input_a += 4; __m128 vacc = _mm_sub_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -69,8 +57,6 @@ void xnn_f32_vsubc_minmax_ukernel__sse_u8( const __m128 va = _mm_loadu_ps(input_a); __m128 vacc = _mm_sub_ps(va, vb); - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); vacc = _mm_movehl_ps(vacc, vacc); diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u1.c b/src/f32-vbinary/gen/f32-vsubc-wasm-u1.c 
similarity index 72% rename from src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u1.c rename to src/f32-vbinary/gen/f32-vsubc-wasm-u1.c index 7730c150acd..37a24c6003b 100644 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u1.c +++ b/src/f32-vbinary/gen/f32-vsubc-wasm-u1.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsubc_minmax_ukernel__scalar_u1( +void xnn_f32_vsubc_ukernel__wasm_u1( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,15 +27,11 @@ void xnn_f32_vsubc_minmax_ukernel__scalar_u1( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; float vacc = va - vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; } } diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u2.c b/src/f32-vbinary/gen/f32-vsubc-wasm-u2.c similarity index 68% rename from src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u2.c rename to src/f32-vbinary/gen/f32-vsubc-wasm-u2.c index 5b9ffb10b56..31a61606d77 100644 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u2.c +++ b/src/f32-vbinary/gen/f32-vsubc-wasm-u2.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsubc_minmax_ukernel__scalar_u2( +void xnn_f32_vsubc_ukernel__wasm_u2( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,8 +27,6 @@ void xnn_f32_vsubc_minmax_ukernel__scalar_u2( 
assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { @@ -40,12 +38,6 @@ void xnn_f32_vsubc_minmax_ukernel__scalar_u2( float vacc1 = va1 - vb; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - output[0] = vacc0; output[1] = vacc1; output += 2; @@ -54,8 +46,6 @@ void xnn_f32_vsubc_minmax_ukernel__scalar_u2( assert(batch == sizeof(float)); const float va = *input_a; float vacc = va - vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output = vacc; } } diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u4.c b/src/f32-vbinary/gen/f32-vsubc-wasm-u4.c similarity index 65% rename from src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u4.c rename to src/f32-vbinary/gen/f32-vsubc-wasm-u4.c index 444b32d9d3b..ed6c9065593 100644 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u4.c +++ b/src/f32-vbinary/gen/f32-vsubc-wasm-u4.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsubc_minmax_ukernel__scalar_u4( +void xnn_f32_vsubc_ukernel__wasm_u4( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,8 +27,6 @@ void xnn_f32_vsubc_minmax_ukernel__scalar_u4( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { @@ -44,16 +42,6 @@ void xnn_f32_vsubc_minmax_ukernel__scalar_u4( float vacc3 = va3 - vb; - vacc0 = 
math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - vacc2 = math_max_f32(vacc2, voutput_min); - vacc3 = math_max_f32(vacc3, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - vacc2 = math_min_f32(vacc2, voutput_max); - vacc3 = math_min_f32(vacc3, voutput_max); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; @@ -64,8 +52,6 @@ void xnn_f32_vsubc_minmax_ukernel__scalar_u4( do { const float va = *input_a++; float vacc = va - vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } while (batch != 0); diff --git a/src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u8.c b/src/f32-vbinary/gen/f32-vsubc-wasm-u8.c similarity index 60% rename from src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u8.c rename to src/f32-vbinary/gen/f32-vsubc-wasm-u8.c index 7b11300baf5..edc4b97d921 100644 --- a/src/f32-vbinary/gen/f32-vsubc-minmax-scalar-u8.c +++ b/src/f32-vbinary/gen/f32-vsubc-wasm-u8.c @@ -14,12 +14,12 @@ #include "xnnpack/vbinary.h" -void xnn_f32_vsubc_minmax_ukernel__scalar_u8( +void xnn_f32_vsubc_ukernel__wasm_u8( size_t batch, const float* input_a, const float* input_b, float* output, - const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -27,8 +27,6 @@ void xnn_f32_vsubc_minmax_ukernel__scalar_u8( assert(input_b != NULL); assert(output != NULL); - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -52,24 +50,6 @@ void xnn_f32_vsubc_minmax_ukernel__scalar_u8( float vacc7 = va7 - vb; - vacc0 = math_max_f32(vacc0, voutput_min); - vacc1 = math_max_f32(vacc1, voutput_min); - vacc2 = math_max_f32(vacc2, voutput_min); - vacc3 = 
math_max_f32(vacc3, voutput_min); - vacc4 = math_max_f32(vacc4, voutput_min); - vacc5 = math_max_f32(vacc5, voutput_min); - vacc6 = math_max_f32(vacc6, voutput_min); - vacc7 = math_max_f32(vacc7, voutput_min); - - vacc0 = math_min_f32(vacc0, voutput_max); - vacc1 = math_min_f32(vacc1, voutput_max); - vacc2 = math_min_f32(vacc2, voutput_max); - vacc3 = math_min_f32(vacc3, voutput_max); - vacc4 = math_min_f32(vacc4, voutput_max); - vacc5 = math_min_f32(vacc5, voutput_max); - vacc6 = math_min_f32(vacc6, voutput_max); - vacc7 = math_min_f32(vacc7, voutput_max); - output[0] = vacc0; output[1] = vacc1; output[2] = vacc2; @@ -84,8 +64,6 @@ void xnn_f32_vsubc_minmax_ukernel__scalar_u8( do { const float va = *input_a++; float vacc = va - vb; - vacc = math_max_f32(vacc, voutput_min); - vacc = math_min_f32(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } while (batch != 0); diff --git a/src/f32-vbinary/gen/f32-vsubc-wasmsimd-u16.c b/src/f32-vbinary/gen/f32-vsubc-wasmsimd-u16.c index 86760a9edb0..4f724744777 100644 --- a/src/f32-vbinary/gen/f32-vsubc-wasmsimd-u16.c +++ b/src/f32-vbinary/gen/f32-vsubc-wasmsimd-u16.c @@ -43,7 +43,6 @@ void xnn_f32_vsubc_ukernel__wasmsimd_u16( v128_t vy3 = wasm_f32x4_sub(va3, vb); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); @@ -56,7 +55,6 @@ void xnn_f32_vsubc_ukernel__wasmsimd_u16( v128_t vy = wasm_f32x4_sub(va, vb); - wasm_v128_store(output, vy); output += 4; } @@ -65,7 +63,6 @@ void xnn_f32_vsubc_ukernel__wasmsimd_u16( v128_t vy = wasm_f32x4_sub(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vsubc-wasmsimd-u4.c b/src/f32-vbinary/gen/f32-vsubc-wasmsimd-u4.c index 2cfad19b4a1..64f003154c1 100644 --- a/src/f32-vbinary/gen/f32-vsubc-wasmsimd-u4.c +++ b/src/f32-vbinary/gen/f32-vsubc-wasmsimd-u4.c @@ -36,7 +36,6 @@ void xnn_f32_vsubc_ukernel__wasmsimd_u4( v128_t vy = 
wasm_f32x4_sub(va, vb); - wasm_v128_store(output, vy); output += 4; } @@ -45,7 +44,6 @@ void xnn_f32_vsubc_ukernel__wasmsimd_u4( v128_t vy = wasm_f32x4_sub(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/gen/f32-vsubc-wasmsimd-u8.c b/src/f32-vbinary/gen/f32-vsubc-wasmsimd-u8.c index d1b633ded3e..c5051eaaacf 100644 --- a/src/f32-vbinary/gen/f32-vsubc-wasmsimd-u8.c +++ b/src/f32-vbinary/gen/f32-vsubc-wasmsimd-u8.c @@ -39,7 +39,6 @@ void xnn_f32_vsubc_ukernel__wasmsimd_u8( v128_t vy1 = wasm_f32x4_sub(va1, vb); - wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); output += 8; @@ -50,7 +49,6 @@ void xnn_f32_vsubc_ukernel__wasmsimd_u8( v128_t vy = wasm_f32x4_sub(va, vb); - wasm_v128_store(output, vy); output += 4; } @@ -59,7 +57,6 @@ void xnn_f32_vsubc_ukernel__wasmsimd_u8( v128_t vy = wasm_f32x4_sub(va, vb); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/f32-vbinary/vop-avx.c.in b/src/f32-vbinary/vop-avx.c.in index 38ea9b0282b..dc531db1a9d 100644 --- a/src/f32-vbinary/vop-avx.c.in +++ b/src/f32-vbinary/vop-avx.c.in @@ -8,7 +8,6 @@ $assert BATCH_TILE >= 8 $SIMD_TILE = BATCH_TILE // 8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $assert OP in ["ADD", "DIV", "MAX", "MIN", "MUL", "SUB", "SQRDIFF", "PRELU"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include #include @@ -27,14 +26,12 @@ $ "SUB": "_mm256_sub_ps", $ "SQRDIFF": "_mm256_sub_ps", $ "PRELU": "_mm256_mul_ps", $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f32_default_params", "MINMAX": "union xnn_f32_minmax_params"}[ACTIVATION] -void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__avx_u${BATCH_TILE}( +void xnn_f32_v${OP.lower()}_ukernel__avx_u${BATCH_TILE}( size_t batch, const float* input_a, const float* input_b, float* output, - const ${PARAMS} params[restrict 
XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -44,12 +41,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__avx_u${BATCH_TILE}( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - $if ACTIVATION == "MINMAX": - const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { const __m256 va${ABC[0]} = _mm256_loadu_ps(input_a); @@ -69,13 +60,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__avx_u${BATCH_TILE}( $for N in range(SIMD_TILE): vacc${ABC[N]} = _mm256_blendv_ps(va${ABC[N]}, vacc${ABC[N]}, va${ABC[N]}); - $if ACTIVATION == "MINMAX": - $for N in range(SIMD_TILE): - vacc${ABC[N]} = _mm256_max_ps(voutput_min, vacc${ABC[N]}); - - $for N in range(SIMD_TILE): - vacc${ABC[N]} = _mm256_min_ps(voutput_max, vacc${ABC[N]}); - _mm256_storeu_ps(output, vacc${ABC[0]}); $for N in range(1, SIMD_TILE): _mm256_storeu_ps(output + ${N * 8}, vacc${ABC[N]}); @@ -91,9 +75,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__avx_u${BATCH_TILE}( vacc = _mm256_mul_ps(vacc, vacc); $elif OP == "PRELU": vacc = _mm256_blendv_ps(va, vacc, va); - $if ACTIVATION == "MINMAX": - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); _mm256_storeu_ps(output, vacc); output += 8; } @@ -110,9 +91,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__avx_u${BATCH_TILE}( vacc = _mm256_mul_ps(vacc, vacc); $elif OP == "PRELU": vacc = _mm256_blendv_ps(va, vacc, va); - $if ACTIVATION == "MINMAX": - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); __m128 vacc_lo = _mm256_castps256_ps128(vacc); if (batch & (4 * sizeof(float))) { diff --git 
a/src/f32-vbinary/vop-avx512f.c.in b/src/f32-vbinary/vop-avx512f.c.in index 7da010bee11..2dd655c38c6 100644 --- a/src/f32-vbinary/vop-avx512f.c.in +++ b/src/f32-vbinary/vop-avx512f.c.in @@ -8,7 +8,6 @@ $assert BATCH_TILE >= 16 $SIMD_TILE = BATCH_TILE // 16 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $assert OP in ["ADD", "DIV", "MAX", "MIN", "MUL", "SUB", "SQRDIFF", "PRELU"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include #include @@ -38,14 +37,12 @@ $ "SUB": "_mm512_maskz_sub_ps", $ "SQRDIFF": "_mm512_maskz_sub_ps", $ "PRELU": "_mm512_maskz_mul_ps", $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f32_default_params", "MINMAX": "union xnn_f32_minmax_params"}[ACTIVATION] -void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__avx512f_u${BATCH_TILE}( +void xnn_f32_v${OP.lower()}_ukernel__avx512f_u${BATCH_TILE}( size_t batch, const float* input_a, const float* input_b, float* output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -53,10 +50,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__avx512f_u${BATCH_TILE}( assert(input_b != NULL); assert(output != NULL); - $if ACTIVATION == "MINMAX": - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - $if OP == "PRELU": const __m512 vzero = _mm512_setzero_ps(); @@ -82,13 +75,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__avx512f_u${BATCH_TILE}( $for N in range(SIMD_TILE): vacc${ABC[N]} = _mm512_mul_ps(vacc${ABC[N]}, vacc${ABC[N]}); - $if ACTIVATION == "MINMAX": - $for N in range(SIMD_TILE): - vacc${ABC[N]} = _mm512_max_ps(voutput_min, vacc${ABC[N]}); - - $for N in range(SIMD_TILE): - vacc${ABC[N]} = _mm512_min_ps(voutput_max, vacc${ABC[N]}); - _mm512_storeu_ps(output, vacc${ABC[0]}); $for N in range(1, SIMD_TILE): _mm512_storeu_ps(output + ${N * 16}, 
vacc${ABC[N]}); @@ -109,10 +95,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__avx512f_u${BATCH_TILE}( $if OP == "SQRDIFF": vacc = _mm512_mul_ps(vacc, vacc); - $if ACTIVATION == "MINMAX": - vacc = _mm512_max_ps(voutput_min, vacc); - vacc = _mm512_min_ps(voutput_max, vacc); - _mm512_storeu_ps(output, vacc); output += 16; } @@ -133,9 +115,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__avx512f_u${BATCH_TILE}( $if OP == "SQRDIFF": vacc = _mm512_maskz_mul_ps(vmask, vacc, vacc); - $if ACTIVATION == "MINMAX": - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/vop-hvx.c.in b/src/f32-vbinary/vop-hvx.c.in index 81e5c968bb2..a4ce398d8d1 100644 --- a/src/f32-vbinary/vop-hvx.c.in +++ b/src/f32-vbinary/vop-hvx.c.in @@ -3,7 +3,6 @@ $assert BATCH_TILE % 32 == 0 $assert BATCH_TILE >= 32 $SIMD_TILE = BATCH_TILE // 32 $assert OP in ["ADD", "MAX", "MIN", "MUL", "SUB", "SQRDIFF"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include #include "xnnpack/simd/f32-hvx.h" @@ -19,14 +18,12 @@ $ "MUL": "xnn_mul_f32", $ "SUB": "xnn_sub_f32", $ "SQRDIFF": "xnn_sub_f32", $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f32_default_params", "MINMAX": "union xnn_f32_minmax_params"}[ACTIVATION] -void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__hvx_u${BATCH_TILE}( +void xnn_f32_v${OP.lower()}_ukernel__hvx_u${BATCH_TILE}( size_t batch, const float* input_a, const float* input_b, float* output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -34,10 +31,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__hvx_u${BATCH_TILE}( assert(input_b != NULL); assert(output != NULL); - $if ACTIVATION == "MINMAX": - const HVX_Vector voutput_min = 
xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); - $if BATCH_TILE > 32: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { HVX_Vector va0 = xnn_loadu_f32(input_a); @@ -56,13 +49,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__hvx_u${BATCH_TILE}( $for N in range(SIMD_TILE): vacc${N} = xnn_mul_f32(vacc${N}, vacc${N}); - $if ACTIVATION == "MINMAX": - $for N in range(SIMD_TILE): - vacc${N} = xnn_max_f32(vacc${N}, voutput_min); - - $for N in range(SIMD_TILE): - vacc${N} = xnn_min_f32(vacc${N}, voutput_max); - xnn_storeu_f32(output, vacc0); $for N in range(32, BATCH_TILE, 32): xnn_storeu_f32(output + ${N}, vacc${int(N/32)}); @@ -77,9 +63,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__hvx_u${BATCH_TILE}( HVX_Vector vacc = ${_HEXAGON_OP_HVX}(va, vb); $if OP == "SQRDIFF": vacc = xnn_mul_f32(vacc, vacc); - $if ACTIVATION == "MINMAX": - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output += 32; @@ -91,10 +74,7 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__hvx_u${BATCH_TILE}( HVX_Vector vacc = ${_HEXAGON_OP_HVX}(va, vb); $if OP == "SQRDIFF": vacc = xnn_mul_f32(vacc, vacc); - $if ACTIVATION == "MINMAX": - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); - + Q6_V_vstu_variable(output, batch, vacc); } } diff --git a/src/f32-vbinary/vop-neon.c.in b/src/f32-vbinary/vop-neon.c.in index d8d07ce424f..1f42e53610d 100644 --- a/src/f32-vbinary/vop-neon.c.in +++ b/src/f32-vbinary/vop-neon.c.in @@ -8,7 +8,6 @@ $assert BATCH_TILE >= 4 $SIMD_TILE = BATCH_TILE // 4 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $assert OP in ["ADD", "DIV", "MAX", "MIN", "MUL", "SUB", "SQRDIFF", "PRELU"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include #include @@ -27,15 +26,13 @@ $ "SUB": "vsubq_f32", $ "SQRDIFF": "vsubq_f32", $ "PRELU": "vmulq_f32", $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": 
"_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f32_default_params", "MINMAX": "union xnn_f32_minmax_params"}[ACTIVATION] $ISA = "aarch64_neon" if OP == "DIV" else "neon" -void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( +void xnn_f32_v${OP.lower()}_ukernel__${ISA}_u${BATCH_TILE}( size_t batch, const float* input_a, const float* input_b, float* output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -43,10 +40,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( assert(input_b != NULL); assert(output != NULL); - $if ACTIVATION == "MINMAX": - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); - $if BATCH_TILE > 4: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { $for N in range(SIMD_TILE): @@ -66,13 +59,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( $for N in range(SIMD_TILE): vacc${ABC[N]} = vbslq_f32(vm${ABC[N]}, vacc${ABC[N]}, va${ABC[N]}); - $if ACTIVATION == "MINMAX": - $for N in range(SIMD_TILE): - vacc${ABC[N]} = vmaxq_f32(vacc${ABC[N]}, voutput_min); - - $for N in range(SIMD_TILE): - vacc${ABC[N]} = vminq_f32(vacc${ABC[N]}, voutput_max); - $for N in range(SIMD_TILE): vst1q_f32(output, vacc${ABC[N]}); output += 4; } @@ -87,10 +73,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(va), vmovq_n_s32(0)); vacc = vbslq_f32(vm, vacc, va); - $if ACTIVATION == "MINMAX": - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -104,10 +86,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( const uint32x4_t vm = 
vcltq_s32(vreinterpretq_s32_f32(va), vmovq_n_s32(0)); vacc = vbslq_f32(vm, vacc, va); - $if ACTIVATION == "MINMAX": - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/vop-rvv.c.in b/src/f32-vbinary/vop-rvv.c.in index 5aac55e7347..4adf3e4332a 100644 --- a/src/f32-vbinary/vop-rvv.c.in +++ b/src/f32-vbinary/vop-rvv.c.in @@ -5,7 +5,6 @@ $assert LMUL in [1, 2, 4, 8] $assert OP in ["ADD", "DIV", "MAX", "MIN", "MUL", "SUB", "SQRDIFF"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include #include @@ -23,14 +22,12 @@ $ "MUL": "__riscv_vfmul_vv_f32", $ "SUB": "__riscv_vfsub_vv_f32", $ "SQRDIFF": "__riscv_vfsub_vv_f32", $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f32_default_params", "MINMAX": "union xnn_f32_minmax_params"}[ACTIVATION] -void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__rvv_u${LMUL}v( +void xnn_f32_v${OP.lower()}_ukernel__rvv_u${LMUL}v( size_t batch, const float* input_a, const float* input_b, float* output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -38,9 +35,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__rvv_u${LMUL}v( assert(input_b != NULL); assert(output != NULL); - $if ACTIVATION == "MINMAX": - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; size_t n = batch >> 2; do { @@ -53,9 +47,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__rvv_u${LMUL}v( vfloat32m${LMUL}_t vacc = ${OP_FUNC}m${LMUL}(va, vb, vl); $if OP == "SQRDIFF": vacc = __riscv_vfmul_vv_f32m${LMUL}(vacc, vacc, vl); - $if ACTIVATION == "MINMAX": - vacc = __riscv_vfmax_vf_f32m${LMUL}(vacc, output_min, vl); - vacc = __riscv_vfmin_vf_f32m${LMUL}(vacc, output_max, vl); 
__riscv_vse32_v_f32m${LMUL}(output, vacc, vl); output += vl; } while (n > 0); diff --git a/src/f32-vbinary/vop-scalar.c.in b/src/f32-vbinary/vop-scalar.c.in index 1f9138ee58f..cf78eb284ae 100644 --- a/src/f32-vbinary/vop-scalar.c.in +++ b/src/f32-vbinary/vop-scalar.c.in @@ -6,7 +6,6 @@ $assert BATCH_TILE >= 1 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $assert OP in ["ADD", "DIV", "MAX", "MIN", "MUL", "SUB", "SQRDIFF", "PRELU"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include #include "xnnpack/common.h" @@ -26,14 +25,12 @@ $ "SUB": lambda x, y: "%s - %s" % (x, y), $ "SQRDIFF": lambda x, y: "%s - %s" % (x, y), $ "PRELU": lambda x, y: "%s * %s" % (x, y), $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f32_default_params", "MINMAX": "union xnn_f32_minmax_params"}[ACTIVATION] -void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__${"wasm" if WASM else "scalar"}_u${BATCH_TILE}( +void xnn_f32_v${OP.lower()}_ukernel__${"wasm" if WASM else "scalar"}_u${BATCH_TILE}( size_t batch, const float* input_a, const float* input_b, float* output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -41,10 +38,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__${"wasm" if WASM else "scalar"}_u$ assert(input_b != NULL); assert(output != NULL); - $if ACTIVATION == "MINMAX": - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; - $if BATCH_TILE == 1: for (; batch >= sizeof(float); batch -= sizeof(float)) { const float va = *input_a++; @@ -54,9 +47,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__${"wasm" if WASM else "scalar"}_u$ vacc = vacc * vacc; $elif OP == "PRELU": vacc = XNN_UNPREDICTABLE(va < 0.0f) ? 
va * vb : va; - $if ACTIVATION == "MINMAX": - vacc = ${MAX_F32}(vacc, voutput_min); - vacc = ${MIN_F32}(vacc, voutput_max); *output++ = vacc; } $else: @@ -79,13 +69,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__${"wasm" if WASM else "scalar"}_u$ $for N in range(BATCH_TILE): vacc${ABC[N]} = XNN_UNPREDICTABLE(va${ABC[N]} < 0.0f) ? va${ABC[N]} * vb${ABC[N]} : va${ABC[N]}; - $if ACTIVATION == "MINMAX": - $for N in range(BATCH_TILE): - vacc${ABC[N]} = ${MAX_F32}(vacc${ABC[N]}, voutput_min); - - $for N in range(BATCH_TILE): - vacc${ABC[N]} = ${MIN_F32}(vacc${ABC[N]}, voutput_max); - $for N in range(BATCH_TILE): output[${N}] = vacc${ABC[N]}; output += ${BATCH_TILE}; @@ -100,9 +83,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__${"wasm" if WASM else "scalar"}_u$ vacc = vacc * vacc; $elif OP == "PRELU": vacc = XNN_UNPREDICTABLE(va < 0.0f) ? vacc : va; - $if ACTIVATION == "MINMAX": - vacc = ${MAX_F32}(vacc, voutput_min); - vacc = ${MIN_F32}(vacc, voutput_max); *output = vacc; $else: do { @@ -113,9 +93,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__${"wasm" if WASM else "scalar"}_u$ vacc = vacc * vacc; $elif OP == "PRELU": vacc = XNN_UNPREDICTABLE(va < 0.0f) ? 
va * vb : va; - $if ACTIVATION == "MINMAX": - vacc = ${MAX_F32}(vacc, voutput_min); - vacc = ${MIN_F32}(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } while (batch != 0); diff --git a/src/f32-vbinary/vop-sse.c.in b/src/f32-vbinary/vop-sse.c.in index a3a5110028e..5b842bd19f7 100644 --- a/src/f32-vbinary/vop-sse.c.in +++ b/src/f32-vbinary/vop-sse.c.in @@ -8,7 +8,6 @@ $assert BATCH_TILE >= 4 $SIMD_TILE = BATCH_TILE // 4 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $assert OP in ["ADD", "DIV", "MAX", "MIN", "MUL", "SUB", "SQRDIFF", "PRELU"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include $SSE_HEADER = {1: "xmmintrin.h", 2: "emmintrin.h", 4: "smmintrin.h"}[SSE] @@ -29,15 +28,13 @@ $ "SUB": "_mm_sub_ps", $ "SQRDIFF": "_mm_sub_ps", $ "PRELU": "_mm_mul_ps", $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f32_default_params", "MINMAX": "union xnn_f32_minmax_params"}[ACTIVATION] $ISA = {1: "sse", 2: "sse2", 4: "sse41"}[SSE] -void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( +void xnn_f32_v${OP.lower()}_ukernel__${ISA}_u${BATCH_TILE}( size_t batch, const float* input_a, const float* input_b, float* output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -45,12 +42,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( assert(input_b != NULL); assert(output != NULL); - $if ACTIVATION == "MINMAX": - const __m128 voutput_min = _mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - $if BATCH_TILE > 4: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { const __m128 va${ABC[0]} = _mm_loadu_ps(input_a); @@ -80,13 +71,6 @@ void 
xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( $elif SSE == 4: vacc${ABC[N]} = _mm_blendv_ps(va${ABC[N]}, vacc${ABC[N]}, va${ABC[N]}); - $if ACTIVATION == "MINMAX": - $for N in range(SIMD_TILE): - vacc${ABC[N]} = _mm_max_ps(vacc${ABC[N]}, voutput_min); - - $for N in range(SIMD_TILE): - vacc${ABC[N]} = _mm_min_ps(vacc${ABC[N]}, voutput_max); - _mm_storeu_ps(output, vacc${ABC[0]}); $for N in range(1, SIMD_TILE): _mm_storeu_ps(output + ${N * 4}, vacc${ABC[N]}); @@ -108,9 +92,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( vacc = _mm_or_ps(_mm_and_ps(vacc, vmask), _mm_andnot_ps(vmask, va)); $elif SSE == 4: vacc = _mm_blendv_ps(va, vacc, va); - $if ACTIVATION == "MINMAX": - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -128,9 +109,6 @@ void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( vacc = _mm_or_ps(_mm_and_ps(vacc, vmask), _mm_andnot_ps(vmask, va)); $elif SSE == 4: vacc = _mm_blendv_ps(va, vacc, va); - $if ACTIVATION == "MINMAX": - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); diff --git a/src/f32-vbinary/vop-wasmsimd.c.in b/src/f32-vbinary/vop-wasmsimd.c.in index 793d71e664a..2905e219504 100644 --- a/src/f32-vbinary/vop-wasmsimd.c.in +++ b/src/f32-vbinary/vop-wasmsimd.c.in @@ -8,7 +8,6 @@ $assert BATCH_TILE >= 4 $SIMD_TILE = BATCH_TILE // 4 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $assert OP in ["ADD", "DIV", "MAX", "MIN", "MUL", "SUB", "SQRDIFF", "PRELU"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include #include @@ -28,16 +27,14 @@ $ "SUB": "wasm_f32x4_sub", $ "SQRDIFF": "wasm_f32x4_sub", $ "PRELU": "wasm_f32x4_mul", $}[OP] -$ARCH_SUFFIX = "" if ACTIVATION in ["LINEAR"] and OP not in ["MIN", "MAX"] else "_x86" if X86 else "_arm" +$ARCH_SUFFIX = "" if OP not in ["MIN", "MAX"] else "_x86" if X86 else "_arm" 
$RELAXED_SUFFIX = "relaxed" if RELAXED else "" -$ACTIVATION_SUFFIX = {"LINEAR": ""}.get(ACTIVATION, "_" + ACTIVATION.lower()) -$PARAMS = {"LINEAR": "struct xnn_f32_default_params", "MINMAX": "union xnn_f32_minmax_params"}[ACTIVATION] -void xnn_f32_v${OP.lower()}${ACTIVATION_SUFFIX}_ukernel__wasm${RELAXED_SUFFIX}simd${ARCH_SUFFIX}_u${BATCH_TILE}( +void xnn_f32_v${OP.lower()}_ukernel__wasm${RELAXED_SUFFIX}simd${ARCH_SUFFIX}_u${BATCH_TILE}( size_t batch, const float* input_a, const float* input_b, float* output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -45,12 +42,6 @@ void xnn_f32_v${OP.lower()}${ACTIVATION_SUFFIX}_ukernel__wasm${RELAXED_SUFFIX}si assert(input_b != NULL); assert(output != NULL); - $if ACTIVATION == "MINMAX": - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); - $if BATCH_TILE > 4: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { const v128_t va${ABC[0]} = wasm_v128_load(input_a); @@ -75,19 +66,6 @@ void xnn_f32_v${OP.lower()}${ACTIVATION_SUFFIX}_ukernel__wasm${RELAXED_SUFFIX}si $for N in range(SIMD_TILE): vacc${ABC[N]} = ${WASM_V32X4_LANESELECT}(vacc${ABC[N]}, va${ABC[N]}, vmask${ABC[N]}); - $if ACTIVATION == "MINMAX": - $if X86: - $for N in range(SIMD_TILE): - vacc${ABC[N]} = wasm_f32x4_pmax(voutput_min, vacc${ABC[N]}); - - $for N in range(SIMD_TILE): - vacc${ABC[N]} = wasm_f32x4_pmin(voutput_max, vacc${ABC[N]}); - $else: - $for N in range(SIMD_TILE): - vacc${ABC[N]} = wasm_f32x4_max(vacc${ABC[N]}, voutput_min); - - $for N in range(SIMD_TILE): - vacc${ABC[N]} = wasm_f32x4_min(vacc${ABC[N]}, voutput_max); wasm_v128_store(output, vacc${ABC[0]}); $for N in range(1, SIMD_TILE): 
@@ -109,14 +87,6 @@ void xnn_f32_v${OP.lower()}${ACTIVATION_SUFFIX}_ukernel__wasm${RELAXED_SUFFIX}si vacc = ${WASM_V32X4_LANESELECT}(vacc, va, vmask); - $if ACTIVATION == "MINMAX": - $if X86: - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - $else: - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - wasm_v128_store(output, vacc); output += 4; } @@ -132,14 +102,6 @@ void xnn_f32_v${OP.lower()}${ACTIVATION_SUFFIX}_ukernel__wasm${RELAXED_SUFFIX}si vacc = ${WASM_V32X4_LANESELECT}(vacc, va, vmask); - $if ACTIVATION == "MINMAX": - $if X86: - vacc = wasm_f32x4_pmax(voutput_min, vacc); - vacc = wasm_f32x4_pmin(voutput_max, vacc); - $else: - vacc = wasm_f32x4_max(vacc, voutput_min); - vacc = wasm_f32x4_min(vacc, voutput_max); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); diff --git a/src/f32-vbinary/vopc-avx.c.in b/src/f32-vbinary/vopc-avx.c.in index 7ab1be9a016..b1b15ae9850 100644 --- a/src/f32-vbinary/vopc-avx.c.in +++ b/src/f32-vbinary/vopc-avx.c.in @@ -8,7 +8,6 @@ $assert BATCH_TILE >= 8 $SIMD_TILE = BATCH_TILE // 8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $assert OP in ["ADD", "DIV", "RDIV", "MAX", "MIN", "MUL", "SUB", "RSUB", "SQRDIFF", "PRELU", "RPRELU"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include #include @@ -30,14 +29,12 @@ $ "SQRDIFF": lambda x: "_mm256_sub_ps(%s, vb)" % x, $ "PRELU": lambda x: "_mm256_mul_ps(%s, vb)" % x, $ "RPRELU": lambda x: "_mm256_mul_ps(%s, vb)" % x, $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f32_default_params", "MINMAX": "union xnn_f32_minmax_params"}[ACTIVATION] -void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__avx_u${BATCH_TILE}( +void xnn_f32_v${OP.lower()}c_ukernel__avx_u${BATCH_TILE}( size_t batch, const float* input_a, const float* input_b, float* output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) + 
const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -47,11 +44,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__avx_u${BATCH_TILE}( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - $if ACTIVATION == "MINMAX": - const __m256 voutput_min = _mm256_set1_ps(params->scalar.min); - const __m256 voutput_max = _mm256_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const __m256 vb = _mm256_broadcast_ss(input_b); $if BATCH_TILE > 8: @@ -74,13 +66,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__avx_u${BATCH_TILE}( $for N in range(SIMD_TILE): vacc${ABC[N]} = _mm256_blendv_ps(vb, vacc${ABC[N]}, vb); - $if ACTIVATION == "MINMAX": - $for N in range(SIMD_TILE): - vacc${ABC[N]} = _mm256_max_ps(voutput_min, vacc${ABC[N]}); - - $for N in range(SIMD_TILE): - vacc${ABC[N]} = _mm256_min_ps(voutput_max, vacc${ABC[N]}); - _mm256_storeu_ps(output, vacc${ABC[0]}); $for N in range(1, SIMD_TILE): _mm256_storeu_ps(output + ${N * 8}, vacc${ABC[N]}); @@ -97,9 +82,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__avx_u${BATCH_TILE}( vacc = _mm256_blendv_ps(va, vacc, va); $elif OP == "RPRELU": vacc = _mm256_blendv_ps(vb, vacc, vb); - $if ACTIVATION == "MINMAX": - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); _mm256_storeu_ps(output, vacc); output += 8; } @@ -117,9 +99,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__avx_u${BATCH_TILE}( vacc = _mm256_blendv_ps(va, vacc, va); $elif OP == "RPRELU": vacc = _mm256_blendv_ps(vb, vacc, vb); - $if ACTIVATION == "MINMAX": - vacc = _mm256_max_ps(voutput_min, vacc); - vacc = _mm256_min_ps(voutput_max, vacc); __m128 vacc_lo = _mm256_castps256_ps128(vacc); if (batch & (4 * sizeof(float))) { diff --git a/src/f32-vbinary/vopc-avx512f.c.in b/src/f32-vbinary/vopc-avx512f.c.in index 90cb38efdea..1d495133054 100644 --- 
a/src/f32-vbinary/vopc-avx512f.c.in +++ b/src/f32-vbinary/vopc-avx512f.c.in @@ -8,7 +8,7 @@ $assert BATCH_TILE >= 16 $SIMD_TILE = BATCH_TILE // 16 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $assert OP in ["ADD", "DIV", "RDIV", "MAX", "MIN", "MUL", "SUB", "RSUB", "SQRDIFF", "PRELU", "RPRELU"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] + #include #include @@ -44,14 +44,12 @@ $ "SQRDIFF": lambda m, x: "_mm512_maskz_sub_ps(%s, %s, vb)" % (m, x), $ "PRELU": lambda m, x: "_mm512_maskz_mul_ps(%s, %s, vb)" % (m, x), $ "RPRELU": lambda m, x: "_mm512_maskz_mul_ps(%s, %s, vb)" % (m, x), $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f32_default_params", "MINMAX": "union xnn_f32_minmax_params"}[ACTIVATION] -void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__avx512f_u${BATCH_TILE}( +void xnn_f32_v${OP.lower()}c_ukernel__avx512f_u${BATCH_TILE}( size_t batch, const float* input_a, const float* input_b, float* output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -59,9 +57,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__avx512f_u${BATCH_TILE}( assert(input_b != NULL); assert(output != NULL); - $if ACTIVATION == "MINMAX": - const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); - const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); const __m512 vb = _mm512_set1_ps(*input_b); $if OP == "PRELU": const __m512 vzero = _mm512_setzero_ps(); @@ -90,13 +85,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__avx512f_u${BATCH_TILE}( $for N in range(SIMD_TILE): vacc${ABC[N]} = _mm512_mul_ps(vacc${ABC[N]}, vacc${ABC[N]}); - $if ACTIVATION == "MINMAX": - $for N in range(SIMD_TILE): - vacc${ABC[N]} = _mm512_max_ps(voutput_min, vacc${ABC[N]}); - - $for N in range(SIMD_TILE): - vacc${ABC[N]} = _mm512_min_ps(voutput_max, vacc${ABC[N]}); - _mm512_storeu_ps(output, vacc${ABC[0]}); 
$for N in range(1, SIMD_TILE): _mm512_storeu_ps(output + ${N * 16}, vacc${ABC[N]}); @@ -117,10 +105,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__avx512f_u${BATCH_TILE}( $if OP == "SQRDIFF": vacc = _mm512_mul_ps(vacc, vacc); - $if ACTIVATION == "MINMAX": - vacc = _mm512_max_ps(voutput_min, vacc); - vacc = _mm512_min_ps(voutput_max, vacc); - _mm512_storeu_ps(output, vacc); output += 16; } @@ -142,9 +126,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__avx512f_u${BATCH_TILE}( $if OP == "SQRDIFF": vacc = _mm512_mul_ps(vacc, vacc); - $if ACTIVATION == "MINMAX": - vacc = _mm512_maskz_max_ps(vmask, voutput_min, vacc); - vacc = _mm512_maskz_min_ps(vmask, voutput_max, vacc); _mm512_mask_storeu_ps(output, vmask, vacc); } } diff --git a/src/f32-vbinary/vopc-hvx.c.in b/src/f32-vbinary/vopc-hvx.c.in index 85f15638804..c970d0591aa 100644 --- a/src/f32-vbinary/vopc-hvx.c.in +++ b/src/f32-vbinary/vopc-hvx.c.in @@ -3,7 +3,6 @@ $assert BATCH_TILE % 32 == 0 $assert BATCH_TILE >= 32 $SIMD_TILE = BATCH_TILE // 32 $assert OP in ["ADD", "MAX", "MIN", "MUL", "SUB", "RSUB", "SQRDIFF"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include #include "xnnpack/simd/f32-hvx.h" @@ -20,14 +19,12 @@ $ "SUB": lambda x: "xnn_sub_f32(%s, vb)" % x, $ "RSUB": lambda x: "xnn_sub_f32(vb, %s)" % x, $ "SQRDIFF": lambda x: "xnn_sub_f32(%s, vb)" % x, $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f32_default_params", "MINMAX": "union xnn_f32_minmax_params"}[ACTIVATION] -void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__hvx_u${BATCH_TILE}( +void xnn_f32_v${OP.lower()}c_ukernel__hvx_u${BATCH_TILE}( size_t batch, const float* input_a, const float* input_b, float* output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -35,9 +32,6 @@ void 
xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__hvx_u${BATCH_TILE}( assert(input_b != NULL); assert(output != NULL); - $if ACTIVATION == "MINMAX": - const HVX_Vector voutput_min = xnn_set1_f32(params->scalar.min); - const HVX_Vector voutput_max = xnn_set1_f32(params->scalar.max); HVX_Vector vb = xnn_set1_f32(*input_b); $if BATCH_TILE > 32: @@ -54,13 +48,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__hvx_u${BATCH_TILE}( $for N in range(SIMD_TILE): vacc${N} = xnn_mul_f32(vacc${N}, vacc${N}); - $if ACTIVATION == "MINMAX": - $for N in range(SIMD_TILE): - vacc${N} = xnn_max_f32(vacc${N}, voutput_min); - - $for N in range(SIMD_TILE): - vacc${N} = xnn_min_f32(vacc${N}, voutput_max); - xnn_storeu_f32(output, vacc0); $for N in range(32, BATCH_TILE, 32): xnn_storeu_f32(output + ${N}, vacc${int(N/32)}); @@ -73,9 +60,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__hvx_u${BATCH_TILE}( HVX_Vector vacc = ${_HEXAGON_OP_HVX("va")}; $if OP == "SQRDIFF": vacc = xnn_mul_f32(vacc, vacc); - $if ACTIVATION == "MINMAX": - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); xnn_storeu_f32(output, vacc); output+= 32; @@ -86,9 +70,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__hvx_u${BATCH_TILE}( HVX_Vector vacc = ${_HEXAGON_OP_HVX("va")}; $if OP == "SQRDIFF": vacc = xnn_mul_f32(vacc, vacc); - $if ACTIVATION == "MINMAX": - vacc = xnn_max_f32(vacc, voutput_min); - vacc = xnn_min_f32(vacc, voutput_max); Q6_V_vstu_variable(output, batch, vacc); } diff --git a/src/f32-vbinary/vopc-neon.c.in b/src/f32-vbinary/vopc-neon.c.in index 8546a0a8773..0342a21ca4e 100644 --- a/src/f32-vbinary/vopc-neon.c.in +++ b/src/f32-vbinary/vopc-neon.c.in @@ -8,7 +8,6 @@ $assert BATCH_TILE >= 4 $SIMD_TILE = BATCH_TILE // 4 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $assert OP in ["ADD", "DIV", "RDIV", "MAX", "MIN", "MUL", "SUB", "RSUB", "SQRDIFF", "PRELU", "RPRELU"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include #include @@ -30,15 +29,13 @@ $ "SQRDIFF": lambda x: 
"vsubq_f32(%s, vb)" % x, $ "PRELU": lambda x: "vmulq_f32(%s, vb)" % x, $ "RPRELU": lambda x: "vmulq_f32(%s, vb)" % x, $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f32_default_params", "MINMAX": "union xnn_f32_minmax_params"}[ACTIVATION] $ISA = "aarch64_neon" if OP in ["DIV", "RDIV"] else "neon" -void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( +void xnn_f32_v${OP.lower()}c_ukernel__${ISA}_u${BATCH_TILE}( size_t batch, const float* input_a, const float* input_b, float* output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -46,9 +43,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( assert(input_b != NULL); assert(output != NULL); - $if ACTIVATION == "MINMAX": - const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); - const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); $if OP == "RPRELU": @@ -74,13 +68,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( $for N in range(SIMD_TILE): vacc${ABC[N]} = vbslq_f32(vm, vacc${ABC[N]}, vb); - $if ACTIVATION == "MINMAX": - $for N in range(SIMD_TILE): - vacc${ABC[N]} = vmaxq_f32(vacc${ABC[N]}, voutput_min); - - $for N in range(SIMD_TILE): - vacc${ABC[N]} = vminq_f32(vacc${ABC[N]}, voutput_max); - $for N in range(SIMD_TILE): vst1q_f32(output, vacc${ABC[N]}); output += 4; } @@ -96,10 +83,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( $elif OP == "RPRELU": vacc = vbslq_f32(vm, vacc, vb); - $if ACTIVATION == "MINMAX": - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { @@ -114,10 +97,6 @@ void 
xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( $elif OP == "RPRELU": vacc = vbslq_f32(vm, vacc, vb); - $if ACTIVATION == "MINMAX": - vacc = vmaxq_f32(vacc, voutput_min); - vacc = vminq_f32(vacc, voutput_max); - float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; diff --git a/src/f32-vbinary/vopc-rvv.c.in b/src/f32-vbinary/vopc-rvv.c.in index 672469c5c87..32d8ae0b84b 100644 --- a/src/f32-vbinary/vopc-rvv.c.in +++ b/src/f32-vbinary/vopc-rvv.c.in @@ -5,7 +5,6 @@ $assert LMUL in [1, 2, 4, 8] $assert OP in ["ADD", "DIV", "RDIV", "MAX", "MIN", "MUL", "SUB", "RSUB", "SQRDIFF"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include #include @@ -25,14 +24,12 @@ $ "SUB": "__riscv_vfsub_vf_f32", $ "RSUB": "__riscv_vfrsub_vf_f32", $ "SQRDIFF": "__riscv_vfsub_vf_f32", $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f32_default_params", "MINMAX": "union xnn_f32_minmax_params"}[ACTIVATION] -void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__rvv_u${LMUL}v( +void xnn_f32_v${OP.lower()}c_ukernel__rvv_u${LMUL}v( size_t batch, const float* input_a, const float* input_b, float* output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -40,9 +37,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__rvv_u${LMUL}v( assert(input_b != NULL); assert(output != NULL); - $if ACTIVATION == "MINMAX": - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; const float b = *input_b; size_t n = batch >> 2; @@ -54,9 +48,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__rvv_u${LMUL}v( vfloat32m${LMUL}_t vacc = ${OP_FUNC}m${LMUL}(va, b, vl); $if OP == "SQRDIFF": vacc = __riscv_vfmul_vv_f32m${LMUL}(vacc, vacc, vl); - $if ACTIVATION == "MINMAX": - vacc = __riscv_vfmax_vf_f32m${LMUL}(vacc, output_min, 
vl); - vacc = __riscv_vfmin_vf_f32m${LMUL}(vacc, output_max, vl); __riscv_vse32_v_f32m${LMUL}(output, vacc, vl); output += vl; } while (n > 0); diff --git a/src/f32-vbinary/vopc-scalar.c.in b/src/f32-vbinary/vopc-scalar.c.in index 487ecf987ec..87c61fea5fd 100644 --- a/src/f32-vbinary/vopc-scalar.c.in +++ b/src/f32-vbinary/vopc-scalar.c.in @@ -6,7 +6,6 @@ $assert BATCH_TILE >= 1 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $assert OP in ["ADD", "DIV", "RDIV", "MAX", "MIN", "MUL", "SUB", "RSUB", "SQRDIFF", "PRELU", "RPRELU"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include #include "xnnpack/common.h" @@ -29,14 +28,12 @@ $ "SQRDIFF": lambda x: "%s - vb" % x, $ "PRELU": lambda x: "%s * vb" % x, $ "RPRELU": lambda x: "%s * vb" % x, $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f32_default_params", "MINMAX": "union xnn_f32_minmax_params"}[ACTIVATION] -void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__${"wasm" if WASM else "scalar"}_u${BATCH_TILE}( +void xnn_f32_v${OP.lower()}c_ukernel__${"wasm" if WASM else "scalar"}_u${BATCH_TILE}( size_t batch, const float* input_a, const float* input_b, float* output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -44,9 +41,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__${"wasm" if WASM else "scalar"}_u assert(input_b != NULL); assert(output != NULL); - $if ACTIVATION == "MINMAX": - const float voutput_min = params->scalar.min; - const float voutput_max = params->scalar.max; const float vb = *input_b; $if BATCH_TILE == 1: @@ -59,9 +53,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__${"wasm" if WASM else "scalar"}_u vacc = XNN_UNPREDICTABLE(va < 0.0f) ? vacc : va; $elif OP == "RPRELU": vacc = (vb < 0.0f) ? 
vacc : vb; - $if ACTIVATION == "MINMAX": - vacc = ${MAX_F32}(vacc, voutput_min); - vacc = ${MIN_F32}(vacc, voutput_max); *output++ = vacc; } $else: @@ -83,13 +74,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__${"wasm" if WASM else "scalar"}_u $for N in range(BATCH_TILE): vacc${ABC[N]} = vb < 0.0f ? vacc${ABC[N]} : vb; - $if ACTIVATION == "MINMAX": - $for N in range(BATCH_TILE): - vacc${ABC[N]} = ${MAX_F32}(vacc${ABC[N]}, voutput_min); - - $for N in range(BATCH_TILE): - vacc${ABC[N]} = ${MIN_F32}(vacc${ABC[N]}, voutput_max); - $for N in range(BATCH_TILE): output[${N}] = vacc${ABC[N]}; output += ${BATCH_TILE}; @@ -105,9 +89,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__${"wasm" if WASM else "scalar"}_u vacc = XNN_UNPREDICTABLE(va < 0.0f) ? vacc : va; $elif OP == "RPRELU": vacc = XNN_UNPREDICTABLE(vb < 0.0f) ? vacc : vb; - $if ACTIVATION == "MINMAX": - vacc = ${MAX_F32}(vacc, voutput_min); - vacc = ${MIN_F32}(vacc, voutput_max); *output = vacc; $else: do { @@ -119,9 +100,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__${"wasm" if WASM else "scalar"}_u vacc = XNN_UNPREDICTABLE(va < 0.0f) ? vacc : va; $elif OP == "RPRELU": vacc = XNN_UNPREDICTABLE(vb < 0.0f) ? 
vacc : vb; - $if ACTIVATION == "MINMAX": - vacc = ${MAX_F32}(vacc, voutput_min); - vacc = ${MIN_F32}(vacc, voutput_max); *output++ = vacc; batch -= sizeof(float); } while (batch != 0); diff --git a/src/f32-vbinary/vopc-sse.c.in b/src/f32-vbinary/vopc-sse.c.in index 4db8f6e5ba9..aa6275a9d04 100644 --- a/src/f32-vbinary/vopc-sse.c.in +++ b/src/f32-vbinary/vopc-sse.c.in @@ -8,7 +8,6 @@ $assert BATCH_TILE >= 4 $SIMD_TILE = BATCH_TILE // 4 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $assert OP in ["ADD", "DIV", "RDIV", "MAX", "MIN", "MUL", "SUB", "RSUB", "SQRDIFF", "PRELU", "RPRELU"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include $SSE_HEADER = {1: "xmmintrin.h", 2: "emmintrin.h", 4: "smmintrin.h"}[SSE] @@ -32,15 +31,13 @@ $ "SQRDIFF": lambda x: "_mm_sub_ps(%s, vb)" % x, $ "PRELU": lambda x: "_mm_mul_ps(%s, vb)" % x, $ "RPRELU": lambda x: "_mm_mul_ps(%s, vb)" % x, $}[OP] -$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] -$PARAMS = {"LINEAR": "struct xnn_f32_default_params", "MINMAX": "union xnn_f32_minmax_params"}[ACTIVATION] $ISA = {1: "sse", 2: "sse2", 4: "sse41"}[SSE] -void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( +void xnn_f32_v${OP.lower()}c_ukernel__${ISA}_u${BATCH_TILE}( size_t batch, const float* input_a, const float* input_b, float* output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -48,11 +45,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( assert(input_b != NULL); assert(output != NULL); - $if ACTIVATION == "MINMAX": - const __m128 voutput_min = _mm_set1_ps(params->scalar.min); - const __m128 voutput_max = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const __m128 vb = _mm_load1_ps(input_b); $if OP == "RPRELU": @@ -88,13 +80,6 @@ void 
xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( $elif SSE == 4: vacc${ABC[N]} = _mm_blendv_ps(vb, vacc${ABC[N]}, vb); - $if ACTIVATION == "MINMAX": - $for N in range(SIMD_TILE): - vacc${ABC[N]} = _mm_max_ps(vacc${ABC[N]}, voutput_min); - - $for N in range(SIMD_TILE): - vacc${ABC[N]} = _mm_min_ps(vacc${ABC[N]}, voutput_max); - _mm_storeu_ps(output, vacc${ABC[0]}); $for N in range(1, SIMD_TILE): _mm_storeu_ps(output + ${N * 4}, vacc${ABC[N]}); @@ -118,9 +103,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( vacc = _mm_or_ps(_mm_and_ps(vacc, vmask), _mm_andnot_ps(vmask, vb)); $elif SSE == 4: vacc = _mm_blendv_ps(vb, vacc, vb); - $if ACTIVATION == "MINMAX": - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); _mm_storeu_ps(output, vacc); output += 4; @@ -142,9 +124,6 @@ void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__${ISA}_u${BATCH_TILE}( vacc = _mm_or_ps(_mm_and_ps(vacc, vmask), _mm_andnot_ps(vmask, vb)); $elif SSE == 4: vacc = _mm_blendv_ps(vb, vacc, vb); - $if ACTIVATION == "MINMAX": - vacc = _mm_max_ps(vacc, voutput_min); - vacc = _mm_min_ps(vacc, voutput_max); if (batch & (2 * sizeof(float))) { _mm_storel_pi((__m64*) output, vacc); vacc = _mm_movehl_ps(vacc, vacc); diff --git a/src/f32-vbinary/vopc-wasmsimd.c.in b/src/f32-vbinary/vopc-wasmsimd.c.in index 61589b9655e..ef3ef181126 100644 --- a/src/f32-vbinary/vopc-wasmsimd.c.in +++ b/src/f32-vbinary/vopc-wasmsimd.c.in @@ -8,7 +8,6 @@ $assert BATCH_TILE >= 4 $SIMD_TILE = BATCH_TILE // 4 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $assert OP in ["ADD", "DIV", "RDIV", "MAX", "MIN", "MUL", "SUB", "RSUB", "SQRDIFF", "PRELU", "RPRELU"] -$assert ACTIVATION in ["LINEAR", "MINMAX"] #include #include @@ -31,17 +30,14 @@ $ "SQRDIFF": lambda x: "wasm_f32x4_sub(%s, vb)" % x, $ "PRELU": lambda x: "wasm_f32x4_mul(%s, vb)" % x, $ "RPRELU": lambda x: "wasm_f32x4_mul(%s, vb)" % x, $}[OP] -$assert ACTIVATION in ["LINEAR", "MINMAX"] -$ARCH_SUFFIX = "" if ACTIVATION 
in ["LINEAR"] and OP not in ["MIN", "MAX"] else "_x86" if X86 else "_arm" +$ARCH_SUFFIX = "" if OP not in ["MIN", "MAX"] else "_x86" if X86 else "_arm" $RELAXED_SUFFIX = "relaxed" if RELAXED else "" -$ACTIVATION_SUFFIX = {"LINEAR": ""}.get(ACTIVATION, "_" + ACTIVATION.lower()) -$PARAMS = {"LINEAR": "struct xnn_f32_default_params", "MINMAX": "union xnn_f32_minmax_params"}[ACTIVATION] -void xnn_f32_v${OP.lower()}c${ACTIVATION_SUFFIX}_ukernel__wasm${RELAXED_SUFFIX}simd${ARCH_SUFFIX}_u${BATCH_TILE}( +void xnn_f32_v${OP.lower()}c_ukernel__wasm${RELAXED_SUFFIX}simd${ARCH_SUFFIX}_u${BATCH_TILE}( size_t batch, const float* input_a, const float* input_b, float* output, - const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); @@ -49,11 +45,6 @@ void xnn_f32_v${OP.lower()}c${ACTIVATION_SUFFIX}_ukernel__wasm${RELAXED_SUFFIX}s assert(input_b != NULL); assert(output != NULL); - $if ACTIVATION == "MINMAX": - const v128_t voutput_min = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t voutput_max = wasm_v128_load32_splat(¶ms->scalar.max); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); const v128_t vb = wasm_v128_load32_splat(input_b); $if OP == "RPRELU": @@ -81,20 +72,6 @@ void xnn_f32_v${OP.lower()}c${ACTIVATION_SUFFIX}_ukernel__wasm${RELAXED_SUFFIX}s $for N in range(SIMD_TILE): vy${ABC[N]} = ${WASM_V32X4_LANESELECT}(vy${ABC[N]}, vb, vmask); - $if ACTIVATION == "MINMAX": - $if X86: - $for N in range(SIMD_TILE): - vy${ABC[N]} = wasm_f32x4_pmax(voutput_min, vy${ABC[N]}); - - $for N in range(SIMD_TILE): - vy${ABC[N]} = wasm_f32x4_pmin(voutput_max, vy${ABC[N]}); - $else: - $for N in range(SIMD_TILE): - vy${ABC[N]} = wasm_f32x4_max(vy${ABC[N]}, voutput_min); - - $for N in range(SIMD_TILE): - vy${ABC[N]} = wasm_f32x4_min(vy${ABC[N]}, voutput_max); - wasm_v128_store(output, vy${ABC[0]}); $for N 
in range(1, SIMD_TILE): wasm_v128_store(output + ${N * 4}, vy${ABC[N]}); @@ -113,14 +90,6 @@ void xnn_f32_v${OP.lower()}c${ACTIVATION_SUFFIX}_ukernel__wasm${RELAXED_SUFFIX}s $elif OP == "RPRELU": vy = ${WASM_V32X4_LANESELECT}(vy, vb, vmask); - $if ACTIVATION == "MINMAX": - $if X86: - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - $else: - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - wasm_v128_store(output, vy); output += 4; } @@ -136,14 +105,6 @@ void xnn_f32_v${OP.lower()}c${ACTIVATION_SUFFIX}_ukernel__wasm${RELAXED_SUFFIX}s $elif OP == "RPRELU": vy = ${WASM_V32X4_LANESELECT}(vy, vb, vmask); - $if ACTIVATION == "MINMAX": - $if X86: - vy = wasm_f32x4_pmax(voutput_min, vy); - vy = wasm_f32x4_pmin(voutput_max, vy); - $else: - vy = wasm_f32x4_max(vy, voutput_min); - vy = wasm_f32x4_min(vy, voutput_max); - if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); diff --git a/src/operators/binary-elementwise-nd.c b/src/operators/binary-elementwise-nd.c index f3ef2ef6913..99f71611b08 100644 --- a/src/operators/binary-elementwise-nd.c +++ b/src/operators/binary-elementwise-nd.c @@ -146,26 +146,6 @@ static const struct xnn_binary_elementwise_config* init_config( } } -static bool can_use_subconfig( - const struct xnn_binary_elementwise_subconfig* subconfig) { - return subconfig->op_ukernel != NULL; -} - -static const struct xnn_binary_elementwise_subconfig* init_subconfig( - const struct xnn_binary_elementwise_config* config, - enum xnn_binary_operator type) { - // We can use either a minmax or a linear config. 
- if (can_use_subconfig(&config->minmax)) { - return &config->minmax; - } - if (can_use_subconfig(&config->linear)) { - return &config->linear; - } - xnn_log_error("failed to create %s operator", - xnn_binary_operator_to_string(type)); - return NULL; -} - static enum xnn_status init_binary_elementwise_nd( xnn_operator_t op, enum xnn_binary_operator type, enum xnn_datatype datatype, @@ -188,14 +168,6 @@ static enum xnn_status init_binary_elementwise_nd( return xnn_status_unsupported_hardware; } - const struct xnn_binary_elementwise_subconfig* subconfig = - init_subconfig(config, type); - if (subconfig == NULL || !can_use_subconfig(subconfig)) { - xnn_log_error("failed to create %s operator", - xnn_binary_operator_to_string(type)); - return xnn_status_unsupported_parameter; - } - union xnn_binary_uparams uparams; union xnn_binary_uparams uparams2; if (config->init != NULL) { @@ -254,7 +226,7 @@ static enum xnn_status init_binary_elementwise_nd( memcpy(&op->params, &uparams, sizeof(uparams)); memcpy(&op->params2, &uparams2, sizeof(uparams2)); - op->binary_elementwise_subconfig = subconfig; + op->binary_elementwise_config = config; op->log2_elementwise_element_size = xnn_datatype_get_log2_element_size(datatype); @@ -418,17 +390,17 @@ enum xnn_status xnn_reshape_binary_elementwise_nd(xnn_operator_t op, if (compressed_input1_shape[0] == 1) { op->context.elementwise_binary.flip_a_b = true; op->context.elementwise_binary.ukernel = - op->binary_elementwise_subconfig->ropc_ukernel; + op->binary_elementwise_config->ropc_ukernel; compressed_a_shape = compressed_input2_shape; compressed_b_shape = compressed_input1_shape; memcpy(&op->context.elementwise_binary.params, &op->params2.binary, sizeof(op->params.binary)); } else if (compressed_input2_shape[0] == 1) { op->context.elementwise_binary.ukernel = - op->binary_elementwise_subconfig->opc_ukernel; + op->binary_elementwise_config->opc_ukernel; } else if (compressed_input1_shape[0] == compressed_input2_shape[0]) { 
op->context.elementwise_binary.ukernel = - op->binary_elementwise_subconfig->op_ukernel; + op->binary_elementwise_config->op_ukernel; } size_t a_stride = compressed_a_shape[0]; size_t b_stride = compressed_b_shape[0]; @@ -450,7 +422,7 @@ enum xnn_status xnn_reshape_binary_elementwise_nd(xnn_operator_t op, } const size_t num_threads = pthreadpool_get_threads_count(threadpool); - const size_t element_tile = op->binary_elementwise_subconfig->element_tile; + const size_t element_tile = op->binary_elementwise_config->element_tile; if (compressed_output_shape[5] == 1) { if (compressed_output_shape[4] == 1) { if (compressed_output_shape[3] == 1) { diff --git a/src/operators/scaled-dot-product-attention-nhtc.c b/src/operators/scaled-dot-product-attention-nhtc.c index 947c80794aa..1de22ee1d87 100644 --- a/src/operators/scaled-dot-product-attention-nhtc.c +++ b/src/operators/scaled-dot-product-attention-nhtc.c @@ -555,9 +555,9 @@ static enum xnn_status reshape_scaled_dot_product_attention_nhtc( .compute_reciprocal = compute_reciprocal, .raddstoreexpminusmax_ukernel = attention_op->attention.raddstoreexpminusmax_config->ukernel, .rmax_ukernel = attention_op->attention.rmax_config->ukernel, - .vadd_ukernel = attention_op->attention.vadd_config->minmax.op_ukernel, - .vmul_ukernel = attention_op->attention.vmul_config->minmax.op_ukernel, - .vmulc_ukernel = attention_op->attention.vmul_config->minmax.opc_ukernel, + .vadd_ukernel = attention_op->attention.vadd_config->op_ukernel, + .vmul_ukernel = attention_op->attention.vmul_config->op_ukernel, + .vmulc_ukernel = attention_op->attention.vmul_config->opc_ukernel, .vtanh_ukernel = attention_op->attention.vtanh_config->ukernel, }; diff --git a/src/operators/softmax-nc.c b/src/operators/softmax-nc.c index 5fdcaaee100..0d4db7f1443 100644 --- a/src/operators/softmax-nc.c +++ b/src/operators/softmax-nc.c @@ -424,10 +424,10 @@ static enum xnn_status reshape_softmax_nc_floating_point( .rmax_ukernel = rmax, .raddstoreexpminusmax_ukernel = 
raddstoreexpminusmax->ukernel, .compute_reciprocal = compute_reciprocal, - .vmulc_ukernel = vmul->minmax.opc_ukernel, + .vmulc_ukernel = vmul->opc_ukernel, }; - if (vmul->linear.opc_ukernel != NULL) { - softmax_op->context.floating_point_softmax.vmulc_ukernel = vmul->linear.opc_ukernel; + if (vmul->opc_ukernel != NULL) { + softmax_op->context.floating_point_softmax.vmulc_ukernel = vmul->opc_ukernel; }; memcpy(&softmax_op->context.floating_point_softmax.rmax_params, rmax_params, rmax_params_size); memcpy(&softmax_op->context.floating_point_softmax.expminus_params, expminus_params, expminus_params_size); @@ -528,10 +528,7 @@ enum xnn_status xnn_reshape_softmax_nc_f16( } const struct xnn_binary_elementwise_config* f16_vmul_config = softmax_op->vmul_config; - union xnn_binary_uparams mul_params; - if (f16_vmul_config->init != NULL) { - f16_vmul_config->init(&mul_params, NULL, NULL, NULL); - } + struct xnn_f16_default_params mul_params; return reshape_softmax_nc_floating_point( softmax_op, xnn_operator_type_softmax_nc_f16, channels, input_stride, output_stride, @@ -563,10 +560,7 @@ enum xnn_status xnn_reshape_softmax_nc_f32( if (softmax_op->raddstoreexpminusmax_config->init.f32 != NULL) { softmax_op->raddstoreexpminusmax_config->init.f32(&expminus_params); } - union xnn_binary_uparams mul_params; - if (f32_vmul_config->init != NULL) { - f32_vmul_config->init(&mul_params, NULL, NULL, NULL); - } + struct xnn_f32_default_params mul_params; return reshape_softmax_nc_floating_point( softmax_op, xnn_operator_type_softmax_nc_f32, channels, input_stride, output_stride, diff --git a/src/xnnpack/config-types.h b/src/xnnpack/config-types.h index 660920dfd51..ffbeac09041 100644 --- a/src/xnnpack/config-types.h +++ b/src/xnnpack/config-types.h @@ -53,21 +53,16 @@ struct xnn_cmul_config { size_t element_tile; }; -struct xnn_binary_elementwise_subconfig { +struct xnn_binary_elementwise_config { xnn_vbinary_ukernel_fn op_ukernel; xnn_vbinary_ukernel_fn opc_ukernel; 
xnn_vbinary_ukernel_fn ropc_ukernel; + xnn_init_binary_params_fn init; // Number of elements in a tile. // For best efficiency, micro-kernel must process a multiple of this number of elements in each call. size_t element_tile; }; -struct xnn_binary_elementwise_config { - struct xnn_binary_elementwise_subconfig minmax; - struct xnn_binary_elementwise_subconfig linear; - xnn_init_binary_params_fn init; -}; - struct xnn_unary_elementwise_config { xnn_vunary_ukernel_fn ukernel; union { diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h index 2cbbec079f1..14fee2f3929 100644 --- a/src/xnnpack/operator.h +++ b/src/xnnpack/operator.h @@ -343,7 +343,7 @@ struct xnn_operator { const struct xnn_x8_lut_config* lut_config; const struct xnn_cmul_config* cmul_config; const struct xnn_transpose_config* transpose_config; - const struct xnn_binary_elementwise_subconfig* binary_elementwise_subconfig; + const struct xnn_binary_elementwise_config* binary_elementwise_config; struct { const struct xnn_unary_elementwise_config* unary_elementwise_config; const struct xnn_reduce_config* rminmax_config; // For dynamic quantization convert operator. 
diff --git a/src/xnnpack/vbinary.h b/src/xnnpack/vbinary.h index dc6bb54550c..2479e2e182c 100644 --- a/src/xnnpack/vbinary.h +++ b/src/xnnpack/vbinary.h @@ -24,26 +24,26 @@ extern "C" { XNN_INTERNAL void ukernel( \ size_t n, const xnn_float16* a, const xnn_float16* b, xnn_float16* y, \ const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f16-vbinary/f16-vadd-minmax.h" -#include "src/f16-vbinary/f16-vaddc-minmax.h" +#include "src/f16-vbinary/f16-vadd.h" +#include "src/f16-vbinary/f16-vaddc.h" #include "src/f16-vbinary/f16-vcmul.h" -#include "src/f16-vbinary/f16-vdiv-minmax.h" -#include "src/f16-vbinary/f16-vdivc-minmax.h" +#include "src/f16-vbinary/f16-vdiv.h" +#include "src/f16-vbinary/f16-vdivc.h" #include "src/f16-vbinary/f16-vmax.h" #include "src/f16-vbinary/f16-vmaxc.h" #include "src/f16-vbinary/f16-vmin.h" #include "src/f16-vbinary/f16-vminc.h" -#include "src/f16-vbinary/f16-vmul-minmax.h" -#include "src/f16-vbinary/f16-vmulc-minmax.h" +#include "src/f16-vbinary/f16-vmul.h" +#include "src/f16-vbinary/f16-vmulc.h" #include "src/f16-vbinary/f16-vprelu.h" #include "src/f16-vbinary/f16-vpreluc.h" #include "src/f16-vbinary/f16-vrpreluc.h" -#include "src/f16-vbinary/f16-vrdivc-minmax.h" -#include "src/f16-vbinary/f16-vrsubc-minmax.h" +#include "src/f16-vbinary/f16-vrdivc.h" +#include "src/f16-vbinary/f16-vrsubc.h" #include "src/f16-vbinary/f16-vsqrdiff.h" #include "src/f16-vbinary/f16-vsqrdiffc.h" -#include "src/f16-vbinary/f16-vsub-minmax.h" -#include "src/f16-vbinary/f16-vsubc-minmax.h" +#include "src/f16-vbinary/f16-vsub.h" +#include "src/f16-vbinary/f16-vsubc.h" #undef XNN_UKERNEL_WITH_PARAMS #define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ @@ -51,38 +51,28 @@ extern "C" { XNN_INTERNAL void ukernel( \ size_t n, const float* a, const float* b, float* y, \ const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f32-vbinary/f32-vadd-minmax.h" #include "src/f32-vbinary/f32-vadd.h" -#include 
"src/f32-vbinary/f32-vaddc-minmax.h" #include "src/f32-vbinary/f32-vaddc.h" #include "src/f32-vbinary/f32-vcopysign.h" #include "src/f32-vbinary/f32-vcopysignc.h" #include "src/f32-vbinary/f32-vcmul.h" -#include "src/f32-vbinary/f32-vdiv-minmax.h" #include "src/f32-vbinary/f32-vdiv.h" -#include "src/f32-vbinary/f32-vdivc-minmax.h" #include "src/f32-vbinary/f32-vdivc.h" #include "src/f32-vbinary/f32-vmax.h" #include "src/f32-vbinary/f32-vmaxc.h" #include "src/f32-vbinary/f32-vmin.h" #include "src/f32-vbinary/f32-vminc.h" -#include "src/f32-vbinary/f32-vmul-minmax.h" #include "src/f32-vbinary/f32-vmul.h" -#include "src/f32-vbinary/f32-vmulc-minmax.h" #include "src/f32-vbinary/f32-vmulc.h" #include "src/f32-vbinary/f32-vprelu.h" #include "src/f32-vbinary/f32-vpreluc.h" #include "src/f32-vbinary/f32-vrpreluc.h" #include "src/f32-vbinary/f32-vrcopysignc.h" -#include "src/f32-vbinary/f32-vrdivc-minmax.h" #include "src/f32-vbinary/f32-vrdivc.h" -#include "src/f32-vbinary/f32-vrsubc-minmax.h" #include "src/f32-vbinary/f32-vrsubc.h" #include "src/f32-vbinary/f32-vsqrdiff.h" #include "src/f32-vbinary/f32-vsqrdiffc.h" -#include "src/f32-vbinary/f32-vsub-minmax.h" #include "src/f32-vbinary/f32-vsub.h" -#include "src/f32-vbinary/f32-vsubc-minmax.h" #include "src/f32-vbinary/f32-vsubc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/BUILD.bazel b/test/BUILD.bazel index 3c94369e7d4..67b74bd5882 100644 --- a/test/BUILD.bazel +++ b/test/BUILD.bazel @@ -307,57 +307,46 @@ sh_test( ], deps = MICROKERNEL_TEST_DEPS + [":vbinary_microkernel_tester"], ) for kernel in [ - "f16_vadd_minmax", - "f16_vaddc_minmax", - "f16_vdiv_minmax", - "f16_vdivc_minmax", + "f16_vadd", + "f16_vaddc", + "f16_vdiv", + "f16_vdivc", "f16_vmax", "f16_vmaxc", "f16_vmin", "f16_vminc", - "f16_vmul_minmax", - "f16_vmulc_minmax", + "f16_vmul", + "f16_vmulc", "f16_vprelu", "f16_vpreluc", "f16_vrpreluc", - "f16_vrdivc_minmax", - "f16_vrsubc_minmax", + "f16_vrdivc", + "f16_vrsubc", "f16_vsqrdiff", "f16_vsqrdiffc", - 
"f16_vsub_minmax", - "f16_vsubc_minmax", + "f16_vsub", "f32_vadd", - "f32_vadd_minmax", "f32_vaddc", - "f32_vaddc_minmax", "f32_vcopysign", "f32_vcopysignc", "f32_vdiv", - "f32_vdiv_minmax", "f32_vdivc", - "f32_vdivc_minmax", "f32_vmax", "f32_vmaxc", "f32_vmin", "f32_vminc", "f32_vmul", - "f32_vmul_minmax", "f32_vmulc", - "f32_vmulc_minmax", "f32_vprelu", "f32_vpreluc", "f32_vrpreluc", "f32_vrcopysignc", "f32_vrdivc", - "f32_vrdivc_minmax", "f32_vrsubc", - "f32_vrsubc_minmax", "f32_vsqrdiff", "f32_vsqrdiffc", "f32_vsub", - "f32_vsub_minmax", "f32_vsubc", - "f32_vsubc_minmax", "qs8_vadd_minmax", "qs8_vaddc_minmax", "qs8_vmul_minmax_fp32", diff --git a/test/f16-vadd-minmax.cc b/test/f16-vadd.cc similarity index 71% rename from test/f16-vadd-minmax.cc rename to test/f16-vadd.cc index a3be413fa33..6bf82168024 100644 --- a/test/f16-vadd-minmax.cc +++ b/test/f16-vadd.cc @@ -4,7 +4,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! -// Microkernel: f16-vadd-minmax +// Microkernel: f16-vadd // Generator: tools/generate-vbinary-test.py @@ -20,10 +20,6 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern \ XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ -XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params);\ - \ - \ -XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ -XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); -#include "src/f16-vbinary/f16-vadd-minmax.h" +XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, 
arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); +#include "src/f16-vbinary/f16-vadd.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vaddc-minmax.cc b/test/f16-vaddc-minmax.cc deleted file mode 100644 index 5c5ef2c58c0..00000000000 --- a/test/f16-vaddc-minmax.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Microkernel: f16-vaddc-minmax -// Generator: tools/generate-vbinary-test.py - - -#include "xnnpack/microparams-init.h" -#include "xnnpack/vbinary.h" -#include "vbinary-microkernel-tester.h" - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ -XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ -XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ -XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ -XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ - \ -XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ -XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ -XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params);\ - \ - \ -XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, 
init_params); \ -XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); -#include "src/f16-vbinary/f16-vaddc-minmax.h" -#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vaddc-minmax.cc b/test/f16-vaddc.cc similarity index 71% rename from test/f32-vaddc-minmax.cc rename to test/f16-vaddc.cc index 0121c3bac07..f1717a8d5c7 100644 --- a/test/f32-vaddc-minmax.cc +++ b/test/f16-vaddc.cc @@ -4,7 +4,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! -// Microkernel: f32-vaddc-minmax +// Microkernel: f16-vaddc // Generator: tools/generate-vbinary-test.py @@ -20,10 +20,6 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne \ XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ -XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params);\ - \ - \ -XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ -XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); -#include "src/f32-vbinary/f32-vaddc-minmax.h" +XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); +#include "src/f16-vbinary/f16-vaddc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vdiv-minmax.cc b/test/f16-vdiv-minmax.cc deleted file mode 100644 index 6d30d600cd6..00000000000 --- a/test/f16-vdiv-minmax.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed 
under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Microkernel: f16-vdiv-minmax -// Generator: tools/generate-vbinary-test.py - - -#include "xnnpack/microparams-init.h" -#include "xnnpack/vbinary.h" -#include "vbinary-microkernel-tester.h" - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ -XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ -XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ -XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ -XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ - \ -XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ -XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ -XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params);\ - \ - \ -XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ -XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); -#include "src/f16-vbinary/f16-vdiv-minmax.h" -#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vdiv-minmax.cc b/test/f16-vdiv.cc similarity index 71% rename from test/f32-vdiv-minmax.cc rename to test/f16-vdiv.cc index e174a60134f..6f6b91f6665 100644 --- 
a/test/f32-vdiv-minmax.cc +++ b/test/f16-vdiv.cc @@ -4,7 +4,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! -// Microkernel: f32-vdiv-minmax +// Microkernel: f16-vdiv // Generator: tools/generate-vbinary-test.py @@ -20,10 +20,6 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern \ XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ -XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params);\ - \ - \ -XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ -XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); -#include "src/f32-vbinary/f32-vdiv-minmax.h" +XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); +#include "src/f16-vbinary/f16-vdiv.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vdivc-minmax.cc b/test/f16-vdivc-minmax.cc deleted file mode 100644 index e064b87759c..00000000000 --- a/test/f16-vdivc-minmax.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Microkernel: f16-vdivc-minmax -// Generator: tools/generate-vbinary-test.py - - -#include "xnnpack/microparams-init.h" -#include "xnnpack/vbinary.h" -#include "vbinary-microkernel-tester.h" - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ -XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ -XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ -XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ -XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ - \ -XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ -XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ -XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params);\ - \ - \ -XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ -XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); -#include "src/f16-vbinary/f16-vdivc-minmax.h" -#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vdivc-minmax.cc b/test/f16-vdivc.cc similarity index 71% rename from test/f32-vdivc-minmax.cc rename to test/f16-vdivc.cc index d9825c9c18f..b30b4789328 100644 --- a/test/f32-vdivc-minmax.cc +++ b/test/f16-vdivc.cc @@ -4,7 +4,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. 
Do not edit! -// Microkernel: f32-vdivc-minmax +// Microkernel: f16-vdivc // Generator: tools/generate-vbinary-test.py @@ -20,10 +20,6 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne \ XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ -XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params);\ - \ - \ -XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ -XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); -#include "src/f32-vbinary/f32-vdivc-minmax.h" +XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); +#include "src/f16-vbinary/f16-vdivc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vmul-minmax.cc b/test/f16-vmul-minmax.cc deleted file mode 100644 index bc5a52b5685..00000000000 --- a/test/f16-vmul-minmax.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Microkernel: f16-vmul-minmax -// Generator: tools/generate-vbinary-test.py - - -#include "xnnpack/microparams-init.h" -#include "xnnpack/vbinary.h" -#include "vbinary-microkernel-tester.h" - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ -XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ -XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ -XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ -XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ - \ -XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ -XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ -XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params);\ - \ - \ -XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ -XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); -#include "src/f16-vbinary/f16-vmul-minmax.h" -#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vmul-minmax.cc b/test/f16-vmul.cc similarity index 71% rename from test/f32-vmul-minmax.cc rename to test/f16-vmul.cc index 3ec25f1a725..cc169da9dbb 100644 --- a/test/f32-vmul-minmax.cc +++ b/test/f16-vmul.cc @@ -4,7 +4,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. 
Do not edit! -// Microkernel: f32-vmul-minmax +// Microkernel: f16-vmul // Generator: tools/generate-vbinary-test.py @@ -20,10 +20,6 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern \ XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ -XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params);\ - \ - \ -XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ -XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); -#include "src/f32-vbinary/f32-vmul-minmax.h" +XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); +#include "src/f16-vbinary/f16-vmul.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vmulc-minmax.cc b/test/f16-vmulc-minmax.cc deleted file mode 100644 index a06b8134d55..00000000000 --- a/test/f16-vmulc-minmax.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Microkernel: f16-vmulc-minmax -// Generator: tools/generate-vbinary-test.py - - -#include "xnnpack/microparams-init.h" -#include "xnnpack/vbinary.h" -#include "vbinary-microkernel-tester.h" - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ -XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ -XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ -XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ -XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ - \ -XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ -XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ -XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params);\ - \ - \ -XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ -XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); -#include "src/f16-vbinary/f16-vmulc-minmax.h" -#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vmulc-minmax.cc b/test/f16-vmulc.cc similarity index 71% rename from test/f32-vmulc-minmax.cc rename to test/f16-vmulc.cc index 3ec3757485e..9ff93b385b0 100644 --- a/test/f32-vmulc-minmax.cc +++ b/test/f16-vmulc.cc @@ -4,7 +4,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. 
Do not edit! -// Microkernel: f32-vmulc-minmax +// Microkernel: f16-vmulc // Generator: tools/generate-vbinary-test.py @@ -20,10 +20,6 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne \ XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ -XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params);\ - \ - \ -XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ -XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); -#include "src/f32-vbinary/f32-vmulc-minmax.h" +XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); +#include "src/f16-vbinary/f16-vmulc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vrdivc-minmax.cc b/test/f16-vrdivc-minmax.cc deleted file mode 100644 index 48b837ab72a..00000000000 --- a/test/f16-vrdivc-minmax.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Microkernel: f16-vrdivc-minmax -// Generator: tools/generate-vbinary-test.py - - -#include "xnnpack/microparams-init.h" -#include "xnnpack/vbinary.h" -#include "vbinary-microkernel-tester.h" - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ -XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); \ -XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); \ -XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); \ -XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); \ - \ -XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); \ -XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); \ -XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params);\ - \ - \ -XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); \ -XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); -#include "src/f16-vbinary/f16-vrdivc-minmax.h" -#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrdivc-minmax.cc b/test/f16-vrdivc.cc similarity index 71% rename from test/f32-vrdivc-minmax.cc rename to test/f16-vrdivc.cc index 20316d5a0bf..fc73faa51bf 100644 --- a/test/f32-vrdivc-minmax.cc +++ b/test/f16-vrdivc.cc @@ -4,7 +4,7 @@ // LICENSE file in the root directory of this source tree. 
// // Auto-generated file. Do not edit! -// Microkernel: f32-vrdivc-minmax +// Microkernel: f16-vrdivc // Generator: tools/generate-vbinary-test.py @@ -20,10 +20,6 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne \ XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); \ -XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params);\ - \ - \ -XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); \ -XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); -#include "src/f32-vbinary/f32-vrdivc-minmax.h" +XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); +#include "src/f16-vbinary/f16-vrdivc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vrsubc-minmax.cc b/test/f16-vrsubc.cc similarity index 71% rename from test/f16-vrsubc-minmax.cc rename to test/f16-vrsubc.cc index 9fd527c2586..74cb4632e4e 100644 --- a/test/f16-vrsubc-minmax.cc +++ b/test/f16-vrsubc.cc @@ -4,7 +4,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! 
-// Microkernel: f16-vrsubc-minmax +// Microkernel: f16-vrsubc // Generator: tools/generate-vbinary-test.py @@ -20,10 +20,6 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne \ XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); \ -XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params);\ - \ - \ -XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); \ -XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); -#include "src/f16-vbinary/f16-vrsubc-minmax.h" +XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); +#include "src/f16-vbinary/f16-vrsubc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vsub-minmax.cc b/test/f16-vsub-minmax.cc deleted file mode 100644 index ae530fb57f8..00000000000 --- a/test/f16-vsub-minmax.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Microkernel: f16-vsub-minmax -// Generator: tools/generate-vbinary-test.py - - -#include "xnnpack/microparams-init.h" -#include "xnnpack/vbinary.h" -#include "vbinary-microkernel-tester.h" - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ -XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ -XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ -XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ -XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ - \ -XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ -XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ -XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params);\ - \ - \ -XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ -XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); -#include "src/f16-vbinary/f16-vsub-minmax.h" -#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vsub-minmax.cc b/test/f16-vsub.cc similarity index 71% rename from test/f32-vsub-minmax.cc rename to test/f16-vsub.cc index 9d0eedc6130..aba129113a1 100644 --- a/test/f32-vsub-minmax.cc +++ b/test/f16-vsub.cc @@ -4,7 +4,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. 
Do not edit! -// Microkernel: f32-vsub-minmax +// Microkernel: f16-vsub // Generator: tools/generate-vbinary-test.py @@ -20,10 +20,6 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern \ XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ -XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params);\ - \ - \ -XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ -XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); -#include "src/f32-vbinary/f32-vsub-minmax.h" +XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); +#include "src/f16-vbinary/f16-vsub.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vsubc-minmax.cc b/test/f16-vsubc-minmax.cc deleted file mode 100644 index ce7332f0509..00000000000 --- a/test/f16-vsubc-minmax.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Microkernel: f16-vsubc-minmax -// Generator: tools/generate-vbinary-test.py - - -#include "xnnpack/microparams-init.h" -#include "xnnpack/vbinary.h" -#include "vbinary-microkernel-tester.h" - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ -XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ -XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ -XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ -XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ - \ -XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ -XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ -XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params);\ - \ - \ -XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ -XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); -#include "src/f16-vbinary/f16-vsubc-minmax.h" -#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vsubc-minmax.cc b/test/f16-vsubc.cc similarity index 71% rename from test/f32-vsubc-minmax.cc rename to test/f16-vsubc.cc index 5671ee3f737..ea52fcd5567 100644 --- a/test/f32-vsubc-minmax.cc +++ b/test/f16-vsubc.cc @@ -4,7 +4,7 @@ // LICENSE file in the root directory of this source tree. // // Auto-generated file. 
Do not edit! -// Microkernel: f32-vsubc-minmax +// Microkernel: f16-vsubc // Generator: tools/generate-vbinary-test.py @@ -20,10 +20,6 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne \ XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ -XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params);\ - \ - \ -XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ -XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); -#include "src/f32-vbinary/f32-vsubc-minmax.h" +XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); +#include "src/f16-vbinary/f16-vsubc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vadd-minmax.cc b/test/f32-vadd-minmax.cc deleted file mode 100644 index eaf0333c796..00000000000 --- a/test/f32-vadd-minmax.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Microkernel: f32-vadd-minmax -// Generator: tools/generate-vbinary-test.py - - -#include "xnnpack/microparams-init.h" -#include "xnnpack/vbinary.h" -#include "vbinary-microkernel-tester.h" - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ -XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ -XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ -XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ -XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ - \ -XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ -XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ -XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params);\ - \ - \ -XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ -XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); -#include "src/f32-vbinary/f32-vadd-minmax.h" -#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrsubc-minmax.cc b/test/f32-vrsubc-minmax.cc deleted file mode 100644 index 89f4221c879..00000000000 --- a/test/f32-vrsubc-minmax.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of 
this source tree. -// -// Auto-generated file. Do not edit! -// Microkernel: f32-vrsubc-minmax -// Generator: tools/generate-vbinary-test.py - - -#include "xnnpack/microparams-init.h" -#include "xnnpack/vbinary.h" -#include "vbinary-microkernel-tester.h" - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ -XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); \ -XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); \ -XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); \ -XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); \ - \ -XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); \ -XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); \ -XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params);\ - \ - \ -XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); \ -XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); -#include "src/f32-vbinary/f32-vrsubc-minmax.h" -#undef XNN_UKERNEL_WITH_PARAMS From 8dc0346ec73d470a883c49f07f62a611111e01ce Mon Sep 17 00:00:00 2001 From: Alan Kelly Date: Tue, 24 Sep 2024 05:59:03 -0700 Subject: [PATCH 38/50] Removed unused variable PiperOrigin-RevId: 678211993 --- src/configs/binary-elementwise-config.c | 2 -- 1 file changed, 2 
deletions(-) diff --git a/src/configs/binary-elementwise-config.c b/src/configs/binary-elementwise-config.c index 87b5bf3efb8..40ea82c711b 100644 --- a/src/configs/binary-elementwise-config.c +++ b/src/configs/binary-elementwise-config.c @@ -374,8 +374,6 @@ static void init_f32_vadd_config(void) { f32_vadd_config.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); f32_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_ukernel__wasmsimd_u16; f32_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__wasmsimd_u16; f32_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__wasmsimd_u16; From a6b613900a32fb5ddb5fb5f2288b5b23b9ed0b7e Mon Sep 17 00:00:00 2001 From: Alan Kelly Date: Tue, 24 Sep 2024 06:52:44 -0700 Subject: [PATCH 39/50] Removed unused variable PiperOrigin-RevId: 678229130 --- src/configs/binary-elementwise-config.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/configs/binary-elementwise-config.c b/src/configs/binary-elementwise-config.c index 40ea82c711b..a398118ff38 100644 --- a/src/configs/binary-elementwise-config.c +++ b/src/configs/binary-elementwise-config.c @@ -546,8 +546,6 @@ static void init_f32_vdiv_config(void) { f32_vdiv_config.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); f32_vdiv_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_ukernel__wasmsimd_u16; f32_vdiv_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_ukernel__wasmsimd_u16; f32_vdiv_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_ukernel__wasmsimd_u16; @@ -755,8 +753,6 @@ static void init_f32_vmul_config(void) { f32_vmul_config.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - const struct 
xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); f32_vmul_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_ukernel__wasmsimd_u16; f32_vmul_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__wasmsimd_u16; f32_vmul_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__wasmsimd_u16; @@ -820,8 +816,6 @@ static void init_f32_vsub_config(void) { f32_vsub_config.element_tile = 8; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); f32_vsub_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_ukernel__wasmsimd_u16; f32_vsub_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_ukernel__wasmsimd_u16; f32_vsub_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_ukernel__wasmsimd_u16; From 9007aa93227010168e615f9c6552035040c94a15 Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Tue, 24 Sep 2024 07:07:59 -0700 Subject: [PATCH 40/50] Fix the logic for selecting `nc` in `fully-connected-nc.c` and `dynamic-fully-connected-nc.c` such that we get at least 5 tiles per thread. 
PiperOrigin-RevId: 678234048 --- src/operators/dynamic-fully-connected-nc.c | 4 +++- src/operators/fully-connected-nc.c | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/operators/dynamic-fully-connected-nc.c b/src/operators/dynamic-fully-connected-nc.c index 696312a7f30..ca5d69fdc6f 100644 --- a/src/operators/dynamic-fully-connected-nc.c +++ b/src/operators/dynamic-fully-connected-nc.c @@ -397,7 +397,9 @@ static enum xnn_status reshape_dynamic_fully_connected_nc( const size_t target_tiles_per_thread = 5; const size_t max_nc = divide_round_up(output_channels * num_other_tiles, num_threads * target_tiles_per_thread); if (max_nc < nc) { - nc = min(nc, divide_round_up(nc, max_nc * nr) * nr); + nc = min(nc, divide_round_up(output_channels, + divide_round_up(nc, max_nc) * nr) * + nr); } } diff --git a/src/operators/fully-connected-nc.c b/src/operators/fully-connected-nc.c index ae65fa290fc..c2a05764ad2 100644 --- a/src/operators/fully-connected-nc.c +++ b/src/operators/fully-connected-nc.c @@ -11,9 +11,9 @@ #include #include #include +#include #include -#include #include "xnnpack.h" #include "xnnpack/allocator.h" #include "xnnpack/cache.h" @@ -1805,7 +1805,9 @@ static enum xnn_status reshape_fully_connected_nc( const size_t target_tiles_per_thread = 5; const size_t max_nc = divide_round_up(output_channels * num_other_tiles, num_threads * target_tiles_per_thread); if (max_nc < nc) { - nc = min(nc, divide_round_up(nc, max_nc * nr) * nr); + nc = min(nc, divide_round_up(output_channels, + divide_round_up(nc, max_nc) * nr) * + nr); } } From 6fb01168762ab4b852657c90be90901c5c98156a Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Tue, 24 Sep 2024 10:04:36 -0700 Subject: [PATCH 41/50] Replace float literals that are OK in `C99`, but not in `C++11`. 
PiperOrigin-RevId: 678295380 --- CMakeLists.txt | 1 + test/fully-connected-operator-tester.h | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dfda24ca714..2a14b87210f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1230,6 +1230,7 @@ IF(XNNPACK_BUILD_TESTS) # ---[ Launch heavy tests first. SET(SHARDED_TESTS + fully-connected-nc avgpool-minmax maxpool-minmax f32-vclamp diff --git a/test/fully-connected-operator-tester.h b/test/fully-connected-operator-tester.h index dc0ee4706d3..56c843b376d 100644 --- a/test/fully-connected-operator-tester.h +++ b/test/fully-connected-operator-tester.h @@ -1224,7 +1224,7 @@ class FullyConnectedOperatorTester { std::uniform_real_distribution f32dist(-1.0f, 1.0f); std::uniform_real_distribution f32idist(0.1f, 1.0f); std::uniform_int_distribution w8dist(-std::numeric_limits::max(), std::numeric_limits::max()); - // Weights typically have a Gaussian distrubution centred on zero. A + // Weights typically have a Gaussian distribution centred on zero. A // standard deviation of 40 means that > 99.99% of values fall in the // -127->127 range. However, the reduce the chance of overflow, we constrain // the weights further. 
@@ -1845,7 +1845,7 @@ class FullyConnectedOperatorTester { accumulated_max = std::max(accumulated_max, accumulators[i * output_channels() + oc]); } - float requantization_scale = 0x1.0p-32f; + float requantization_scale = 2.3283064e-10; // 0x1.0p-32f if (accumulated_max != 0) { requantization_scale = std::max(requantization_scale, float(int32_t(std::numeric_limits::max()) - int32_t(output_zero_point() - 0x80)) / float(accumulated_max)); @@ -1854,7 +1854,8 @@ class FullyConnectedOperatorTester { requantization_scale = std::max(requantization_scale, float(int32_t(std::numeric_limits::min()) - int32_t(output_zero_point() - 0x80)) / float(accumulated_min)); } - requantization_scale = std::min(requantization_scale, 0x1.FFFFFEp-1f); + requantization_scale = + std::min(requantization_scale, 0.99999988079f /*0x1.FFFFFEp-1f*/); requantization_scales[oc] = requantization_scale; } From fe2da40928089a49a17b720adab30b9086540390 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Tue, 24 Sep 2024 10:04:36 -0700 Subject: [PATCH 42/50] Regenerate stale enum table PiperOrigin-RevId: 678295384 --- src/enums/operator-type.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/enums/operator-type.c b/src/enums/operator-type.c index 3e1fd4254ed..664fdb33944 100644 --- a/src/enums/operator-type.c +++ b/src/enums/operator-type.c @@ -12,16 +12,15 @@ #include "xnnpack/operator-type.h" -static const uint16_t offset[170] = { - 0, 8, 22, 36, 50, 64, 78, 92, 119, 147, 175, 203, 230, 257, 289, 321, 364, 382, 400, 425, 451, 467, 483, 498, 513, - 535, 558, 581, 604, 627, 650, 673, 696, 719, 742, 760, 783, 806, 830, 848, 871, 895, 919, 943, 967, 1002, 1037, 1061, - 1085, 1109, 1123, 1138, 1153, 1173, 1199, 1225, 1262, 1288, 1318, 1344, 1376, 1408, 1434, 1461, 1488, 1505, 1522, - 1556, 1590, 1604, 1618, 1632, 1646, 1662, 1678, 1704, 1730, 1762, 1794, 1831, 1868, 1905, 1942, 1979, 2016, 2053, - 2079, 2111, 2137, 2152, 2186, 2220, 2254, 2288, 2322, 2356, 2386, 
2416, 2436, 2456, 2477, 2498, 2519, 2540, 2554, - 2578, 2602, 2625, 2648, 2666, 2684, 2699, 2714, 2729, 2747, 2765, 2784, 2803, 2822, 2841, 2860, 2877, 2894, 2910, - 2926, 2959, 2992, 3020, 3048, 3076, 3104, 3131, 3158, 3175, 3192, 3233, 3274, 3292, 3310, 3328, 3346, 3361, 3377, - 3393, 3411, 3429, 3447, 3473, 3500, 3527, 3544, 3561, 3583, 3605, 3634, 3663, 3682, 3701, 3720, 3739, 3754, 3769, - 3784, 3799, 3818, 3838, 3858, 3878, 3899, 3920 +static const uint16_t offset[157] = { + 0, 8, 22, 36, 45, 72, 100, 128, 156, 183, 210, 242, 274, 317, 335, 353, 378, 404, 420, 436, 451, 466, 488, 511, 534, + 557, 580, 603, 626, 649, 672, 695, 713, 736, 759, 783, 801, 824, 848, 872, 896, 920, 955, 990, 1014, 1038, 1062, 1076, + 1091, 1106, 1121, 1147, 1173, 1210, 1236, 1266, 1292, 1324, 1356, 1382, 1409, 1436, 1448, 1482, 1516, 1530, 1544, + 1558, 1572, 1588, 1604, 1630, 1656, 1688, 1720, 1757, 1794, 1831, 1868, 1905, 1942, 1979, 2005, 2037, 2063, 2078, + 2112, 2146, 2180, 2214, 2248, 2282, 2312, 2342, 2362, 2382, 2403, 2424, 2445, 2466, 2480, 2504, 2528, 2551, 2574, + 2587, 2602, 2617, 2632, 2647, 2660, 2674, 2691, 2708, 2724, 2740, 2773, 2806, 2834, 2862, 2890, 2918, 2945, 2972, + 2989, 3006, 3047, 3088, 3106, 3124, 3142, 3160, 3175, 3191, 3207, 3225, 3243, 3261, 3287, 3314, 3341, 3358, 3375, + 3397, 3419, 3443, 3457, 3472, 3487, 3502, 3517, 3536, 3556, 3576, 3596, 3617, 3638 }; static const char data[] = @@ -74,7 +73,7 @@ static const char data[] = "Copy (NC, X8)\0" "Copy (NC, X16)\0" "Copy (NC, X32)\0" - "Copy Sign (NC)\0" + "Copy Sign (ND)\0" "Deconvolution (NHWC, F16)\0" "Deconvolution (NHWC, F32)\0" "Deconvolution (NHWC, QD8, F32, QC8W)\0" From dfe0a36b200fa68af48103d8e223eaa4eccc627f Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Tue, 24 Sep 2024 10:09:31 -0700 Subject: [PATCH 43/50] Re-enable `FullyConnectedTestQP8F32QC4W.matches_operator_api_transposed_weights` which should now be supported. 
PiperOrigin-RevId: 678297464 --- test/fully-connected.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/fully-connected.cc b/test/fully-connected.cc index f4ac741cc0b..c6d52f3807b 100644 --- a/test/fully-connected.cc +++ b/test/fully-connected.cc @@ -706,7 +706,7 @@ TEST_F(FullyConnectedTestQP8F32QC4W, matches_operator_api_with_reshape) { } // TODO(b/355416339): Re-enable once we can handle strides again -TEST_F(FullyConnectedTestQP8F32QC4W, DISABLED_matches_operator_api_transposed_weights) { +TEST_F(FullyConnectedTestQP8F32QC4W, matches_operator_api_transposed_weights) { ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); if (xnn_init_qp8_f32_qc4w_gemm_config() == nullptr) { From c46134e76ae8c589902f78050b0a38bfe106daa0 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 24 Sep 2024 12:00:46 -0700 Subject: [PATCH 44/50] f32-raddstoreexpminusmax microkernels reduced to 4 variations per ISA - 1 vector, 2 vectors with 2 accumulators, 4 vectors with 2 accumulators, and 4 vectors with 4 accumulators - Fix comment for number of elements PiperOrigin-RevId: 678342062 --- bench/f32-raddstoreexpminusmax.cc | 856 +-- bench/f32-softmax.cc | 4 +- cmake/gen/avx2_microkernels.cmake | 34 +- cmake/gen/avx512f_microkernels.cmake | 15 +- cmake/gen/avx_microkernels.cmake | 12 - cmake/gen/hvx_microkernels.cmake | 6 - cmake/gen/neon_microkernels.cmake | 16 - cmake/gen/neonfma_microkernels.cmake | 18 +- cmake/gen/neonfp16arith_microkernels.cmake | 2 +- cmake/gen/scalar_microkernels.cmake | 4 - cmake/gen/sse2_microkernels.cmake | 10 +- cmake/gen/wasmrelaxedsimd_microkernels.cmake | 8 - cmake/gen/wasmsimd_microkernels.cmake | 8 - gen/avx2_microkernels.bzl | 34 +- gen/avx512f_microkernels.bzl | 15 +- gen/avx_microkernels.bzl | 12 - gen/hvx_microkernels.bzl | 6 - gen/neon_microkernels.bzl | 16 - gen/neonfma_microkernels.bzl | 18 +- gen/neonfp16arith_microkernels.bzl | 2 +- gen/scalar_microkernels.bzl | 4 - gen/sse2_microkernels.bzl | 10 +- 
gen/wasmrelaxedsimd_microkernels.bzl | 8 - gen/wasmsimd_microkernels.bzl | 8 - scripts/generate-f32-raddstoreexpminusmax.sh | 121 +- src/configs/raddstoreexpminusmax-config.c | 16 +- src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in | 1 + src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in | 5 +- ...raddstoreexpminusmax-avx-rr2-p5-u12-acc2.c | 257 - ...raddstoreexpminusmax-avx-rr2-p5-u12-acc3.c | 259 - .../f32-raddstoreexpminusmax-avx-rr2-p5-u12.c | 254 - ...raddstoreexpminusmax-avx-rr2-p5-u16-acc2.c | 273 - ...raddstoreexpminusmax-avx-rr2-p5-u16-acc4.c | 277 - .../f32-raddstoreexpminusmax-avx-rr2-p5-u16.c | 270 - ...raddstoreexpminusmax-avx-rr2-p5-u20-acc2.c | 289 - ...raddstoreexpminusmax-avx-rr2-p5-u20-acc5.c | 295 - .../f32-raddstoreexpminusmax-avx-rr2-p5-u20.c | 286 - .../f32-raddstoreexpminusmax-avx-rr2-p5-u4.c | 222 - ...-raddstoreexpminusmax-avx-rr2-p5-u8-acc2.c | 241 - .../f32-raddstoreexpminusmax-avx-rr2-p5-u8.c | 238 - ...ddstoreexpminusmax-avx2-rr1-p5-u16-acc2.c} | 43 +- ...addstoreexpminusmax-avx2-rr1-p5-u32-acc2.c | 1 + ...addstoreexpminusmax-avx2-rr1-p5-u32-acc4.c | 1 + ...addstoreexpminusmax-avx2-rr1-p5-u64-acc2.c | 276 - ...addstoreexpminusmax-avx2-rr1-p5-u64-acc4.c | 280 - ...f32-raddstoreexpminusmax-avx2-rr1-p5-u64.c | 274 - ...addstoreexpminusmax-avx2-rr1-p5-u72-acc3.c | 293 - ...f32-raddstoreexpminusmax-avx2-rr1-p5-u72.c | 289 - .../f32-raddstoreexpminusmax-avx2-rr1-p5-u8.c | 170 + ...addstoreexpminusmax-avx2-rr1-p5-u80-acc2.c | 306 - ...addstoreexpminusmax-avx2-rr1-p5-u80-acc5.c | 312 - ...f32-raddstoreexpminusmax-avx2-rr1-p5-u80.c | 304 - ...addstoreexpminusmax-avx2-rr1-p5-u96-acc2.c | 336 -- ...addstoreexpminusmax-avx2-rr1-p5-u96-acc3.c | 338 -- ...addstoreexpminusmax-avx2-rr1-p5-u96-acc6.c | 344 -- ...f32-raddstoreexpminusmax-avx2-rr1-p5-u96.c | 334 -- ...ddstoreexpminusmax-avx2-rr2-p5-u16-acc2.c} | 50 +- ...addstoreexpminusmax-avx2-rr2-p5-u32-acc2.c | 5 +- ...addstoreexpminusmax-avx2-rr2-p5-u32-acc4.c | 5 +- 
...addstoreexpminusmax-avx2-rr2-p5-u64-acc2.c | 338 -- ...addstoreexpminusmax-avx2-rr2-p5-u64-acc4.c | 342 -- ...addstoreexpminusmax-avx2-rr2-p5-u72-acc3.c | 356 -- ...f32-raddstoreexpminusmax-avx2-rr2-p5-u72.c | 351 -- ...f32-raddstoreexpminusmax-avx2-rr2-p5-u8.c} | 127 +- ...addstoreexpminusmax-avx2-rr2-p5-u80-acc2.c | 370 -- ...addstoreexpminusmax-avx2-rr2-p5-u80-acc5.c | 376 -- ...f32-raddstoreexpminusmax-avx2-rr2-p5-u80.c | 367 -- ...addstoreexpminusmax-avx2-rr2-p5-u96-acc2.c | 402 -- ...addstoreexpminusmax-avx2-rr2-p5-u96-acc3.c | 404 -- ...addstoreexpminusmax-avx2-rr2-p5-u96-acc6.c | 410 -- ...f32-raddstoreexpminusmax-avx2-rr2-p5-u96.c | 399 -- ...minusmax-avx512f-rr1-p5-scalef-u128-acc2.c | 218 - ...minusmax-avx512f-rr1-p5-scalef-u128-acc4.c | 222 - ...reexpminusmax-avx512f-rr1-p5-scalef-u128.c | 216 - ...minusmax-avx512f-rr1-p5-scalef-u144-acc3.c | 232 - ...reexpminusmax-avx512f-rr1-p5-scalef-u144.c | 228 - ...oreexpminusmax-avx512f-rr1-p5-scalef-u16.c | 132 + ...minusmax-avx512f-rr1-p5-scalef-u160-acc2.c | 242 - ...minusmax-avx512f-rr1-p5-scalef-u160-acc5.c | 248 - ...reexpminusmax-avx512f-rr1-p5-scalef-u160.c | 240 - ...minusmax-avx512f-rr1-p5-scalef-u192-acc2.c | 266 - ...minusmax-avx512f-rr1-p5-scalef-u192-acc3.c | 268 - ...minusmax-avx512f-rr1-p5-scalef-u192-acc6.c | 274 - ...reexpminusmax-avx512f-rr1-p5-scalef-u192.c | 264 - ...minusmax-avx512f-rr1-p5-scalef-u32-acc2.c} | 36 +- ...addstoreexpminusmax-hvx-rr2-p5-u128-acc3.c | 226 - ...f32-raddstoreexpminusmax-hvx-rr2-p5-u128.c | 222 - .../f32-raddstoreexpminusmax-hvx-rr2-p5-u64.c | 190 - ...raddstoreexpminusmax-hvx-rr2-p5-u96-acc2.c | 208 - ...raddstoreexpminusmax-hvx-rr2-p5-u96-acc3.c | 210 - .../f32-raddstoreexpminusmax-hvx-rr2-p5-u96.c | 206 - ...reexpminusmax-neon-rr2-lut64-p2-u12-acc2.c | 243 - ...reexpminusmax-neon-rr2-lut64-p2-u12-acc3.c | 245 - ...ddstoreexpminusmax-neon-rr2-lut64-p2-u12.c | 241 - ...ddstoreexpminusmax-neon-rr2-lut64-p2-u16.c | 263 - 
...reexpminusmax-neon-rr2-lut64-p2-u20-acc2.c | 287 - ...reexpminusmax-neon-rr2-lut64-p2-u20-acc5.c | 293 - ...ddstoreexpminusmax-neon-rr2-lut64-p2-u20.c | 285 - ...addstoreexpminusmax-neon-rr2-lut64-p2-u8.c | 219 - ...addstoreexpminusmax-neon-rr2-p5-u12-acc2.c | 214 - ...addstoreexpminusmax-neon-rr2-p5-u12-acc3.c | 216 - ...f32-raddstoreexpminusmax-neon-rr2-p5-u12.c | 212 - ...f32-raddstoreexpminusmax-neon-rr2-p5-u16.c | 228 - ...addstoreexpminusmax-neon-rr2-p5-u20-acc2.c | 246 - ...addstoreexpminusmax-neon-rr2-p5-u20-acc5.c | 252 - ...f32-raddstoreexpminusmax-neon-rr2-p5-u20.c | 244 - .../f32-raddstoreexpminusmax-neon-rr2-p5-u8.c | 196 - ...xpminusmax-neonfma-rr1-lut64-p2-u12-acc2.c | 235 - ...xpminusmax-neonfma-rr1-lut64-p2-u12-acc3.c | 237 - ...toreexpminusmax-neonfma-rr1-lut64-p2-u12.c | 233 - ...toreexpminusmax-neonfma-rr1-lut64-p2-u16.c | 254 - ...xpminusmax-neonfma-rr1-lut64-p2-u20-acc2.c | 277 - ...xpminusmax-neonfma-rr1-lut64-p2-u20-acc5.c | 283 - ...toreexpminusmax-neonfma-rr1-lut64-p2-u20.c | 275 - ...storeexpminusmax-neonfma-rr1-lut64-p2-u8.c | 212 - ...storeexpminusmax-neonfma-rr1-p5-u12-acc2.c | 206 - ...storeexpminusmax-neonfma-rr1-p5-u12-acc3.c | 208 - ...-raddstoreexpminusmax-neonfma-rr1-p5-u12.c | 204 - ...-raddstoreexpminusmax-neonfma-rr1-p5-u16.c | 219 - ...storeexpminusmax-neonfma-rr1-p5-u20-acc2.c | 236 - ...storeexpminusmax-neonfma-rr1-p5-u20-acc5.c | 242 - ...-raddstoreexpminusmax-neonfma-rr1-p5-u20.c | 234 - ...2-raddstoreexpminusmax-neonfma-rr1-p5-u8.c | 189 - ...dstoreexpminusmax-scalar-rr2-lut64-p2-u2.c | 198 - ...dstoreexpminusmax-scalar-rr2-lut64-p2-u4.c | 232 - ...32-raddstoreexpminusmax-scalar-rr2-p5-u2.c | 176 - ...32-raddstoreexpminusmax-scalar-rr2-p5-u4.c | 212 - ...addstoreexpminusmax-sse2-rr2-p5-u12-acc2.c | 257 - ...addstoreexpminusmax-sse2-rr2-p5-u12-acc3.c | 259 - ...f32-raddstoreexpminusmax-sse2-rr2-p5-u12.c | 254 - ...f32-raddstoreexpminusmax-sse2-rr2-p5-u16.c | 270 - ...addstoreexpminusmax-sse2-rr2-p5-u20-acc2.c | 289 - 
...addstoreexpminusmax-sse2-rr2-p5-u20-acc5.c | 295 - ...f32-raddstoreexpminusmax-sse2-rr2-p5-u20.c | 286 - .../f32-raddstoreexpminusmax-sse2-rr2-p5-u8.c | 238 - ...minusmax-wasmrelaxedsimd-rr2-p5-u12-acc2.c | 203 - ...minusmax-wasmrelaxedsimd-rr2-p5-u12-acc3.c | 205 - ...reexpminusmax-wasmrelaxedsimd-rr2-p5-u12.c | 200 - ...reexpminusmax-wasmrelaxedsimd-rr2-p5-u16.c | 216 - ...minusmax-wasmrelaxedsimd-rr2-p5-u20-acc2.c | 235 - ...minusmax-wasmrelaxedsimd-rr2-p5-u20-acc5.c | 241 - ...reexpminusmax-wasmrelaxedsimd-rr2-p5-u20.c | 232 - ...oreexpminusmax-wasmrelaxedsimd-rr2-p5-u8.c | 184 - ...toreexpminusmax-wasmsimd-rr2-p5-u12-acc2.c | 203 - ...toreexpminusmax-wasmsimd-rr2-p5-u12-acc3.c | 205 - ...raddstoreexpminusmax-wasmsimd-rr2-p5-u12.c | 200 - ...raddstoreexpminusmax-wasmsimd-rr2-p5-u16.c | 216 - ...toreexpminusmax-wasmsimd-rr2-p5-u20-acc2.c | 235 - ...toreexpminusmax-wasmsimd-rr2-p5-u20-acc5.c | 241 - ...raddstoreexpminusmax-wasmsimd-rr2-p5-u20.c | 232 - ...-raddstoreexpminusmax-wasmsimd-rr2-p5-u8.c | 184 - src/xnnpack/raddstoreexpminusmax.h | 124 +- test/f32-raddstoreexpminusmax.cc | 5329 ++--------------- test/f32-raddstoreexpminusmax.yaml | 127 +- 154 files changed, 1095 insertions(+), 35492 deletions(-) delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc3.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc4.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc2.c delete mode 100644 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc5.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u4.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8.c rename src/f32-raddstoreexpminusmax/gen/{f32-raddstoreexpminusmax-avx2-rr1-p5-u32.c => f32-raddstoreexpminusmax-avx2-rr1-p5-u16-acc2.c} (76%) delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc4.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72-acc3.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u8.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc5.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc3.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc6.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96.c rename src/f32-raddstoreexpminusmax/gen/{f32-raddstoreexpminusmax-avx2-rr2-p5-u32.c => 
f32-raddstoreexpminusmax-avx2-rr2-p5-u16-acc2.c} (81%) delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc4.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72-acc3.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72.c rename src/f32-raddstoreexpminusmax/gen/{f32-raddstoreexpminusmax-avx2-rr2-p5-u64.c => f32-raddstoreexpminusmax-avx2-rr2-p5-u8.c} (56%) delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc5.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc3.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc6.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128-acc4.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u144-acc3.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u144.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u16.c delete mode 100644 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160-acc5.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc3.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc6.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192.c rename src/f32-raddstoreexpminusmax/gen/{f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64.c => f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u32-acc2.c} (76%) delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128-acc3.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u64.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96-acc3.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12-acc3.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u16.c delete mode 100644 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20-acc5.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u8.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12-acc3.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u16.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20-acc5.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u8.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12-acc3.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u16.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20-acc5.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20.c delete mode 100644 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u8.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12-acc3.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u16.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20-acc5.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u8.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u4.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u4.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc3.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc5.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20.c delete mode 100644 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12-acc3.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u16.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20-acc5.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u8.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12-acc3.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u16.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20-acc2.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20-acc5.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20.c delete mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u8.c diff --git a/bench/f32-raddstoreexpminusmax.cc b/bench/f32-raddstoreexpminusmax.cc index 87acc4dbc25..91739e67c94 100644 --- a/bench/f32-raddstoreexpminusmax.cc +++ b/bench/f32-raddstoreexpminusmax.cc @@ -90,13 +90,6 @@ static void 
f32_raddstoreexpminusmax( benchmark::utils::CheckNEON) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_rr2_p5_u8, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u8, - nullptr, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_rr2_p5_u8_acc2, xnn_f32_rmax_ukernel__neon_u16_acc4, xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u8_acc2, @@ -104,34 +97,6 @@ static void f32_raddstoreexpminusmax( benchmark::utils::CheckNEON) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_rr2_p5_u12, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12, - nullptr, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_rr2_p5_u12_acc2, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12_acc2, - nullptr, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_rr2_p5_u12_acc3, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12_acc3, - nullptr, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_rr2_p5_u16, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16, - nullptr, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_rr2_p5_u16_acc2, xnn_f32_rmax_ukernel__neon_u16_acc4, 
xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc2, @@ -146,27 +111,6 @@ static void f32_raddstoreexpminusmax( benchmark::utils::CheckNEON) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_rr2_p5_u20, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20, - nullptr, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_rr2_p5_u20_acc2, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20_acc2, - nullptr, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_rr2_p5_u20_acc5, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20_acc5, - nullptr, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_rr2_lut64_p2_u4, xnn_f32_rmax_ukernel__neon_u16_acc4, @@ -175,13 +119,6 @@ static void f32_raddstoreexpminusmax( benchmark::utils::CheckNEON) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_rr2_lut64_p2_u8, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u8, - nullptr, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_rr2_lut64_p2_u8_acc2, xnn_f32_rmax_ukernel__neon_u16_acc4, xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u8_acc2, @@ -189,34 +126,6 @@ static void f32_raddstoreexpminusmax( benchmark::utils::CheckNEON) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, 
neon_rr2_lut64_p2_u12, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12, - nullptr, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_rr2_lut64_p2_u12_acc2, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12_acc2, - nullptr, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_rr2_lut64_p2_u12_acc3, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12_acc3, - nullptr, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_rr2_lut64_p2_u16, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16, - nullptr, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_rr2_lut64_p2_u16_acc2, xnn_f32_rmax_ukernel__neon_u16_acc4, xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc2, @@ -231,27 +140,6 @@ static void f32_raddstoreexpminusmax( benchmark::utils::CheckNEON) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_rr2_lut64_p2_u20, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20, - nullptr, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_rr2_lut64_p2_u20_acc2, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20_acc2, - nullptr, - benchmark::utils::CheckNEON) - 
->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neon_rr2_lut64_p2_u20_acc5, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20_acc5, - nullptr, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_p5_u4, xnn_f32_rmax_ukernel__neon_u16_acc4, @@ -260,13 +148,6 @@ static void f32_raddstoreexpminusmax( benchmark::utils::CheckNEONFMA) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_p5_u8, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u8, - nullptr, - benchmark::utils::CheckNEONFMA) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_p5_u8_acc2, xnn_f32_rmax_ukernel__neon_u16_acc4, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u8_acc2, @@ -274,34 +155,6 @@ static void f32_raddstoreexpminusmax( benchmark::utils::CheckNEONFMA) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_p5_u12, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12, - nullptr, - benchmark::utils::CheckNEONFMA) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_p5_u12_acc2, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12_acc2, - nullptr, - benchmark::utils::CheckNEONFMA) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_p5_u12_acc3, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12_acc3, 
- nullptr, - benchmark::utils::CheckNEONFMA) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_p5_u16, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16, - nullptr, - benchmark::utils::CheckNEONFMA) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_p5_u16_acc2, xnn_f32_rmax_ukernel__neon_u16_acc4, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc2, @@ -316,27 +169,7 @@ static void f32_raddstoreexpminusmax( benchmark::utils::CheckNEONFMA) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_p5_u20, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20, - nullptr, - benchmark::utils::CheckNEONFMA) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_p5_u20_acc2, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20_acc2, - nullptr, - benchmark::utils::CheckNEONFMA) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_p5_u20_acc5, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20_acc5, - nullptr, - benchmark::utils::CheckNEONFMA) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_lut64_p2_u4, xnn_f32_rmax_ukernel__neon_u16_acc4, @@ -345,13 +178,6 @@ static void f32_raddstoreexpminusmax( benchmark::utils::CheckNEONFMA) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_lut64_p2_u8, - xnn_f32_rmax_ukernel__neon_u16_acc4, - 
xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u8, - nullptr, - benchmark::utils::CheckNEONFMA) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_lut64_p2_u8_acc2, xnn_f32_rmax_ukernel__neon_u16_acc4, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u8_acc2, @@ -359,34 +185,6 @@ static void f32_raddstoreexpminusmax( benchmark::utils::CheckNEONFMA) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_lut64_p2_u12, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12, - nullptr, - benchmark::utils::CheckNEONFMA) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_lut64_p2_u12_acc2, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12_acc2, - nullptr, - benchmark::utils::CheckNEONFMA) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_lut64_p2_u12_acc3, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12_acc3, - nullptr, - benchmark::utils::CheckNEONFMA) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_lut64_p2_u16, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16, - nullptr, - benchmark::utils::CheckNEONFMA) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_lut64_p2_u16_acc2, xnn_f32_rmax_ukernel__neon_u16_acc4, xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc2, @@ -401,27 +199,6 @@ static void f32_raddstoreexpminusmax( 
benchmark::utils::CheckNEONFMA) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_lut64_p2_u20, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20, - nullptr, - benchmark::utils::CheckNEONFMA) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_lut64_p2_u20_acc2, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20_acc2, - nullptr, - benchmark::utils::CheckNEONFMA) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, neonfma_rr1_lut64_p2_u20_acc5, - xnn_f32_rmax_ukernel__neon_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20_acc5, - nullptr, - benchmark::utils::CheckNEONFMA) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV @@ -442,119 +219,45 @@ static void f32_raddstoreexpminusmax( #endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV #if XNN_ARCH_X86 || XNN_ARCH_X86_64 - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_u64, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64, - nullptr, - benchmark::utils::CheckAVX512F) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_u64_acc2, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2, - nullptr, - benchmark::utils::CheckAVX512F) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_u64_acc4, - xnn_f32_rmax_ukernel__avx_u32_acc4, - 
xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc4, - nullptr, - benchmark::utils::CheckAVX512F) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_u128, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128, - nullptr, - benchmark::utils::CheckAVX512F) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_u128_acc2, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128_acc2, - nullptr, - benchmark::utils::CheckAVX512F) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_u128_acc4, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128_acc4, - nullptr, - benchmark::utils::CheckAVX512F) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_u144, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u144, - nullptr, - benchmark::utils::CheckAVX512F) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_u144_acc3, + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_u16, xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u144_acc3, + xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u16, nullptr, benchmark::utils::CheckAVX512F) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_u160, + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, 
avx512f_rr1_p5_scalef_u32_acc2, xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160, + xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u32_acc2, nullptr, benchmark::utils::CheckAVX512F) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_u160_acc2, + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_u64_acc2, xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160_acc2, + xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2, nullptr, benchmark::utils::CheckAVX512F) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_u160_acc5, + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_u64_acc4, xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160_acc5, + xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc4, nullptr, benchmark::utils::CheckAVX512F) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_u192, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192, - nullptr, - benchmark::utils::CheckAVX512F) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_u192_acc2, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc2, - nullptr, - benchmark::utils::CheckAVX512F) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_u192_acc3, - xnn_f32_rmax_ukernel__avx_u32_acc4, - 
xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc3, - nullptr, - benchmark::utils::CheckAVX512F) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_u192_acc6, + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_u8, xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc6, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u8, nullptr, - benchmark::utils::CheckAVX512F) + benchmark::utils::CheckAVX2) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_u32, + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_u16_acc2, xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u16_acc2, nullptr, benchmark::utils::CheckAVX2) ->Apply(benchmark::utils::UnaryElementwiseParameters) @@ -574,404 +277,51 @@ static void f32_raddstoreexpminusmax( ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_u64, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_u64_acc2, + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u8, xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc2, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u8, nullptr, benchmark::utils::CheckAVX2) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_u64_acc4, + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u16_acc2, xnn_f32_rmax_ukernel__avx_u32_acc4, - 
xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u16_acc2, nullptr, benchmark::utils::CheckAVX2) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_u72, + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u32_acc2, xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u72, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2, nullptr, benchmark::utils::CheckAVX2) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_u72_acc3, + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u32_acc4, xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u72_acc3, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4, nullptr, benchmark::utils::CheckAVX2) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_u80, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80, - nullptr, - benchmark::utils::CheckAVX2) + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_u4, + xnn_f32_rmax_ukernel__sse_u16_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u4, + nullptr) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_u80_acc2, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80_acc2, - nullptr, - benchmark::utils::CheckAVX2) + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_u8_acc2, + xnn_f32_rmax_ukernel__sse_u16_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8_acc2, + nullptr) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_u80_acc5, - 
xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80_acc5, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_u96, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_u96_acc2, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc2, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_u96_acc3, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc3, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_u96_acc6, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc6, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - - - - - - - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u32, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u32_acc2, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u32_acc4, - 
xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u64, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u64_acc2, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc2, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u64_acc4, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc4, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u72, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u72_acc3, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72_acc3, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u80, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u80_acc2, - 
xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc2, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u80_acc5, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc5, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u96, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u96_acc2, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc2, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u96_acc3, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc3, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr2_p5_u96_acc6, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc6, - nullptr, - benchmark::utils::CheckAVX2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - - - - - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u4, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u4, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u8, - xnn_f32_rmax_ukernel__avx_u32_acc4, - 
xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u8_acc2, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8_acc2, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u12, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u12_acc2, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc2, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u12_acc3, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc3, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u16, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u16_acc2, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc2, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u16_acc4, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc4, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u20, - 
xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u20_acc2, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc2, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_avx_rr2_p5_u20_acc5, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc5, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u4, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u4, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u8, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u8_acc2, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8_acc2, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u12, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u12_acc2, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc2, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u12_acc3, - 
xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc3, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u16, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u16_acc2, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc2, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u16_acc4, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc4, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u20, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u20_acc2, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc2, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx_rr2_p5_u20_acc5, - xnn_f32_rmax_ukernel__avx_u32_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc5, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_u4, - xnn_f32_rmax_ukernel__sse_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u4, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_u8, - 
xnn_f32_rmax_ukernel__sse_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_u8_acc2, - xnn_f32_rmax_ukernel__sse_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8_acc2, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_u12, - xnn_f32_rmax_ukernel__sse_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_u12_acc2, - xnn_f32_rmax_ukernel__sse_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc2, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_u12_acc3, - xnn_f32_rmax_ukernel__sse_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc3, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_u16, - xnn_f32_rmax_ukernel__sse_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_u16_acc2, - xnn_f32_rmax_ukernel__sse_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc2, - nullptr) + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_u16_acc2, + xnn_f32_rmax_ukernel__sse_u16_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc2, + nullptr) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_u16_acc4, @@ -980,24 +330,6 @@ static void f32_raddstoreexpminusmax( nullptr) 
->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_u20, - xnn_f32_rmax_ukernel__sse_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_u20_acc2, - xnn_f32_rmax_ukernel__sse_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc2, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_u20_acc5, - xnn_f32_rmax_ukernel__sse_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc5, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ARCH_WASMRELAXEDSIMD @@ -1007,42 +339,12 @@ static void f32_raddstoreexpminusmax( nullptr) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmrelaxedsimd_rr2_p5_u8, - xnn_f32_rmax_ukernel__wasmsimd_pminmax_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u8, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmrelaxedsimd_rr2_p5_u8_acc2, xnn_f32_rmax_ukernel__wasmsimd_pminmax_u16_acc4, xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u8_acc2, nullptr) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmrelaxedsimd_rr2_p5_u12, - xnn_f32_rmax_ukernel__wasmsimd_pminmax_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmrelaxedsimd_rr2_p5_u12_acc2, - xnn_f32_rmax_ukernel__wasmsimd_pminmax_u16_acc4, - 
xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12_acc2, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmrelaxedsimd_rr2_p5_u12_acc3, - xnn_f32_rmax_ukernel__wasmsimd_pminmax_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12_acc3, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmrelaxedsimd_rr2_p5_u16, - xnn_f32_rmax_ukernel__wasmsimd_pminmax_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmrelaxedsimd_rr2_p5_u16_acc2, xnn_f32_rmax_ukernel__wasmsimd_pminmax_u16_acc4, xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc2, @@ -1055,24 +357,6 @@ static void f32_raddstoreexpminusmax( nullptr) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmrelaxedsimd_rr2_p5_u20, - xnn_f32_rmax_ukernel__wasmsimd_pminmax_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmrelaxedsimd_rr2_p5_u20_acc2, - xnn_f32_rmax_ukernel__wasmsimd_pminmax_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20_acc2, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmrelaxedsimd_rr2_p5_u20_acc5, - xnn_f32_rmax_ukernel__wasmsimd_pminmax_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20_acc5, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); #endif // XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_WASMSIMD || 
XNN_ARCH_WASMRELAXEDSIMD @@ -1082,42 +366,12 @@ static void f32_raddstoreexpminusmax( nullptr) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_u8, - xnn_f32_rmax_ukernel__wasmsimd_pminmax_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_u8_acc2, xnn_f32_rmax_ukernel__wasmsimd_pminmax_u16_acc4, xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8_acc2, nullptr) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_u12, - xnn_f32_rmax_ukernel__wasmsimd_pminmax_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_u12_acc2, - xnn_f32_rmax_ukernel__wasmsimd_pminmax_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12_acc2, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_u12_acc3, - xnn_f32_rmax_ukernel__wasmsimd_pminmax_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12_acc3, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_u16, - xnn_f32_rmax_ukernel__wasmsimd_pminmax_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_u16_acc2, xnn_f32_rmax_ukernel__wasmsimd_pminmax_u16_acc4, xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc2, @@ -1130,24 +384,6 @@ static void 
f32_raddstoreexpminusmax( nullptr) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_u20, - xnn_f32_rmax_ukernel__wasmsimd_pminmax_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_u20_acc2, - xnn_f32_rmax_ukernel__wasmsimd_pminmax_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20_acc2, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_u20_acc5, - xnn_f32_rmax_ukernel__wasmsimd_pminmax_u16_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20_acc5, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_lut64_p2_u1, @@ -1156,24 +392,12 @@ BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_lut64_p2_u1, nullptr) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); -BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_lut64_p2_u2, - xnn_f32_rmax_ukernel__scalar_u4_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u2, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_lut64_p2_u2_acc2, xnn_f32_rmax_ukernel__scalar_u4_acc4, xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u2_acc2, nullptr) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); -BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_lut64_p2_u4, - xnn_f32_rmax_ukernel__scalar_u4_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u4, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); 
BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_lut64_p2_u4_acc2, xnn_f32_rmax_ukernel__scalar_u4_acc4, xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u4_acc2, @@ -1193,24 +417,12 @@ BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_p5_u1, nullptr) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); -BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_p5_u2, - xnn_f32_rmax_ukernel__scalar_u4_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u2, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_p5_u2_acc2, xnn_f32_rmax_ukernel__scalar_u4_acc4, xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u2_acc2, nullptr) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); -BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_p5_u4, - xnn_f32_rmax_ukernel__scalar_u4_acc4, - xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u4, - nullptr) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_p5_u4_acc2, xnn_f32_rmax_ukernel__scalar_u4_acc4, xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u4_acc2, diff --git a/bench/f32-softmax.cc b/bench/f32-softmax.cc index 6f9752bdc6c..e57f2a98f23 100644 --- a/bench/f32-softmax.cc +++ b/bench/f32-softmax.cc @@ -432,7 +432,7 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { BENCHMARK_CAPTURE(ThreePassSoftMaxWithReloading, avx2_p5, xnn_f32_rmax_ukernel__avx_u32_acc4, (xnn_init_f32_default_params_fn) nullptr, - xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc2, + xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc2, nullptr, xnn_f32_vmulc_ukernel__avx_u16, benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseManualTime(); @@ -450,7 +450,7 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { 
BENCHMARK_CAPTURE(ThreePassSoftMaxWithReloading, avx512f_p5_scalef, xnn_f32_rmax_ukernel__avx512f_u64_acc4, (xnn_init_f32_default_params_fn) nullptr, - xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128_acc2, + xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2, nullptr, xnn_f32_vmulc_ukernel__avx512f_u32, benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime(); diff --git a/cmake/gen/avx2_microkernels.cmake b/cmake/gen/avx2_microkernels.cmake index d7906685e70..bb00d81de14 100644 --- a/cmake/gen/avx2_microkernels.cmake +++ b/cmake/gen/avx2_microkernels.cmake @@ -16,7 +16,7 @@ SET(PROD_AVX2_MICROKERNEL_SRCS src/f16-f32acc-igemm/gen/f16-f32acc-igemm-4x16-minmax-avx2-broadcast.c src/f16-pavgpool/f16-pavgpool-9p8x-minmax-avx2-c8.c src/f16-pavgpool/f16-pavgpool-9x-minmax-avx2-c8.c - src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u40.c + src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u32.c src/f16-velu/gen/f16-velu-avx2-rr1-p3-u16.c src/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-rcp-u32.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x16-minmax-avx2-broadcast.c @@ -120,9 +120,9 @@ SET(NON_PROD_AVX2_MICROKERNEL_SRCS src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u16.c src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u32-acc2.c src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u32-acc4.c - src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u32.c src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u40-acc2.c src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u40-acc5.c + src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u40.c src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u48-acc2.c src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u48-acc3.c 
src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u48.c @@ -234,35 +234,13 @@ SET(NON_PROD_AVX2_MICROKERNEL_SRCS src/f32-raddextexp/gen/f32-raddextexp-avx2-p5-u96-acc3.c src/f32-raddextexp/gen/f32-raddextexp-avx2-p5-u96-acc6.c src/f32-raddextexp/gen/f32-raddextexp-avx2-p5-u96.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u8.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u16-acc2.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc2.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc4.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc4.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72-acc3.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc5.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc3.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc6.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u8.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u16-acc2.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc4.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32.c - 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc4.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72-acc3.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc5.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc3.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc6.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96.c src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-u8.c src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-u16.c src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-u24.c diff --git a/cmake/gen/avx512f_microkernels.cmake b/cmake/gen/avx512f_microkernels.cmake index 41598bdc2aa..1eafbfcff4b 100644 --- a/cmake/gen/avx512f_microkernels.cmake +++ b/cmake/gen/avx512f_microkernels.cmake @@ -128,21 +128,10 @@ SET(NON_PROD_AVX512F_MICROKERNEL_SRCS src/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-u192-acc3.c src/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-u192-acc6.c src/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-u192.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u16.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u32-acc2.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64-acc2.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64-acc4.c - 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128-acc4.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u144-acc3.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u144.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160-acc5.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc3.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc6.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c16.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c32.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c128.c diff --git a/cmake/gen/avx_microkernels.cmake b/cmake/gen/avx_microkernels.cmake index 0ec12965ee1..eb28687e6c5 100644 --- a/cmake/gen/avx_microkernels.cmake +++ b/cmake/gen/avx_microkernels.cmake @@ -184,18 +184,6 @@ SET(NON_PROD_AVX_MICROKERNEL_SRCS src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u8.c src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u16.c src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u24.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u4.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8.c - 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc3.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc4.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc5.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c16.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c64.c src/f32-rminmax/gen/f32-rmax-avx-u8.c diff --git a/cmake/gen/hvx_microkernels.cmake b/cmake/gen/hvx_microkernels.cmake index 4724ce14b7c..34b5aad873c 100644 --- a/cmake/gen/hvx_microkernels.cmake +++ b/cmake/gen/hvx_microkernels.cmake @@ -35,14 +35,8 @@ SET(NON_PROD_HVX_MICROKERNEL_SRCS src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u256.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u32.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u64-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u64.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96-acc3.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128-acc3.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128-acc4.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128.c src/f32-rsum/gen/f32-rsum-hvx-u32.c 
src/f32-rsum/gen/f32-rsum-hvx-u64-acc2.c src/f32-rsum/gen/f32-rsum-hvx-u96-acc3.c diff --git a/cmake/gen/neon_microkernels.cmake b/cmake/gen/neon_microkernels.cmake index 7f0597c722e..c7397cb6178 100644 --- a/cmake/gen/neon_microkernels.cmake +++ b/cmake/gen/neon_microkernels.cmake @@ -48,7 +48,6 @@ SET(PROD_NEON_MICROKERNEL_SRCS src/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x8-minmax-neon-lane-ld64.c src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u32.c src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u32.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u8.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-neon-c16.c src/f32-rminmax/gen/f32-rmax-neon-u16-acc4.c src/f32-rminmax/gen/f32-rminmax-neon-u16-acc4.c @@ -364,27 +363,12 @@ SET(NON_PROD_NEON_MICROKERNEL_SRCS src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u24.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u4.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u8-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12-acc3.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u16-acc2.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u16-acc4.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u16.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20-acc5.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u4.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u8-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u8.c - 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12-acc3.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u16-acc2.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u16-acc4.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u16.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20-acc5.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-neon-c32.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-neon-c64.c src/f32-rminmax/gen/f32-rmax-neon-u4.c diff --git a/cmake/gen/neonfma_microkernels.cmake b/cmake/gen/neonfma_microkernels.cmake index be1d5948a53..8a445a6ccb9 100644 --- a/cmake/gen/neonfma_microkernels.cmake +++ b/cmake/gen/neonfma_microkernels.cmake @@ -23,7 +23,7 @@ SET(PROD_NEONFMA_MICROKERNEL_SRCS src/f32-igemm/gen/f32-igemm-1x8s4-minmax-neonfma.c src/f32-igemm/gen/f32-igemm-4x8s4-minmax-neonfma.c src/f32-igemm/gen/f32-igemm-6x8s4-minmax-neonfma.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u16.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u16-acc2.c src/f32-spmm/gen/f32-spmm-32x1-minmax-neonfma-pipelined.c src/f32-velu/gen/f32-velu-neonfma-rr1-lut16-p3-u16.c src/f32-velu/gen/f32-velu-neonfma-rr1-p6-u8.c @@ -107,27 +107,11 @@ SET(NON_PROD_NEONFMA_MICROKERNEL_SRCS src/f32-qc8w-gemm/gen/f32-qc8w-gemm-6x8s4-minmax-neonfma.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u4.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u8-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u8.c 
- src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12-acc3.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u16-acc2.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u16-acc4.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20-acc5.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u4.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u8-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u8.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12-acc3.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u16-acc2.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u16-acc4.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u16.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20-acc5.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20.c src/f32-spmm/gen/f32-spmm-4x1-minmax-neonfma-pipelined.c src/f32-spmm/gen/f32-spmm-4x1-minmax-neonfma-x2.c src/f32-spmm/gen/f32-spmm-4x1-minmax-neonfma.c diff --git a/cmake/gen/neonfp16arith_microkernels.cmake b/cmake/gen/neonfp16arith_microkernels.cmake index 
d9d1b3bd9e9..4cc50373bae 100644 --- a/cmake/gen/neonfp16arith_microkernels.cmake +++ b/cmake/gen/neonfp16arith_microkernels.cmake @@ -43,7 +43,6 @@ SET(PROD_NEONFP16ARITH_MICROKERNEL_SRCS src/f16-prelu/gen/f16-prelu-neonfp16arith-2x16.c src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u32.c src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32.c - src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40.c src/f16-rminmax/gen/f16-rmax-neonfp16arith-u32-acc4.c src/f16-rminmax/gen/f16-rminmax-neonfp16arith-u32-acc4.c src/f16-spmm/gen/f16-spmm-32x1-minmax-neonfp16arith-pipelined.c @@ -204,6 +203,7 @@ SET(NON_PROD_NEONFP16ARITH_MICROKERNEL_SRCS src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32-acc4.c src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40-acc2.c src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40-acc5.c + src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40.c src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48-acc2.c src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48-acc3.c src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48.c diff --git a/cmake/gen/scalar_microkernels.cmake b/cmake/gen/scalar_microkernels.cmake index 18f41fa2e99..d30bceb4257 100644 --- a/cmake/gen/scalar_microkernels.cmake +++ b/cmake/gen/scalar_microkernels.cmake @@ -422,15 +422,11 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u3.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u1.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u2-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u2.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u4-acc2.c 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u4-acc4.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u4.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u1.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u2-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u2.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u4-acc4.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u4.c src/f32-rminmax/gen/f32-rmax-scalar-u1.c src/f32-rminmax/gen/f32-rmax-scalar-u2-acc2.c src/f32-rminmax/gen/f32-rmax-scalar-u3-acc3.c diff --git a/cmake/gen/sse2_microkernels.cmake b/cmake/gen/sse2_microkernels.cmake index 9df3a49a88d..5c26ace15ae 100644 --- a/cmake/gen/sse2_microkernels.cmake +++ b/cmake/gen/sse2_microkernels.cmake @@ -20,7 +20,7 @@ SET(PROD_SSE2_MICROKERNEL_SRCS src/f32-prelu/gen/f32-prelu-sse2-2x8.c src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u32.c src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u32.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc2.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc2.c src/f32-vcopysign/gen/f32-vcopysign-sse2.c src/f32-vcopysign/gen/f32-vcopysignc-sse2.c src/f32-vcopysign/gen/f32-vrcopysignc-sse2.c @@ -127,15 +127,7 @@ SET(NON_PROD_SSE2_MICROKERNEL_SRCS src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u24.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u4.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc3.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12.c - 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc2.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc4.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc5.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20.c src/f32-vbinary/gen/f32-vprelu-sse2-u4.c src/f32-vbinary/gen/f32-vprelu-sse2-u8.c src/f32-vbinary/gen/f32-vpreluc-sse2-u4.c diff --git a/cmake/gen/wasmrelaxedsimd_microkernels.cmake b/cmake/gen/wasmrelaxedsimd_microkernels.cmake index 65bd461e16a..40affb6956a 100644 --- a/cmake/gen/wasmrelaxedsimd_microkernels.cmake +++ b/cmake/gen/wasmrelaxedsimd_microkernels.cmake @@ -353,15 +353,7 @@ SET(NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS src/f32-qc8w-gemm/gen/f32-qc8w-gemm-6x8s4-wasmrelaxedsimd-fma.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u4.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u8-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u8.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12-acc3.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u16-acc4.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u16.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20-acc5.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20.c src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmrelaxedsimd-arm-pipelined-x2.c 
src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmrelaxedsimd-arm-pipelined.c src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmrelaxedsimd-arm-x2.c diff --git a/cmake/gen/wasmsimd_microkernels.cmake b/cmake/gen/wasmsimd_microkernels.cmake index d4bff4df747..378197f3942 100644 --- a/cmake/gen/wasmsimd_microkernels.cmake +++ b/cmake/gen/wasmsimd_microkernels.cmake @@ -650,15 +650,7 @@ SET(NON_PROD_WASMSIMD_MICROKERNEL_SRCS src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u24.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u4.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u8-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u8.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12-acc3.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u16-acc4.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u16.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20-acc2.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20-acc5.c - src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-wasmsimd-c32.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-wasmsimd-c64.c src/f32-rminmax/gen/f32-rmax-wasmsimd-minmax-u4.c diff --git a/gen/avx2_microkernels.bzl b/gen/avx2_microkernels.bzl index 4aa456a5565..798ba06a6ed 100644 --- a/gen/avx2_microkernels.bzl +++ b/gen/avx2_microkernels.bzl @@ -12,7 +12,7 @@ PROD_AVX2_MICROKERNEL_SRCS = [ "src/f16-f32acc-igemm/gen/f16-f32acc-igemm-4x16-minmax-avx2-broadcast.c", "src/f16-pavgpool/f16-pavgpool-9p8x-minmax-avx2-c8.c", "src/f16-pavgpool/f16-pavgpool-9x-minmax-avx2-c8.c", - 
"src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u40.c", + "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u32.c", "src/f16-velu/gen/f16-velu-avx2-rr1-p3-u16.c", "src/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-rcp-u32.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x16-minmax-avx2-broadcast.c", @@ -117,9 +117,9 @@ NON_PROD_AVX2_MICROKERNEL_SRCS = [ "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u16.c", "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u32-acc2.c", "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u32-acc4.c", - "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u32.c", "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u40-acc2.c", "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u40-acc5.c", + "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u40.c", "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u48-acc2.c", "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u48-acc3.c", "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u48.c", @@ -231,35 +231,13 @@ NON_PROD_AVX2_MICROKERNEL_SRCS = [ "src/f32-raddextexp/gen/f32-raddextexp-avx2-p5-u96-acc3.c", "src/f32-raddextexp/gen/f32-raddextexp-avx2-p5-u96-acc6.c", "src/f32-raddextexp/gen/f32-raddextexp-avx2-p5-u96.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u8.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u16-acc2.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc2.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc4.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc2.c", - 
"src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc4.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72-acc3.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc5.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc3.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc6.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u8.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u16-acc2.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc4.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc4.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72-acc3.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc5.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80.c", - 
"src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc3.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc6.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96.c", "src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-u8.c", "src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-u16.c", "src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-u24.c", diff --git a/gen/avx512f_microkernels.bzl b/gen/avx512f_microkernels.bzl index df2192fffb0..13402d30b47 100644 --- a/gen/avx512f_microkernels.bzl +++ b/gen/avx512f_microkernels.bzl @@ -125,21 +125,10 @@ NON_PROD_AVX512F_MICROKERNEL_SRCS = [ "src/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-u192-acc3.c", "src/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-u192-acc6.c", "src/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-u192.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u16.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u32-acc2.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64-acc2.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64-acc4.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128-acc4.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u144-acc3.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u144.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160-acc2.c", - 
"src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160-acc5.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc3.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc6.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c16.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c32.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c128.c", diff --git a/gen/avx_microkernels.bzl b/gen/avx_microkernels.bzl index 67231286ef2..2794ad08fa7 100644 --- a/gen/avx_microkernels.bzl +++ b/gen/avx_microkernels.bzl @@ -181,18 +181,6 @@ NON_PROD_AVX_MICROKERNEL_SRCS = [ "src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u8.c", "src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u16.c", "src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u24.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u4.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc3.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc4.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc2.c", - 
"src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc5.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c16.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c64.c", "src/f32-rminmax/gen/f32-rmax-avx-u8.c", diff --git a/gen/hvx_microkernels.bzl b/gen/hvx_microkernels.bzl index c011b417135..691a12da673 100644 --- a/gen/hvx_microkernels.bzl +++ b/gen/hvx_microkernels.bzl @@ -32,14 +32,8 @@ NON_PROD_HVX_MICROKERNEL_SRCS = [ "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u256.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u32.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u64-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u64.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96-acc3.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128-acc3.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128-acc4.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128.c", "src/f32-rsum/gen/f32-rsum-hvx-u32.c", "src/f32-rsum/gen/f32-rsum-hvx-u64-acc2.c", "src/f32-rsum/gen/f32-rsum-hvx-u96-acc3.c", diff --git a/gen/neon_microkernels.bzl b/gen/neon_microkernels.bzl index ad95a864633..a4dbc44fd82 100644 --- a/gen/neon_microkernels.bzl +++ b/gen/neon_microkernels.bzl @@ -44,7 +44,6 @@ PROD_NEON_MICROKERNEL_SRCS = [ "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x8-minmax-neon-lane-ld64.c", "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u32.c", "src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u32.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u8.c", 
"src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-neon-c16.c", "src/f32-rminmax/gen/f32-rmax-neon-u16-acc4.c", "src/f32-rminmax/gen/f32-rminmax-neon-u16-acc4.c", @@ -361,27 +360,12 @@ NON_PROD_NEON_MICROKERNEL_SRCS = [ "src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u24.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u4.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u8-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12-acc3.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u16-acc2.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u16-acc4.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u16.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20-acc5.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u4.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u8-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u8.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12-acc3.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u16-acc2.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u16-acc4.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u16.c", - 
"src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20-acc5.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-neon-c32.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-neon-c64.c", "src/f32-rminmax/gen/f32-rmax-neon-u4.c", diff --git a/gen/neonfma_microkernels.bzl b/gen/neonfma_microkernels.bzl index 316adcf49f8..bb8807928a0 100644 --- a/gen/neonfma_microkernels.bzl +++ b/gen/neonfma_microkernels.bzl @@ -19,7 +19,7 @@ PROD_NEONFMA_MICROKERNEL_SRCS = [ "src/f32-igemm/gen/f32-igemm-1x8s4-minmax-neonfma.c", "src/f32-igemm/gen/f32-igemm-4x8s4-minmax-neonfma.c", "src/f32-igemm/gen/f32-igemm-6x8s4-minmax-neonfma.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u16.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u16-acc2.c", "src/f32-spmm/gen/f32-spmm-32x1-minmax-neonfma-pipelined.c", "src/f32-velu/gen/f32-velu-neonfma-rr1-lut16-p3-u16.c", "src/f32-velu/gen/f32-velu-neonfma-rr1-p6-u8.c", @@ -104,27 +104,11 @@ NON_PROD_NEONFMA_MICROKERNEL_SRCS = [ "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-6x8s4-minmax-neonfma.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u4.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u8-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u8.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12-acc3.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u16-acc2.c", 
"src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u16-acc4.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20-acc5.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u4.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u8-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u8.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12-acc3.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u16-acc2.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u16-acc4.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u16.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20-acc5.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20.c", "src/f32-spmm/gen/f32-spmm-4x1-minmax-neonfma-pipelined.c", "src/f32-spmm/gen/f32-spmm-4x1-minmax-neonfma-x2.c", "src/f32-spmm/gen/f32-spmm-4x1-minmax-neonfma.c", diff --git a/gen/neonfp16arith_microkernels.bzl b/gen/neonfp16arith_microkernels.bzl index e2d2fbb766b..7e0c07c3ca3 100644 --- a/gen/neonfp16arith_microkernels.bzl +++ b/gen/neonfp16arith_microkernels.bzl @@ -39,7 +39,6 @@ PROD_NEONFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-prelu/gen/f16-prelu-neonfp16arith-2x16.c", "src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u32.c", 
"src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32.c", - "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40.c", "src/f16-rminmax/gen/f16-rmax-neonfp16arith-u32-acc4.c", "src/f16-rminmax/gen/f16-rminmax-neonfp16arith-u32-acc4.c", "src/f16-spmm/gen/f16-spmm-32x1-minmax-neonfp16arith-pipelined.c", @@ -201,6 +200,7 @@ NON_PROD_NEONFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32-acc4.c", "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40-acc2.c", "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40-acc5.c", + "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u40.c", "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48-acc2.c", "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48-acc3.c", "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u48.c", diff --git a/gen/scalar_microkernels.bzl b/gen/scalar_microkernels.bzl index d0858ca1f9d..7ef2fc7f766 100644 --- a/gen/scalar_microkernels.bzl +++ b/gen/scalar_microkernels.bzl @@ -419,15 +419,11 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u3.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u1.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u2-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u2.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u4-acc2.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u4-acc4.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u4.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u1.c", 
"src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u2-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u2.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u4-acc4.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u4.c", "src/f32-rminmax/gen/f32-rmax-scalar-u1.c", "src/f32-rminmax/gen/f32-rmax-scalar-u2-acc2.c", "src/f32-rminmax/gen/f32-rmax-scalar-u3-acc3.c", diff --git a/gen/sse2_microkernels.bzl b/gen/sse2_microkernels.bzl index a891baf6a16..844f5cc018f 100644 --- a/gen/sse2_microkernels.bzl +++ b/gen/sse2_microkernels.bzl @@ -16,7 +16,7 @@ PROD_SSE2_MICROKERNEL_SRCS = [ "src/f32-prelu/gen/f32-prelu-sse2-2x8.c", "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u32.c", "src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u32.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc2.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc2.c", "src/f32-vcopysign/gen/f32-vcopysign-sse2.c", "src/f32-vcopysign/gen/f32-vcopysignc-sse2.c", "src/f32-vcopysign/gen/f32-vrcopysignc-sse2.c", @@ -124,15 +124,7 @@ NON_PROD_SSE2_MICROKERNEL_SRCS = [ "src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u24.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u4.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc3.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc2.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc4.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16.c", - 
"src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc5.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20.c", "src/f32-vbinary/gen/f32-vprelu-sse2-u4.c", "src/f32-vbinary/gen/f32-vprelu-sse2-u8.c", "src/f32-vbinary/gen/f32-vpreluc-sse2-u4.c", diff --git a/gen/wasmrelaxedsimd_microkernels.bzl b/gen/wasmrelaxedsimd_microkernels.bzl index f279040bb5e..550161f83cd 100644 --- a/gen/wasmrelaxedsimd_microkernels.bzl +++ b/gen/wasmrelaxedsimd_microkernels.bzl @@ -350,15 +350,7 @@ NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS = [ "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-6x8s4-wasmrelaxedsimd-fma.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u4.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u8-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u8.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12-acc3.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u16-acc4.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u16.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20-acc5.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20.c", "src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmrelaxedsimd-arm-pipelined-x2.c", "src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmrelaxedsimd-arm-pipelined.c", "src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmrelaxedsimd-arm-x2.c", diff --git a/gen/wasmsimd_microkernels.bzl b/gen/wasmsimd_microkernels.bzl index 4efad9dce7f..55f5a48184b 
100644 --- a/gen/wasmsimd_microkernels.bzl +++ b/gen/wasmsimd_microkernels.bzl @@ -647,15 +647,7 @@ NON_PROD_WASMSIMD_MICROKERNEL_SRCS = [ "src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u24.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u4.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u8-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u8.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12-acc3.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u16-acc4.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u16.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20-acc2.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20-acc5.c", - "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-wasmsimd-c32.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-wasmsimd-c64.c", "src/f32-rminmax/gen/f32-rmax-wasmsimd-minmax-u4.c", diff --git a/scripts/generate-f32-raddstoreexpminusmax.sh b/scripts/generate-f32-raddstoreexpminusmax.sh index b467de77ecb..f4d7bfe7cfb 100755 --- a/scripts/generate-f32-raddstoreexpminusmax.sh +++ b/scripts/generate-f32-raddstoreexpminusmax.sh @@ -6,56 +6,24 @@ ################################### ARM NEON ################################## tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=4 -D ACCUMULATORS=1 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u4.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=1 -D FMA=0 -o 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u8.c & tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=2 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u8-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=1 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=2 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=3 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12-acc3.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=1 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u16.c & tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=2 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u16-acc2.c & tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=4 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u16-acc4.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=1 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=2 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=5 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20-acc5.c & tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=4 -D 
ACCUMULATORS=1 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u4.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=8 -D ACCUMULATORS=1 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u8.c & tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=8 -D ACCUMULATORS=2 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u8-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=12 -D ACCUMULATORS=1 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=12 -D ACCUMULATORS=2 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=12 -D ACCUMULATORS=3 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12-acc3.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=16 -D ACCUMULATORS=1 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u16.c & tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=16 -D ACCUMULATORS=2 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u16-acc2.c & tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=16 -D ACCUMULATORS=4 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u16-acc4.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=20 -D ACCUMULATORS=1 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=20 -D ACCUMULATORS=2 -D FMA=0 -o 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=20 -D ACCUMULATORS=5 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20-acc5.c & tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=4 -D ACCUMULATORS=1 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u4.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=1 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u8.c & tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=2 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u8-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=1 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=2 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=3 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12-acc3.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=1 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u16.c & tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=2 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u16-acc2.c & tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=4 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u16-acc4.c & -tools/xngen 
src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=1 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=2 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=5 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20-acc5.c & tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=4 -D ACCUMULATORS=1 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u4.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=8 -D ACCUMULATORS=1 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u8.c & tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=8 -D ACCUMULATORS=2 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u8-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=12 -D ACCUMULATORS=1 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=12 -D ACCUMULATORS=2 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=12 -D ACCUMULATORS=3 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12-acc3.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=16 -D ACCUMULATORS=1 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u16.c & tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=16 -D 
ACCUMULATORS=2 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u16-acc2.c & tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=16 -D ACCUMULATORS=4 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u16-acc4.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=20 -D ACCUMULATORS=1 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=20 -D ACCUMULATORS=2 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -D BATCH_TILE=20 -D ACCUMULATORS=5 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20-acc5.c & ################################ RISC-V Vector ################################ tools/xngen src/f32-raddstoreexpminusmax/rvv-rr2-p6.c.in -D LMUL=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-rvv-rr2-p6-u2v.c & @@ -63,119 +31,52 @@ tools/xngen src/f32-raddstoreexpminusmax/rvv-rr2-p6.c.in -D LMUL=4 -o src/f32-ra ################################### x86 SSE2 ################################## tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=4 -D ACCUMULATORS=1 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u4.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=1 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8.c & tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=2 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=1 -D AVX=0 -o 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=2 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=3 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc3.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=1 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16.c & tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=2 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc2.c & tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=4 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc4.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=1 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=2 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=5 -D AVX=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc5.c & ################################### x86 AVX2 ################################## -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=32 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=32 -D ACCUMULATORS=2 -o 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=32 -D ACCUMULATORS=4 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc4.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=64 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=64 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=64 -D ACCUMULATORS=4 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc4.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=72 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=72 -D ACCUMULATORS=3 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72-acc3.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=80 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=80 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=80 -D ACCUMULATORS=5 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc5.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=96 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=96 -D ACCUMULATORS=2 -o 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=96 -D ACCUMULATORS=3 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc3.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=96 -D ACCUMULATORS=6 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc6.c & - -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=32 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32.c & +tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u8.c & +tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u16-acc2.c & tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=32 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc2.c & tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=32 -D ACCUMULATORS=4 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc4.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=64 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=64 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=64 -D ACCUMULATORS=4 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc4.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=72 -D ACCUMULATORS=1 -o 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=72 -D ACCUMULATORS=3 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72-acc3.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=80 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=80 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=80 -D ACCUMULATORS=5 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc5.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=96 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=96 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=96 -D ACCUMULATORS=3 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc3.c & -tools/xngen src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -D BATCH_TILE=96 -D ACCUMULATORS=6 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc6.c & + +tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u8.c & +tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u16-acc2.c & +tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=32 -D ACCUMULATORS=2 -o 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc2.c & +tools/xngen src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -D BATCH_TILE=32 -D ACCUMULATORS=4 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc4.c & ################################# x86 AVX512F ################################# -tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=64 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64.c & -tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=64 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=64 -D ACCUMULATORS=4 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64-acc4.c & -tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=128 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128.c & -tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=128 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=128 -D ACCUMULATORS=4 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128-acc4.c & -tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=144 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u144.c & -tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=144 -D ACCUMULATORS=3 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u144-acc3.c & -tools/xngen 
src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=160 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160.c & -tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=160 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=160 -D ACCUMULATORS=5 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160-acc5.c & -tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=192 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192.c & -tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=192 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=192 -D ACCUMULATORS=3 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc3.c & -tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=192 -D ACCUMULATORS=6 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc6.c & +tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=16 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u16.c & +tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=32 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u32-acc2.c & +tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=64 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64-acc2.c & +tools/xngen 
src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=64 -D ACCUMULATORS=4 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64-acc4.c & ################################## WAsm SIMD ################################## tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=4 -D ACCUMULATORS=1 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u4.c & -tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=1 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u8.c & tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=2 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u8-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=1 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12.c & -tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=2 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=3 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12-acc3.c & -tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=1 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u16.c & tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=2 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u16-acc2.c & tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=4 -D FMA=0 -o 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u16-acc4.c & -tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=1 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20.c & -tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=2 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=5 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20-acc5.c & tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=4 -D ACCUMULATORS=1 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u4.c & -tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=1 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u8.c & tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=2 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u8-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=1 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12.c & -tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=2 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=12 -D ACCUMULATORS=3 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12-acc3.c & -tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=1 -D FMA=1 -o 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u16.c & tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=2 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u16-acc2.c & tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=4 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u16-acc4.c & -tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=1 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20.c & -tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=2 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=20 -D ACCUMULATORS=5 -D FMA=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20-acc5.c & ################################## Hexagon HVX ################################## tools/xngen src/f32-raddstoreexpminusmax/hvx-rr2-p5.c.in -D BATCH_TILE=32 -D ACCUMULATORS=1 -D -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u32.c & -tools/xngen src/f32-raddstoreexpminusmax/hvx-rr2-p5.c.in -D BATCH_TILE=64 -D ACCUMULATORS=1 -D -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u64.c & tools/xngen src/f32-raddstoreexpminusmax/hvx-rr2-p5.c.in -D BATCH_TILE=64 -D ACCUMULATORS=2 -D -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u64-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/hvx-rr2-p5.c.in -D BATCH_TILE=96 -D ACCUMULATORS=1 -D -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96.c & -tools/xngen src/f32-raddstoreexpminusmax/hvx-rr2-p5.c.in -D BATCH_TILE=96 -D ACCUMULATORS=2 -D -o 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/hvx-rr2-p5.c.in -D BATCH_TILE=96 -D ACCUMULATORS=3 -D -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96-acc3.c & -tools/xngen src/f32-raddstoreexpminusmax/hvx-rr2-p5.c.in -D BATCH_TILE=128 -D ACCUMULATORS=1 -D -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128.c & tools/xngen src/f32-raddstoreexpminusmax/hvx-rr2-p5.c.in -D BATCH_TILE=128 -D ACCUMULATORS=2 -D -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/hvx-rr2-p5.c.in -D BATCH_TILE=128 -D ACCUMULATORS=3 -D -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128-acc3.c & tools/xngen src/f32-raddstoreexpminusmax/hvx-rr2-p5.c.in -D BATCH_TILE=128 -D ACCUMULATORS=4 -D -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128-acc4.c & ################################### Scalar #################################### tools/xngen src/f32-raddstoreexpminusmax/scalar-rr2-p5.c.in -D BATCH_TILE=1 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u1.c & -tools/xngen src/f32-raddstoreexpminusmax/scalar-rr2-p5.c.in -D BATCH_TILE=2 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u2.c & tools/xngen src/f32-raddstoreexpminusmax/scalar-rr2-p5.c.in -D BATCH_TILE=2 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u2-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/scalar-rr2-p5.c.in -D BATCH_TILE=4 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u4.c & tools/xngen src/f32-raddstoreexpminusmax/scalar-rr2-p5.c.in -D BATCH_TILE=4 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u4-acc2.c & tools/xngen 
src/f32-raddstoreexpminusmax/scalar-rr2-p5.c.in -D BATCH_TILE=4 -D ACCUMULATORS=4 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u4-acc4.c & tools/xngen src/f32-raddstoreexpminusmax/scalar-rr2-lut64-p2.c.in -D BATCH_TILE=1 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u1.c & -tools/xngen src/f32-raddstoreexpminusmax/scalar-rr2-lut64-p2.c.in -D BATCH_TILE=2 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u2.c & tools/xngen src/f32-raddstoreexpminusmax/scalar-rr2-lut64-p2.c.in -D BATCH_TILE=2 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u2-acc2.c & -tools/xngen src/f32-raddstoreexpminusmax/scalar-rr2-lut64-p2.c.in -D BATCH_TILE=4 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u4.c & tools/xngen src/f32-raddstoreexpminusmax/scalar-rr2-lut64-p2.c.in -D BATCH_TILE=4 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u4-acc2.c & tools/xngen src/f32-raddstoreexpminusmax/scalar-rr2-lut64-p2.c.in -D BATCH_TILE=4 -D ACCUMULATORS=4 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u4-acc4.c & diff --git a/src/configs/raddstoreexpminusmax-config.c b/src/configs/raddstoreexpminusmax-config.c index 6fd1c5d4f07..163bb97190d 100644 --- a/src/configs/raddstoreexpminusmax-config.c +++ b/src/configs/raddstoreexpminusmax-config.c @@ -31,15 +31,15 @@ static void init_f16_raddstoreexpminusmax_config(void) { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { - f16_raddstoreexpminusmax_config.ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u40; - f16_raddstoreexpminusmax_config.element_tile = 40; + 
f16_raddstoreexpminusmax_config.ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u32; + f16_raddstoreexpminusmax_config.element_tile = 32; } #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_x86_avx2) { - f16_raddstoreexpminusmax_config.ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_u40; - f16_raddstoreexpminusmax_config.element_tile = 40; + f16_raddstoreexpminusmax_config.ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_u32; + f16_raddstoreexpminusmax_config.element_tile = 32; } #endif } @@ -50,7 +50,7 @@ static void init_f32_raddstoreexpminusmax_config(void) { assert(hardware_config != NULL); if (hardware_config->use_arm_neon) { f32_raddstoreexpminusmax_config.ukernel = - (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u8; + (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc2; f32_raddstoreexpminusmax_config.element_tile = 8; } else if (!XNN_PLATFORM_MOBILE) { f32_raddstoreexpminusmax_config.ukernel = @@ -59,7 +59,7 @@ static void init_f32_raddstoreexpminusmax_config(void) { } #elif XNN_ARCH_ARM64 f32_raddstoreexpminusmax_config.ukernel = - (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16; + (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc2; f32_raddstoreexpminusmax_config.element_tile = 16; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -68,8 +68,8 @@ static void init_f32_raddstoreexpminusmax_config(void) { f32_raddstoreexpminusmax_config.ukernel = 
(xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2; f32_raddstoreexpminusmax_config.element_tile = 32; } else { - f32_raddstoreexpminusmax_config.ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc2; - f32_raddstoreexpminusmax_config.element_tile = 20; + f32_raddstoreexpminusmax_config.ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc2; + f32_raddstoreexpminusmax_config.element_tile = 16; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_WASMRELAXEDSIMD diff --git a/src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in b/src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in index 0bb62d22b12..7d8b862c657 100644 --- a/src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in +++ b/src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in @@ -10,6 +10,7 @@ $SIMD_TILE = BATCH_TILE // 8 #include +#include "xnnpack/intrinsics-polyfill.h" #include "xnnpack/raddstoreexpminusmax.h" diff --git a/src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in b/src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in index 3ba462b9bed..c5d91ee911e 100644 --- a/src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in +++ b/src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in @@ -10,6 +10,7 @@ $SIMD_TILE = BATCH_TILE // 8 #include +#include "xnnpack/intrinsics-polyfill.h" #include "xnnpack/raddstoreexpminusmax.h" @@ -57,7 +58,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u${BATCH_TILE}${"" if ACC $for K in range(ACCUMULATORS): __m256 vacc${K} = _mm256_setzero_ps(); for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { - // Load ${BATCH_TILE} (${SIMD_TILE}x4) inputs at a time. + // Load ${BATCH_TILE} (${SIMD_TILE}x8) inputs at a time. 
const __m256 vi0 = _mm256_loadu_ps(input); $for N in range(1, SIMD_TILE): const __m256 vi${N} = _mm256_loadu_ps(input + ${N * 8}); @@ -116,7 +117,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u${BATCH_TILE}${"" if ACC $for N in range(SIMD_TILE): vf${N} = _mm256_andnot_ps(_mm256_cmp_ps(vx${N}, vdenorm_cutoff, _CMP_LT_OS), vf${N}); - // Store ${BATCH_TILE} (${SIMD_TILE}x4) outputs at a time. + // Store ${BATCH_TILE} (${SIMD_TILE}x8) outputs at a time. _mm256_storeu_ps(output, vf0); $for N in range(1, SIMD_TILE): _mm256_storeu_ps(output + ${N * 8}, vf${N}); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc2.c deleted file mode 100644 index 01e3bc41ea6..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc2.c +++ /dev/null @@ -1,257 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); - const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); - const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); - const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); - const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); - const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); - const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); - const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); - const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); - const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m128 vi_max = _mm_load1_ps(max); - - __m128 vacc0 = _mm_setzero_ps(); - __m128 vacc1 = _mm_setzero_ps(); - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - // Load 12 (3x4) inputs at a time. - const __m128 vi0 = _mm_loadu_ps(input); - const __m128 vi1 = _mm_loadu_ps(input + 4); - const __m128 vi2 = _mm_loadu_ps(input + 8); - input += 12; - - // Subtract maximum input x := i - i_max. This implies x <= 0. 
- const __m128 vx0 = _mm_sub_ps(vi0, vi_max); - const __m128 vx1 = _mm_sub_ps(vi1, vi_max); - const __m128 vx2 = _mm_sub_ps(vi2, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); - __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); - __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); - const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); - const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm_sub_ps(vn0, vmagic_bias); - vn1 = _mm_sub_ps(vn1, vmagic_bias); - vn2 = _mm_sub_ps(vn2, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); - - vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm_mul_ps(vt0, vs0); - vt1 = _mm_mul_ps(vt1, vs1); - vt2 = _mm_mul_ps(vt2, vs2); - - __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); - __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); - __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); - vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); - vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); - - // Store 12 (3x4) outputs at a time. - _mm_storeu_ps(output, vf0); - _mm_storeu_ps(output + 4, vf1); - _mm_storeu_ps(output + 8, vf2); - output += 12; - - // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0); - vacc1 = _mm_add_ps(vacc1, vf1); - vacc0 = _mm_add_ps(vacc0, vf2); - } - // Add up all accumulators to vacc0 - vacc0 = _mm_add_ps(vacc0, vacc1); - - __m128 vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. 
This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - // Store 4 outputs at a time. - _mm_storeu_ps(output, vf); - output += 4; - - // Accumulate computed exponents. - vacc = _mm_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - - // Subtract maximum input x := i - i_max. This implies x <= 0. 
- const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - if (batch & (2 * sizeof(float))) { - // Store 2 outputs at a time. - _mm_storel_pi((__m64*) output, vf); - output += 2; - - // Accumulate 2 computed exponents. - vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); - - vf = _mm_movehl_ps(vf, vf); - } - if (batch & (1 * sizeof(float))) { - // Store 1 output at a time. - _mm_store_ss(output, vf); - - // Accumulate 1 computed exponent. 
- vacc = _mm_add_ss(vacc, vf); - } - } - // Reduce 4 batch in the SIMD register - vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); - vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); - _mm_store_ss(sum, vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc3.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc3.c deleted file mode 100644 index 4d1456328d0..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12-acc3.c +++ /dev/null @@ -1,259 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc3( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); - const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); - const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); - const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); - const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); - const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); - const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); - const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); - const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); - const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - 
XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m128 vi_max = _mm_load1_ps(max); - - __m128 vacc0 = _mm_setzero_ps(); - __m128 vacc1 = _mm_setzero_ps(); - __m128 vacc2 = _mm_setzero_ps(); - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - // Load 12 (3x4) inputs at a time. - const __m128 vi0 = _mm_loadu_ps(input); - const __m128 vi1 = _mm_loadu_ps(input + 4); - const __m128 vi2 = _mm_loadu_ps(input + 8); - input += 12; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0 = _mm_sub_ps(vi0, vi_max); - const __m128 vx1 = _mm_sub_ps(vi1, vi_max); - const __m128 vx2 = _mm_sub_ps(vi2, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); - __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); - __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); - const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); - const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm_sub_ps(vn0, vmagic_bias); - vn1 = _mm_sub_ps(vn1, vmagic_bias); - vn2 = _mm_sub_ps(vn2, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); - - vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm_mul_ps(vt0, vs0); - vt1 = _mm_mul_ps(vt1, vs1); - vt2 = _mm_mul_ps(vt2, vs2); - - __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); - __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); - __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); - vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); - vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); - - // Store 12 (3x4) outputs at a time. 
- _mm_storeu_ps(output, vf0); - _mm_storeu_ps(output + 4, vf1); - _mm_storeu_ps(output + 8, vf2); - output += 12; - - // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0); - vacc1 = _mm_add_ps(vacc1, vf1); - vacc2 = _mm_add_ps(vacc2, vf2); - } - // Add up all accumulators to vacc0 - vacc0 = _mm_add_ps(vacc0, vacc1); - vacc0 = _mm_add_ps(vacc0, vacc2); - - __m128 vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - // Store 4 outputs at a time. - _mm_storeu_ps(output, vf); - output += 4; - - // Accumulate computed exponents. - vacc = _mm_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - if (batch & (2 * sizeof(float))) { - // Store 2 outputs at a time. - _mm_storel_pi((__m64*) output, vf); - output += 2; - - // Accumulate 2 computed exponents. - vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); - - vf = _mm_movehl_ps(vf, vf); - } - if (batch & (1 * sizeof(float))) { - // Store 1 output at a time. - _mm_store_ss(output, vf); - - // Accumulate 1 computed exponent. - vacc = _mm_add_ss(vacc, vf); - } - } - // Reduce 4 batch in the SIMD register - vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); - vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); - _mm_store_ss(sum, vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12.c deleted file mode 100644 index 83b546b8140..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u12.c +++ /dev/null @@ -1,254 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); - const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); - const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); - const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); - const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); - const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); - const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); - const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); - const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); - const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m128 vi_max = _mm_load1_ps(max); - - __m128 vacc0 = _mm_setzero_ps(); - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - // Load 12 (3x4) inputs at a time. - const __m128 vi0 = _mm_loadu_ps(input); - const __m128 vi1 = _mm_loadu_ps(input + 4); - const __m128 vi2 = _mm_loadu_ps(input + 8); - input += 12; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0 = _mm_sub_ps(vi0, vi_max); - const __m128 vx1 = _mm_sub_ps(vi1, vi_max); - const __m128 vx2 = _mm_sub_ps(vi2, vi_max); - - // Compute reduced argument batch := round(x / log(2)). 
- __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); - __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); - __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); - const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); - const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm_sub_ps(vn0, vmagic_bias); - vn1 = _mm_sub_ps(vn1, vmagic_bias); - vn2 = _mm_sub_ps(vn2, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); - - vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm_mul_ps(vt0, vs0); - vt1 = _mm_mul_ps(vt1, vs1); - vt2 = _mm_mul_ps(vt2, vs2); - - __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); - __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); - __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); - vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); - vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); - - // Store 12 (3x4) outputs at a time. - _mm_storeu_ps(output, vf0); - _mm_storeu_ps(output + 4, vf1); - _mm_storeu_ps(output + 8, vf2); - output += 12; - - // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0); - vacc0 = _mm_add_ps(vacc0, vf1); - vacc0 = _mm_add_ps(vacc0, vf2); - } - - __m128 vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. 
- const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - // Store 4 outputs at a time. - _mm_storeu_ps(output, vf); - output += 4; - - // Accumulate computed exponents. - vacc = _mm_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - - // Subtract maximum input x := i - i_max. This implies x <= 0. 
- const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - if (batch & (2 * sizeof(float))) { - // Store 2 outputs at a time. - _mm_storel_pi((__m64*) output, vf); - output += 2; - - // Accumulate 2 computed exponents. - vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); - - vf = _mm_movehl_ps(vf, vf); - } - if (batch & (1 * sizeof(float))) { - // Store 1 output at a time. - _mm_store_ss(output, vf); - - // Accumulate 1 computed exponent. 
- vacc = _mm_add_ss(vacc, vf); - } - } - // Reduce 4 batch in the SIMD register - vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); - vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); - _mm_store_ss(sum, vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc2.c deleted file mode 100644 index 7bb87dc34d5..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc2.c +++ /dev/null @@ -1,273 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); - const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); - const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); - const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); - const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); - const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); - const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); - const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); - const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); - const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - 
XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m128 vi_max = _mm_load1_ps(max); - - __m128 vacc0 = _mm_setzero_ps(); - __m128 vacc1 = _mm_setzero_ps(); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - // Load 16 (4x4) inputs at a time. - const __m128 vi0 = _mm_loadu_ps(input); - const __m128 vi1 = _mm_loadu_ps(input + 4); - const __m128 vi2 = _mm_loadu_ps(input + 8); - const __m128 vi3 = _mm_loadu_ps(input + 12); - input += 16; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0 = _mm_sub_ps(vi0, vi_max); - const __m128 vx1 = _mm_sub_ps(vi1, vi_max); - const __m128 vx2 = _mm_sub_ps(vi2, vi_max); - const __m128 vx3 = _mm_sub_ps(vi3, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); - __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); - __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); - __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); - const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); - const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); - const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). 
- vn0 = _mm_sub_ps(vn0, vmagic_bias); - vn1 = _mm_sub_ps(vn1, vmagic_bias); - vn2 = _mm_sub_ps(vn2, vmagic_bias); - vn3 = _mm_sub_ps(vn3, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); - __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); - - vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); - __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm_mul_ps(vt0, vs0); - vt1 = _mm_mul_ps(vt1, vs1); - vt2 = _mm_mul_ps(vt2, vs2); - vt3 = _mm_mul_ps(vt3, vs3); - - 
__m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); - __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); - __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); - __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); - vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); - vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); - vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); - - // Store 16 (4x4) outputs at a time. - _mm_storeu_ps(output, vf0); - _mm_storeu_ps(output + 4, vf1); - _mm_storeu_ps(output + 8, vf2); - _mm_storeu_ps(output + 12, vf3); - output += 16; - - // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0); - vacc1 = _mm_add_ps(vacc1, vf1); - vacc0 = _mm_add_ps(vacc0, vf2); - vacc1 = _mm_add_ps(vacc1, vf3); - } - // Add up all accumulators to vacc0 - vacc0 = _mm_add_ps(vacc0, vacc1); - - __m128 vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). 
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - // Store 4 outputs at a time. - _mm_storeu_ps(output, vf); - output += 4; - - // Accumulate computed exponents. - vacc = _mm_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). 
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - if (batch & (2 * sizeof(float))) { - // Store 2 outputs at a time. - _mm_storel_pi((__m64*) output, vf); - output += 2; - - // Accumulate 2 computed exponents. - vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); - - vf = _mm_movehl_ps(vf, vf); - } - if (batch & (1 * sizeof(float))) { - // Store 1 output at a time. - _mm_store_ss(output, vf); - - // Accumulate 1 computed exponent. - vacc = _mm_add_ss(vacc, vf); - } - } - // Reduce 4 batch in the SIMD register - vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); - vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); - _mm_store_ss(sum, vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc4.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc4.c deleted file mode 100644 index fa79b152af6..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16-acc4.c +++ /dev/null @@ -1,277 +0,0 @@ -// Auto-generated file. 
Do not edit! -// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc4( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); - const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); - const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); - const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); - const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); - const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); - const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); - const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); - const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); - const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m128 vi_max = _mm_load1_ps(max); - - __m128 vacc0 = _mm_setzero_ps(); - __m128 vacc1 = _mm_setzero_ps(); - __m128 vacc2 = _mm_setzero_ps(); - __m128 vacc3 = _mm_setzero_ps(); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - // Load 16 (4x4) inputs at a time. 
- const __m128 vi0 = _mm_loadu_ps(input); - const __m128 vi1 = _mm_loadu_ps(input + 4); - const __m128 vi2 = _mm_loadu_ps(input + 8); - const __m128 vi3 = _mm_loadu_ps(input + 12); - input += 16; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0 = _mm_sub_ps(vi0, vi_max); - const __m128 vx1 = _mm_sub_ps(vi1, vi_max); - const __m128 vx2 = _mm_sub_ps(vi2, vi_max); - const __m128 vx3 = _mm_sub_ps(vi3, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); - __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); - __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); - __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); - const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); - const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); - const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm_sub_ps(vn0, vmagic_bias); - vn1 = _mm_sub_ps(vn1, vmagic_bias); - vn2 = _mm_sub_ps(vn2, vmagic_bias); - vn3 = _mm_sub_ps(vn3, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); - __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); - - vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); - __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm_mul_ps(vt0, vs0); - vt1 = _mm_mul_ps(vt1, vs1); - vt2 = _mm_mul_ps(vt2, vs2); - vt3 = _mm_mul_ps(vt3, vs3); - - __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); - __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); - __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); - __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); - - // For inputs below zero cutoff, replace output with +0.0f. 
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); - vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); - vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); - vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); - - // Store 16 (4x4) outputs at a time. - _mm_storeu_ps(output, vf0); - _mm_storeu_ps(output + 4, vf1); - _mm_storeu_ps(output + 8, vf2); - _mm_storeu_ps(output + 12, vf3); - output += 16; - - // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0); - vacc1 = _mm_add_ps(vacc1, vf1); - vacc2 = _mm_add_ps(vacc2, vf2); - vacc3 = _mm_add_ps(vacc3, vf3); - } - // Add up all accumulators to vacc0 - vacc0 = _mm_add_ps(vacc0, vacc1); - vacc2 = _mm_add_ps(vacc2, vacc3); - vacc0 = _mm_add_ps(vacc0, vacc2); - - __m128 vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - // Store 4 outputs at a time. - _mm_storeu_ps(output, vf); - output += 4; - - // Accumulate computed exponents. - vacc = _mm_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - if (batch & (2 * sizeof(float))) { - // Store 2 outputs at a time. - _mm_storel_pi((__m64*) output, vf); - output += 2; - - // Accumulate 2 computed exponents. - vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); - - vf = _mm_movehl_ps(vf, vf); - } - if (batch & (1 * sizeof(float))) { - // Store 1 output at a time. - _mm_store_ss(output, vf); - - // Accumulate 1 computed exponent. - vacc = _mm_add_ss(vacc, vf); - } - } - // Reduce 4 batch in the SIMD register - vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); - vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); - _mm_store_ss(sum, vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16.c deleted file mode 100644 index 73afcd02a67..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u16.c +++ /dev/null @@ -1,270 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); - const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); - const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); - const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); - const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); - const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); - const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); - const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); - const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); - const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m128 vi_max = _mm_load1_ps(max); - - __m128 vacc0 = _mm_setzero_ps(); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - // Load 16 (4x4) inputs at a time. 
- const __m128 vi0 = _mm_loadu_ps(input); - const __m128 vi1 = _mm_loadu_ps(input + 4); - const __m128 vi2 = _mm_loadu_ps(input + 8); - const __m128 vi3 = _mm_loadu_ps(input + 12); - input += 16; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0 = _mm_sub_ps(vi0, vi_max); - const __m128 vx1 = _mm_sub_ps(vi1, vi_max); - const __m128 vx2 = _mm_sub_ps(vi2, vi_max); - const __m128 vx3 = _mm_sub_ps(vi3, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); - __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); - __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); - __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); - const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); - const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); - const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm_sub_ps(vn0, vmagic_bias); - vn1 = _mm_sub_ps(vn1, vmagic_bias); - vn2 = _mm_sub_ps(vn2, vmagic_bias); - vn3 = _mm_sub_ps(vn3, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); - __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); - - vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); - __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm_mul_ps(vt0, vs0); - vt1 = _mm_mul_ps(vt1, vs1); - vt2 = _mm_mul_ps(vt2, vs2); - vt3 = _mm_mul_ps(vt3, vs3); - - __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); - __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); - __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); - __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); - - // For inputs below zero cutoff, replace output with +0.0f. 
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); - vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); - vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); - vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); - - // Store 16 (4x4) outputs at a time. - _mm_storeu_ps(output, vf0); - _mm_storeu_ps(output + 4, vf1); - _mm_storeu_ps(output + 8, vf2); - _mm_storeu_ps(output + 12, vf3); - output += 16; - - // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0); - vacc0 = _mm_add_ps(vacc0, vf1); - vacc0 = _mm_add_ps(vacc0, vf2); - vacc0 = _mm_add_ps(vacc0, vf3); - } - - __m128 vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - // Store 4 outputs at a time. - _mm_storeu_ps(output, vf); - output += 4; - - // Accumulate computed exponents. - vacc = _mm_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - if (batch & (2 * sizeof(float))) { - // Store 2 outputs at a time. - _mm_storel_pi((__m64*) output, vf); - output += 2; - - // Accumulate 2 computed exponents. - vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); - - vf = _mm_movehl_ps(vf, vf); - } - if (batch & (1 * sizeof(float))) { - // Store 1 output at a time. - _mm_store_ss(output, vf); - - // Accumulate 1 computed exponent. - vacc = _mm_add_ss(vacc, vf); - } - } - // Reduce 4 batch in the SIMD register - vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); - vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); - _mm_store_ss(sum, vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc2.c deleted file mode 100644 index dd99ce2cf0d..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc2.c +++ /dev/null @@ -1,289 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); - const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); - const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); - const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); - const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); - const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); - const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); - const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); - const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); - const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m128 vi_max = _mm_load1_ps(max); - - __m128 vacc0 = _mm_setzero_ps(); - __m128 vacc1 = _mm_setzero_ps(); - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - // Load 20 (5x4) inputs at a time. - const __m128 vi0 = _mm_loadu_ps(input); - const __m128 vi1 = _mm_loadu_ps(input + 4); - const __m128 vi2 = _mm_loadu_ps(input + 8); - const __m128 vi3 = _mm_loadu_ps(input + 12); - const __m128 vi4 = _mm_loadu_ps(input + 16); - input += 20; - - // Subtract maximum input x := i - i_max. This implies x <= 0. 
- const __m128 vx0 = _mm_sub_ps(vi0, vi_max); - const __m128 vx1 = _mm_sub_ps(vi1, vi_max); - const __m128 vx2 = _mm_sub_ps(vi2, vi_max); - const __m128 vx3 = _mm_sub_ps(vi3, vi_max); - const __m128 vx4 = _mm_sub_ps(vi4, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); - __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); - __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); - __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); - __m128 vn4 = _mm_add_ps(_mm_mul_ps(vx4, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); - const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); - const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); - const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); - const __m128 vs4 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm_sub_ps(vn0, vmagic_bias); - vn1 = _mm_sub_ps(vn1, vmagic_bias); - vn2 = _mm_sub_ps(vn2, vmagic_bias); - vn3 = _mm_sub_ps(vn3, vmagic_bias); - vn4 = _mm_sub_ps(vn4, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); - __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); - __m128 vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_hi), vx4); - - vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); - vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_lo), vt4); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); - __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); - __m128 vp4 = _mm_add_ps(_mm_mul_ps(vc5, vt4), vc4); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); - vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc3); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); - vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc2); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); - vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm_mul_ps(vt0, vs0); - vt1 = _mm_mul_ps(vt1, vs1); - vt2 = _mm_mul_ps(vt2, vs2); - vt3 = _mm_mul_ps(vt3, vs3); - vt4 
= _mm_mul_ps(vt4, vs4); - - __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); - __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); - __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); - __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); - __m128 vf4 = _mm_add_ps(_mm_mul_ps(vt4, vp4), vs4); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); - vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); - vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); - vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); - vf4 = _mm_andnot_ps(_mm_cmplt_ps(vx4, vdenorm_cutoff), vf4); - - // Store 20 (5x4) outputs at a time. - _mm_storeu_ps(output, vf0); - _mm_storeu_ps(output + 4, vf1); - _mm_storeu_ps(output + 8, vf2); - _mm_storeu_ps(output + 12, vf3); - _mm_storeu_ps(output + 16, vf4); - output += 20; - - // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0); - vacc1 = _mm_add_ps(vacc1, vf1); - vacc0 = _mm_add_ps(vacc0, vf2); - vacc1 = _mm_add_ps(vacc1, vf3); - vacc0 = _mm_add_ps(vacc0, vf4); - } - // Add up all accumulators to vacc0 - vacc0 = _mm_add_ps(vacc0, vacc1); - - __m128 vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - // Store 4 outputs at a time. - _mm_storeu_ps(output, vf); - output += 4; - - // Accumulate computed exponents. - vacc = _mm_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - if (batch & (2 * sizeof(float))) { - // Store 2 outputs at a time. - _mm_storel_pi((__m64*) output, vf); - output += 2; - - // Accumulate 2 computed exponents. - vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); - - vf = _mm_movehl_ps(vf, vf); - } - if (batch & (1 * sizeof(float))) { - // Store 1 output at a time. - _mm_store_ss(output, vf); - - // Accumulate 1 computed exponent. 
- vacc = _mm_add_ss(vacc, vf); - } - } - // Reduce 4 batch in the SIMD register - vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); - vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); - _mm_store_ss(sum, vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc5.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc5.c deleted file mode 100644 index c83c2044873..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20-acc5.c +++ /dev/null @@ -1,295 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc5( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); - const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); - const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); - const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); - const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); - const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); - const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); - const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); - const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); - const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - 
XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m128 vi_max = _mm_load1_ps(max); - - __m128 vacc0 = _mm_setzero_ps(); - __m128 vacc1 = _mm_setzero_ps(); - __m128 vacc2 = _mm_setzero_ps(); - __m128 vacc3 = _mm_setzero_ps(); - __m128 vacc4 = _mm_setzero_ps(); - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - // Load 20 (5x4) inputs at a time. - const __m128 vi0 = _mm_loadu_ps(input); - const __m128 vi1 = _mm_loadu_ps(input + 4); - const __m128 vi2 = _mm_loadu_ps(input + 8); - const __m128 vi3 = _mm_loadu_ps(input + 12); - const __m128 vi4 = _mm_loadu_ps(input + 16); - input += 20; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0 = _mm_sub_ps(vi0, vi_max); - const __m128 vx1 = _mm_sub_ps(vi1, vi_max); - const __m128 vx2 = _mm_sub_ps(vi2, vi_max); - const __m128 vx3 = _mm_sub_ps(vi3, vi_max); - const __m128 vx4 = _mm_sub_ps(vi4, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); - __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); - __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); - __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); - __m128 vn4 = _mm_add_ps(_mm_mul_ps(vx4, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); - const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); - const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); - const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); - const __m128 vs4 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm_sub_ps(vn0, vmagic_bias); - vn1 = _mm_sub_ps(vn1, vmagic_bias); - vn2 = _mm_sub_ps(vn2, vmagic_bias); - vn3 = _mm_sub_ps(vn3, vmagic_bias); - vn4 = _mm_sub_ps(vn4, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); - __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); - __m128 vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_hi), vx4); - - vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); - vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_lo), vt4); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); - __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); - __m128 vp4 = _mm_add_ps(_mm_mul_ps(vc5, vt4), vc4); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); - vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc3); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); - vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc2); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); - vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm_mul_ps(vt0, vs0); - vt1 = _mm_mul_ps(vt1, vs1); - vt2 = _mm_mul_ps(vt2, vs2); - vt3 = _mm_mul_ps(vt3, vs3); - vt4 = _mm_mul_ps(vt4, vs4); - - __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); - __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); - __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); - __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); - __m128 vf4 = _mm_add_ps(_mm_mul_ps(vt4, vp4), vs4); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
- vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); - vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); - vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); - vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); - vf4 = _mm_andnot_ps(_mm_cmplt_ps(vx4, vdenorm_cutoff), vf4); - - // Store 20 (5x4) outputs at a time. - _mm_storeu_ps(output, vf0); - _mm_storeu_ps(output + 4, vf1); - _mm_storeu_ps(output + 8, vf2); - _mm_storeu_ps(output + 12, vf3); - _mm_storeu_ps(output + 16, vf4); - output += 20; - - // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0); - vacc1 = _mm_add_ps(vacc1, vf1); - vacc2 = _mm_add_ps(vacc2, vf2); - vacc3 = _mm_add_ps(vacc3, vf3); - vacc4 = _mm_add_ps(vacc4, vf4); - } - // Add up all accumulators to vacc0 - vacc0 = _mm_add_ps(vacc0, vacc1); - vacc2 = _mm_add_ps(vacc2, vacc3); - vacc0 = _mm_add_ps(vacc0, vacc2); - vacc0 = _mm_add_ps(vacc0, vacc4); - - __m128 vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - // Store 4 outputs at a time. - _mm_storeu_ps(output, vf); - output += 4; - - // Accumulate computed exponents. - vacc = _mm_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - if (batch & (2 * sizeof(float))) { - // Store 2 outputs at a time. - _mm_storel_pi((__m64*) output, vf); - output += 2; - - // Accumulate 2 computed exponents. - vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); - - vf = _mm_movehl_ps(vf, vf); - } - if (batch & (1 * sizeof(float))) { - // Store 1 output at a time. - _mm_store_ss(output, vf); - - // Accumulate 1 computed exponent. - vacc = _mm_add_ss(vacc, vf); - } - } - // Reduce 4 batch in the SIMD register - vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); - vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); - _mm_store_ss(sum, vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20.c deleted file mode 100644 index d6ee4ad84c4..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u20.c +++ /dev/null @@ -1,286 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); - const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); - const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); - const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); - const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); - const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); - const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); - const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); - const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); - const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m128 vi_max = _mm_load1_ps(max); - - __m128 vacc0 = _mm_setzero_ps(); - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - // Load 20 (5x4) inputs at a time. 
- const __m128 vi0 = _mm_loadu_ps(input); - const __m128 vi1 = _mm_loadu_ps(input + 4); - const __m128 vi2 = _mm_loadu_ps(input + 8); - const __m128 vi3 = _mm_loadu_ps(input + 12); - const __m128 vi4 = _mm_loadu_ps(input + 16); - input += 20; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0 = _mm_sub_ps(vi0, vi_max); - const __m128 vx1 = _mm_sub_ps(vi1, vi_max); - const __m128 vx2 = _mm_sub_ps(vi2, vi_max); - const __m128 vx3 = _mm_sub_ps(vi3, vi_max); - const __m128 vx4 = _mm_sub_ps(vi4, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); - __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); - __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); - __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); - __m128 vn4 = _mm_add_ps(_mm_mul_ps(vx4, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); - const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); - const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); - const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); - const __m128 vs4 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm_sub_ps(vn0, vmagic_bias); - vn1 = _mm_sub_ps(vn1, vmagic_bias); - vn2 = _mm_sub_ps(vn2, vmagic_bias); - vn3 = _mm_sub_ps(vn3, vmagic_bias); - vn4 = _mm_sub_ps(vn4, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); - __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); - __m128 vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_hi), vx4); - - vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); - vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_lo), vt4); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); - __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); - __m128 vp4 = _mm_add_ps(_mm_mul_ps(vc5, vt4), vc4); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); - vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc3); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); - vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc2); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); - vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm_mul_ps(vt0, vs0); - vt1 = _mm_mul_ps(vt1, vs1); - vt2 = _mm_mul_ps(vt2, vs2); - vt3 = _mm_mul_ps(vt3, vs3); - vt4 
= _mm_mul_ps(vt4, vs4); - - __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); - __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); - __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); - __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); - __m128 vf4 = _mm_add_ps(_mm_mul_ps(vt4, vp4), vs4); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); - vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); - vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); - vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); - vf4 = _mm_andnot_ps(_mm_cmplt_ps(vx4, vdenorm_cutoff), vf4); - - // Store 20 (5x4) outputs at a time. - _mm_storeu_ps(output, vf0); - _mm_storeu_ps(output + 4, vf1); - _mm_storeu_ps(output + 8, vf2); - _mm_storeu_ps(output + 12, vf3); - _mm_storeu_ps(output + 16, vf4); - output += 20; - - // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0); - vacc0 = _mm_add_ps(vacc0, vf1); - vacc0 = _mm_add_ps(vacc0, vf2); - vacc0 = _mm_add_ps(vacc0, vf3); - vacc0 = _mm_add_ps(vacc0, vf4); - } - - __m128 vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). 
- vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - // Store 4 outputs at a time. - _mm_storeu_ps(output, vf); - output += 4; - - // Accumulate computed exponents. - vacc = _mm_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). 
- vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - if (batch & (2 * sizeof(float))) { - // Store 2 outputs at a time. - _mm_storel_pi((__m64*) output, vf); - output += 2; - - // Accumulate 2 computed exponents. - vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); - - vf = _mm_movehl_ps(vf, vf); - } - if (batch & (1 * sizeof(float))) { - // Store 1 output at a time. - _mm_store_ss(output, vf); - - // Accumulate 1 computed exponent. 
- vacc = _mm_add_ss(vacc, vf); - } - } - // Reduce 4 batch in the SIMD register - vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); - vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); - _mm_store_ss(sum, vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u4.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u4.c deleted file mode 100644 index ee1ac67db15..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u4.c +++ /dev/null @@ -1,222 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u4( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); - const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); - const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); - const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); - const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); - const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); - const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); - const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); - const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); - const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - 
XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m128 vi_max = _mm_load1_ps(max); - - __m128 vacc0 = _mm_setzero_ps(); - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 (1x4) inputs at a time. - const __m128 vi0 = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0 = _mm_sub_ps(vi0, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm_sub_ps(vn0, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - - vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm_mul_ps(vt0, vs0); - - __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); - - // Store 4 (1x4) outputs at a time. - _mm_storeu_ps(output, vf0); - output += 4; - - // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0); - } - - __m128 vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - // Store 4 outputs at a time. - _mm_storeu_ps(output, vf); - output += 4; - - // Accumulate computed exponents. - vacc = _mm_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - if (batch & (2 * sizeof(float))) { - // Store 2 outputs at a time. - _mm_storel_pi((__m64*) output, vf); - output += 2; - - // Accumulate 2 computed exponents. - vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); - - vf = _mm_movehl_ps(vf, vf); - } - if (batch & (1 * sizeof(float))) { - // Store 1 output at a time. - _mm_store_ss(output, vf); - - // Accumulate 1 computed exponent. - vacc = _mm_add_ss(vacc, vf); - } - } - // Reduce 4 batch in the SIMD register - vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); - vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); - _mm_store_ss(sum, vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8-acc2.c deleted file mode 100644 index 92ca6f83081..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8-acc2.c +++ /dev/null @@ -1,241 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); - const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); - const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); - const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); - const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); - const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); - const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); - const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); - const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); - const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m128 vi_max = _mm_load1_ps(max); - - __m128 vacc0 = _mm_setzero_ps(); - __m128 vacc1 = _mm_setzero_ps(); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - // Load 8 (2x4) inputs at a time. 
- const __m128 vi0 = _mm_loadu_ps(input); - const __m128 vi1 = _mm_loadu_ps(input + 4); - input += 8; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0 = _mm_sub_ps(vi0, vi_max); - const __m128 vx1 = _mm_sub_ps(vi1, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); - __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); - const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm_sub_ps(vn0, vmagic_bias); - vn1 = _mm_sub_ps(vn1, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - - vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm_mul_ps(vt0, vs0); - vt1 = _mm_mul_ps(vt1, vs1); - - __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); - __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); - vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); - - // Store 8 (2x4) outputs at a time. - _mm_storeu_ps(output, vf0); - _mm_storeu_ps(output + 4, vf1); - output += 8; - - // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0); - vacc1 = _mm_add_ps(vacc1, vf1); - } - // Add up all accumulators to vacc0 - vacc0 = _mm_add_ps(vacc0, vacc1); - - __m128 vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - // Store 4 outputs at a time. - _mm_storeu_ps(output, vf); - output += 4; - - // Accumulate computed exponents. - vacc = _mm_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - if (batch & (2 * sizeof(float))) { - // Store 2 outputs at a time. - _mm_storel_pi((__m64*) output, vf); - output += 2; - - // Accumulate 2 computed exponents. - vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); - - vf = _mm_movehl_ps(vf, vf); - } - if (batch & (1 * sizeof(float))) { - // Store 1 output at a time. - _mm_store_ss(output, vf); - - // Accumulate 1 computed exponent. 
- vacc = _mm_add_ss(vacc, vf); - } - } - // Reduce 4 batch in the SIMD register - vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); - vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); - _mm_store_ss(sum, vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8.c deleted file mode 100644 index 742b25b307a..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx-rr2-p5-u8.c +++ /dev/null @@ -1,238 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); - const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); - const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); - const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); - const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); - const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); - const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); - const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); - const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); - const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - 
XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m128 vi_max = _mm_load1_ps(max); - - __m128 vacc0 = _mm_setzero_ps(); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - // Load 8 (2x4) inputs at a time. - const __m128 vi0 = _mm_loadu_ps(input); - const __m128 vi1 = _mm_loadu_ps(input + 4); - input += 8; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0 = _mm_sub_ps(vi0, vi_max); - const __m128 vx1 = _mm_sub_ps(vi1, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); - __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); - const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm_sub_ps(vn0, vmagic_bias); - vn1 = _mm_sub_ps(vn1, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - - vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm_mul_ps(vt0, vs0); - vt1 = _mm_mul_ps(vt1, vs1); - - __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); - __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); - vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); - - // Store 8 (2x4) outputs at a time. - _mm_storeu_ps(output, vf0); - _mm_storeu_ps(output + 4, vf1); - output += 8; - - // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0); - vacc0 = _mm_add_ps(vacc0, vf1); - } - - __m128 vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - // Store 4 outputs at a time. - _mm_storeu_ps(output, vf); - output += 4; - - // Accumulate computed exponents. - vacc = _mm_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - if (batch & (2 * sizeof(float))) { - // Store 2 outputs at a time. - _mm_storel_pi((__m64*) output, vf); - output += 2; - - // Accumulate 2 computed exponents. - vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); - - vf = _mm_movehl_ps(vf, vf); - } - if (batch & (1 * sizeof(float))) { - // Store 1 output at a time. - _mm_store_ss(output, vf); - - // Accumulate 1 computed exponent. 
- vacc = _mm_add_ss(vacc, vf); - } - } - // Reduce 4 batch in the SIMD register - vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); - vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); - _mm_store_ss(sum, vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u16-acc2.c similarity index 76% rename from src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32.c rename to src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u16-acc2.c index 0abda8dc5ba..0305cae9589 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u16-acc2.c @@ -11,10 +11,11 @@ #include +#include "xnnpack/intrinsics-polyfill.h" #include "xnnpack/raddstoreexpminusmax.h" -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32( +void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u16_acc2( size_t batch, const float* input, const float* max, @@ -54,84 +55,56 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32( const __m256 vi_max = _mm256_broadcast_ss(max); __m256 vacc0 = _mm256_setzero_ps(); - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { + __m256 vacc1 = _mm256_setzero_ps(); + for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m256 vi0 = _mm256_loadu_ps(input); const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - input += 32; + input += 16; const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias); __m256 vn1 = _mm256_fmadd_ps(vx1, vlog2e, vmagic_bias); - __m256 vn2 = 
_mm256_fmadd_ps(vx2, vlog2e, vmagic_bias); - __m256 vn3 = _mm256_fmadd_ps(vx3, vlog2e, vmagic_bias); const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); vn0 = _mm256_sub_ps(vn0, vmagic_bias); vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0); __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1); - __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2); - __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3); __m256 vp0 = _mm256_fmadd_ps(vc5, vt0, vc4); __m256 vp1 = _mm256_fmadd_ps(vc5, vt1, vc4); - __m256 vp2 = _mm256_fmadd_ps(vc5, vt2, vc4); - __m256 vp3 = _mm256_fmadd_ps(vc5, vt3, vc4); vp0 = _mm256_fmadd_ps(vp0, vt0, vc3); vp1 = _mm256_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc3); vp0 = _mm256_fmadd_ps(vp0, vt0, vc2); vp1 = _mm256_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc2); vp0 = _mm256_fmadd_ps(vp0, vt0, vc1); vp1 = _mm256_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc1); vt0 = _mm256_mul_ps(vt0, vs0); vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); __m256 vf0 = _mm256_fmadd_ps(vt0, vp0, vs0); __m256 vf1 = _mm256_fmadd_ps(vt1, vp1, vs1); - __m256 vf2 = _mm256_fmadd_ps(vt2, vp2, vs2); - __m256 vf3 = _mm256_fmadd_ps(vt3, vp3, vs3); vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, 
vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); _mm256_storeu_ps(output, vf0); _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - output += 32; + output += 16; vacc0 = _mm256_add_ps(vacc0, vf0); - vacc0 = _mm256_add_ps(vacc0, vf1); - vacc0 = _mm256_add_ps(vacc0, vf2); - vacc0 = _mm256_add_ps(vacc0, vf3); + vacc1 = _mm256_add_ps(vacc1, vf1); } + vacc0 = _mm256_add_ps(vacc0, vacc1); __m256 vacc = vacc0; for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc2.c index 21ddfdc3e04..aeea68f36e7 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc2.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc2.c @@ -11,6 +11,7 @@ #include +#include "xnnpack/intrinsics-polyfill.h" #include "xnnpack/raddstoreexpminusmax.h" diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc4.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc4.c index 08d69215e9a..81884848401 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc4.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u32-acc4.c @@ -11,6 +11,7 @@ #include +#include "xnnpack/intrinsics-polyfill.h" #include "xnnpack/raddstoreexpminusmax.h" diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc2.c deleted file mode 100644 index 1d9e4b79e14..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc2.c +++ /dev/null @@ -1,276 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2 = _mm256_set1_ps(-0x1.62E430p-1f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - __m256 vacc1 = _mm256_setzero_ps(); - for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { - const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = 
_mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - input += 64; - - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - - __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias); - __m256 vn1 = _mm256_fmadd_ps(vx1, vlog2e, vmagic_bias); - __m256 vn2 = _mm256_fmadd_ps(vx2, vlog2e, vmagic_bias); - __m256 vn3 = _mm256_fmadd_ps(vx3, vlog2e, vmagic_bias); - __m256 vn4 = _mm256_fmadd_ps(vx4, vlog2e, vmagic_bias); - __m256 vn5 = _mm256_fmadd_ps(vx5, vlog2e, vmagic_bias); - __m256 vn6 = _mm256_fmadd_ps(vx6, vlog2e, vmagic_bias); - __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); - - const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); 
- vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - - __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0); - __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1); - __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2); - __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3); - __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4); - __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5); - __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6); - __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7); - - __m256 vp0 = _mm256_fmadd_ps(vc5, vt0, vc4); - __m256 vp1 = _mm256_fmadd_ps(vc5, vt1, vc4); - __m256 vp2 = _mm256_fmadd_ps(vc5, vt2, vc4); - __m256 vp3 = _mm256_fmadd_ps(vc5, vt3, vc4); - __m256 vp4 = _mm256_fmadd_ps(vc5, vt4, vc4); - __m256 vp5 = _mm256_fmadd_ps(vc5, vt5, vc4); - __m256 vp6 = _mm256_fmadd_ps(vc5, vt6, vc4); - __m256 vp7 = _mm256_fmadd_ps(vc5, vt7, vc4); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc3); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc2); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc1); - - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = 
_mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - - __m256 vf0 = _mm256_fmadd_ps(vt0, vp0, vs0); - __m256 vf1 = _mm256_fmadd_ps(vt1, vp1, vs1); - __m256 vf2 = _mm256_fmadd_ps(vt2, vp2, vs2); - __m256 vf3 = _mm256_fmadd_ps(vt3, vp3, vs3); - __m256 vf4 = _mm256_fmadd_ps(vt4, vp4, vs4); - __m256 vf5 = _mm256_fmadd_ps(vt5, vp5, vs5); - __m256 vf6 = _mm256_fmadd_ps(vt6, vp6, vs6); - __m256 vf7 = _mm256_fmadd_ps(vt7, vp7, vs7); - - vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - output += 64; - - vacc0 = _mm256_add_ps(vacc0, vf0); - vacc1 = _mm256_add_ps(vacc1, vf1); - vacc0 = _mm256_add_ps(vacc0, vf2); - vacc1 = _mm256_add_ps(vacc1, vf3); - vacc0 = _mm256_add_ps(vacc0, vf4); - vacc1 = _mm256_add_ps(vacc1, vf5); - vacc0 = _mm256_add_ps(vacc0, vf6); - vacc1 = _mm256_add_ps(vacc1, vf7); - } - vacc0 = _mm256_add_ps(vacc0, vacc1); - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const __m256 
vi = _mm256_loadu_ps(input); - input += 8; - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - _mm256_storeu_ps(output, vf); - output += 8; - - vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = 
_mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc4.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc4.c deleted file mode 100644 index 6be24aa0635..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64-acc4.c +++ /dev/null @@ -1,280 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc4( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2 = _mm256_set1_ps(-0x1.62E430p-1f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 
vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - __m256 vacc1 = _mm256_setzero_ps(); - __m256 vacc2 = _mm256_setzero_ps(); - __m256 vacc3 = _mm256_setzero_ps(); - for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { - const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - input += 64; - - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - - __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias); - __m256 vn1 = _mm256_fmadd_ps(vx1, vlog2e, vmagic_bias); - __m256 vn2 = _mm256_fmadd_ps(vx2, vlog2e, vmagic_bias); - __m256 vn3 = _mm256_fmadd_ps(vx3, vlog2e, vmagic_bias); - __m256 vn4 = _mm256_fmadd_ps(vx4, vlog2e, vmagic_bias); - __m256 vn5 = _mm256_fmadd_ps(vx5, vlog2e, vmagic_bias); - __m256 vn6 = _mm256_fmadd_ps(vx6, vlog2e, vmagic_bias); - __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); - - const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const 
__m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - - __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0); - __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1); - __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2); - __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3); - __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4); - __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5); - __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6); - __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7); - - __m256 vp0 = _mm256_fmadd_ps(vc5, vt0, vc4); - __m256 vp1 = _mm256_fmadd_ps(vc5, vt1, vc4); - __m256 vp2 = _mm256_fmadd_ps(vc5, vt2, vc4); - __m256 vp3 = _mm256_fmadd_ps(vc5, vt3, vc4); - __m256 vp4 = _mm256_fmadd_ps(vc5, vt4, vc4); - __m256 vp5 = _mm256_fmadd_ps(vc5, vt5, vc4); - __m256 vp6 = _mm256_fmadd_ps(vc5, vt6, vc4); - __m256 vp7 = _mm256_fmadd_ps(vc5, vt7, vc4); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc3); - vp5 = 
_mm256_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc3); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc2); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc1); - - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - - __m256 vf0 = _mm256_fmadd_ps(vt0, vp0, vs0); - __m256 vf1 = _mm256_fmadd_ps(vt1, vp1, vs1); - __m256 vf2 = _mm256_fmadd_ps(vt2, vp2, vs2); - __m256 vf3 = _mm256_fmadd_ps(vt3, vp3, vs3); - __m256 vf4 = _mm256_fmadd_ps(vt4, vp4, vs4); - __m256 vf5 = _mm256_fmadd_ps(vt5, vp5, vs5); - __m256 vf6 = _mm256_fmadd_ps(vt6, vp6, vs6); - __m256 vf7 = _mm256_fmadd_ps(vt7, vp7, vs7); - - vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, 
vdenorm_cutoff, _CMP_LT_OS), vf7); - - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - output += 64; - - vacc0 = _mm256_add_ps(vacc0, vf0); - vacc1 = _mm256_add_ps(vacc1, vf1); - vacc2 = _mm256_add_ps(vacc2, vf2); - vacc3 = _mm256_add_ps(vacc3, vf3); - vacc0 = _mm256_add_ps(vacc0, vf4); - vacc1 = _mm256_add_ps(vacc1, vf5); - vacc2 = _mm256_add_ps(vacc2, vf6); - vacc3 = _mm256_add_ps(vacc3, vf7); - } - vacc0 = _mm256_add_ps(vacc0, vacc1); - vacc2 = _mm256_add_ps(vacc2, vacc3); - vacc0 = _mm256_add_ps(vacc0, vacc2); - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - _mm256_storeu_ps(output, vf); - output += 8; - - vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = 
_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64.c deleted file mode 100644 index d163e68f9cc..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u64.c +++ /dev/null @@ -1,274 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2 = _mm256_set1_ps(-0x1.62E430p-1f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { - const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - input += 64; - - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const 
__m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - - __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias); - __m256 vn1 = _mm256_fmadd_ps(vx1, vlog2e, vmagic_bias); - __m256 vn2 = _mm256_fmadd_ps(vx2, vlog2e, vmagic_bias); - __m256 vn3 = _mm256_fmadd_ps(vx3, vlog2e, vmagic_bias); - __m256 vn4 = _mm256_fmadd_ps(vx4, vlog2e, vmagic_bias); - __m256 vn5 = _mm256_fmadd_ps(vx5, vlog2e, vmagic_bias); - __m256 vn6 = _mm256_fmadd_ps(vx6, vlog2e, vmagic_bias); - __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); - - const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - - __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0); - __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1); - __m256 vt2 = 
_mm256_fmadd_ps(vn2, vminus_ln2, vx2); - __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3); - __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4); - __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5); - __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6); - __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7); - - __m256 vp0 = _mm256_fmadd_ps(vc5, vt0, vc4); - __m256 vp1 = _mm256_fmadd_ps(vc5, vt1, vc4); - __m256 vp2 = _mm256_fmadd_ps(vc5, vt2, vc4); - __m256 vp3 = _mm256_fmadd_ps(vc5, vt3, vc4); - __m256 vp4 = _mm256_fmadd_ps(vc5, vt4, vc4); - __m256 vp5 = _mm256_fmadd_ps(vc5, vt5, vc4); - __m256 vp6 = _mm256_fmadd_ps(vc5, vt6, vc4); - __m256 vp7 = _mm256_fmadd_ps(vc5, vt7, vc4); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc3); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc2); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc1); - - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - - __m256 vf0 = _mm256_fmadd_ps(vt0, vp0, vs0); - __m256 vf1 = 
_mm256_fmadd_ps(vt1, vp1, vs1); - __m256 vf2 = _mm256_fmadd_ps(vt2, vp2, vs2); - __m256 vf3 = _mm256_fmadd_ps(vt3, vp3, vs3); - __m256 vf4 = _mm256_fmadd_ps(vt4, vp4, vs4); - __m256 vf5 = _mm256_fmadd_ps(vt5, vp5, vs5); - __m256 vf6 = _mm256_fmadd_ps(vt6, vp6, vs6); - __m256 vf7 = _mm256_fmadd_ps(vt7, vp7, vs7); - - vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - output += 64; - - vacc0 = _mm256_add_ps(vacc0, vf0); - vacc0 = _mm256_add_ps(vacc0, vf1); - vacc0 = _mm256_add_ps(vacc0, vf2); - vacc0 = _mm256_add_ps(vacc0, vf3); - vacc0 = _mm256_add_ps(vacc0, vf4); - vacc0 = _mm256_add_ps(vacc0, vf5); - vacc0 = _mm256_add_ps(vacc0, vf6); - vacc0 = _mm256_add_ps(vacc0, vf7); - } - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, 
vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - _mm256_storeu_ps(output, vf); - output += 8; - - vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git 
a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72-acc3.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72-acc3.c deleted file mode 100644 index dc41eb737b7..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72-acc3.c +++ /dev/null @@ -1,293 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u72_acc3( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2 = _mm256_set1_ps(-0x1.62E430p-1f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - 
XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - __m256 vacc1 = _mm256_setzero_ps(); - __m256 vacc2 = _mm256_setzero_ps(); - for (; batch >= 72 * sizeof(float); batch -= 72 * sizeof(float)) { - const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - const __m256 vi8 = _mm256_loadu_ps(input + 64); - input += 72; - - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); - - __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias); - __m256 vn1 = _mm256_fmadd_ps(vx1, vlog2e, vmagic_bias); - __m256 vn2 = _mm256_fmadd_ps(vx2, vlog2e, vmagic_bias); - __m256 vn3 = _mm256_fmadd_ps(vx3, vlog2e, vmagic_bias); - __m256 vn4 = _mm256_fmadd_ps(vx4, vlog2e, vmagic_bias); - __m256 vn5 = _mm256_fmadd_ps(vx5, vlog2e, vmagic_bias); - __m256 vn6 = _mm256_fmadd_ps(vx6, vlog2e, vmagic_bias); - __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); - __m256 vn8 = _mm256_fmadd_ps(vx8, vlog2e, vmagic_bias); - - const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = 
_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); - - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - vn8 = _mm256_sub_ps(vn8, vmagic_bias); - - __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0); - __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1); - __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2); - __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3); - __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4); - __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5); - __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6); - __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7); - __m256 vt8 = _mm256_fmadd_ps(vn8, vminus_ln2, vx8); - - __m256 vp0 = _mm256_fmadd_ps(vc5, vt0, vc4); - __m256 vp1 = _mm256_fmadd_ps(vc5, vt1, vc4); - __m256 vp2 = _mm256_fmadd_ps(vc5, vt2, vc4); - __m256 vp3 = _mm256_fmadd_ps(vc5, vt3, vc4); - __m256 vp4 = _mm256_fmadd_ps(vc5, vt4, vc4); - __m256 vp5 = _mm256_fmadd_ps(vc5, vt5, vc4); - __m256 vp6 = _mm256_fmadd_ps(vc5, vt6, vc4); - __m256 vp7 = _mm256_fmadd_ps(vc5, vt7, vc4); - __m256 vp8 = _mm256_fmadd_ps(vc5, vt8, vc4); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm256_fmadd_ps(vp4, 
vt4, vc3); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc3); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc3); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc2); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc2); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc1); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc1); - - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - vt8 = _mm256_mul_ps(vt8, vs8); - - __m256 vf0 = _mm256_fmadd_ps(vt0, vp0, vs0); - __m256 vf1 = _mm256_fmadd_ps(vt1, vp1, vs1); - __m256 vf2 = _mm256_fmadd_ps(vt2, vp2, vs2); - __m256 vf3 = _mm256_fmadd_ps(vt3, vp3, vs3); - __m256 vf4 = _mm256_fmadd_ps(vt4, vp4, vs4); - __m256 vf5 = _mm256_fmadd_ps(vt5, vp5, vs5); - __m256 vf6 = _mm256_fmadd_ps(vt6, vp6, vs6); - __m256 vf7 = _mm256_fmadd_ps(vt7, vp7, vs7); - __m256 vf8 = _mm256_fmadd_ps(vt8, vp8, vs8); - - vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, 
_CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); - - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - _mm256_storeu_ps(output + 64, vf8); - output += 72; - - vacc0 = _mm256_add_ps(vacc0, vf0); - vacc1 = _mm256_add_ps(vacc1, vf1); - vacc2 = _mm256_add_ps(vacc2, vf2); - vacc0 = _mm256_add_ps(vacc0, vf3); - vacc1 = _mm256_add_ps(vacc1, vf4); - vacc2 = _mm256_add_ps(vacc2, vf5); - vacc0 = _mm256_add_ps(vacc0, vf6); - vacc1 = _mm256_add_ps(vacc1, vf7); - vacc2 = _mm256_add_ps(vacc2, vf8); - } - vacc0 = _mm256_add_ps(vacc0, vacc1); - vacc0 = _mm256_add_ps(vacc0, vacc2); - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - _mm256_storeu_ps(output, vf); - output += 8; - - vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * 
sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72.c deleted file mode 100644 index ad00f6f05db..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u72.c +++ /dev/null @@ -1,289 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u72( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2 = _mm256_set1_ps(-0x1.62E430p-1f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - for (; batch >= 72 * sizeof(float); batch -= 72 * sizeof(float)) { - const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 
vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - const __m256 vi8 = _mm256_loadu_ps(input + 64); - input += 72; - - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); - - __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias); - __m256 vn1 = _mm256_fmadd_ps(vx1, vlog2e, vmagic_bias); - __m256 vn2 = _mm256_fmadd_ps(vx2, vlog2e, vmagic_bias); - __m256 vn3 = _mm256_fmadd_ps(vx3, vlog2e, vmagic_bias); - __m256 vn4 = _mm256_fmadd_ps(vx4, vlog2e, vmagic_bias); - __m256 vn5 = _mm256_fmadd_ps(vx5, vlog2e, vmagic_bias); - __m256 vn6 = _mm256_fmadd_ps(vx6, vlog2e, vmagic_bias); - __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); - __m256 vn8 = _mm256_fmadd_ps(vx8, vlog2e, vmagic_bias); - - const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - const __m256 vs8 = 
_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); - - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - vn8 = _mm256_sub_ps(vn8, vmagic_bias); - - __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0); - __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1); - __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2); - __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3); - __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4); - __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5); - __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6); - __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7); - __m256 vt8 = _mm256_fmadd_ps(vn8, vminus_ln2, vx8); - - __m256 vp0 = _mm256_fmadd_ps(vc5, vt0, vc4); - __m256 vp1 = _mm256_fmadd_ps(vc5, vt1, vc4); - __m256 vp2 = _mm256_fmadd_ps(vc5, vt2, vc4); - __m256 vp3 = _mm256_fmadd_ps(vc5, vt3, vc4); - __m256 vp4 = _mm256_fmadd_ps(vc5, vt4, vc4); - __m256 vp5 = _mm256_fmadd_ps(vc5, vt5, vc4); - __m256 vp6 = _mm256_fmadd_ps(vc5, vt6, vc4); - __m256 vp7 = _mm256_fmadd_ps(vc5, vt7, vc4); - __m256 vp8 = _mm256_fmadd_ps(vc5, vt8, vc4); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc3); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc3); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc2); - 
vp7 = _mm256_fmadd_ps(vp7, vt7, vc2); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc2); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc1); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc1); - - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - vt8 = _mm256_mul_ps(vt8, vs8); - - __m256 vf0 = _mm256_fmadd_ps(vt0, vp0, vs0); - __m256 vf1 = _mm256_fmadd_ps(vt1, vp1, vs1); - __m256 vf2 = _mm256_fmadd_ps(vt2, vp2, vs2); - __m256 vf3 = _mm256_fmadd_ps(vt3, vp3, vs3); - __m256 vf4 = _mm256_fmadd_ps(vt4, vp4, vs4); - __m256 vf5 = _mm256_fmadd_ps(vt5, vp5, vs5); - __m256 vf6 = _mm256_fmadd_ps(vt6, vp6, vs6); - __m256 vf7 = _mm256_fmadd_ps(vt7, vp7, vs7); - __m256 vf8 = _mm256_fmadd_ps(vt8, vp8, vs8); - - vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); - - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - 
_mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - _mm256_storeu_ps(output + 64, vf8); - output += 72; - - vacc0 = _mm256_add_ps(vacc0, vf0); - vacc0 = _mm256_add_ps(vacc0, vf1); - vacc0 = _mm256_add_ps(vacc0, vf2); - vacc0 = _mm256_add_ps(vacc0, vf3); - vacc0 = _mm256_add_ps(vacc0, vf4); - vacc0 = _mm256_add_ps(vacc0, vf5); - vacc0 = _mm256_add_ps(vacc0, vf6); - vacc0 = _mm256_add_ps(vacc0, vf7); - vacc0 = _mm256_add_ps(vacc0, vf8); - } - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - _mm256_storeu_ps(output, vf); - output += 8; - - vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = 
_mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u8.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u8.c new file mode 100644 index 00000000000..146262266de --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u8.c @@ -0,0 +1,170 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u8( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; + + const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); + const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); + const __m256 vminus_ln2 = _mm256_set1_ps(-0x1.62E430p-1f); + const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); + const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); + const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); + const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); + const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); + const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m256 vi_max = _mm256_broadcast_ss(max); + + __m256 vacc0 = _mm256_setzero_ps(); + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + const __m256 vi0 = _mm256_loadu_ps(input); + input += 8; + + const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); + + __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias); + + const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); + + vn0 = _mm256_sub_ps(vn0, vmagic_bias); + + __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0); + + __m256 vp0 = _mm256_fmadd_ps(vc5, vt0, vc4); + + vp0 = 
_mm256_fmadd_ps(vp0, vt0, vc3); + + vp0 = _mm256_fmadd_ps(vp0, vt0, vc2); + + vp0 = _mm256_fmadd_ps(vp0, vt0, vc1); + + vt0 = _mm256_mul_ps(vt0, vs0); + + __m256 vf0 = _mm256_fmadd_ps(vt0, vp0, vs0); + + vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); + + _mm256_storeu_ps(output, vf0); + output += 8; + + vacc0 = _mm256_add_ps(vacc0, vf0); + } + + __m256 vacc = vacc0; + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + const __m256 vi = _mm256_loadu_ps(input); + input += 8; + + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); + + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + vn = _mm256_sub_ps(vn, vmagic_bias); + + __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); + + __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); + vp = _mm256_fmadd_ps(vp, vt, vc3); + vp = _mm256_fmadd_ps(vp, vt, vc2); + vp = _mm256_fmadd_ps(vp, vt, vc1); + + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_fmadd_ps(vt, vp, vs); + + vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + _mm256_storeu_ps(output, vf); + output += 8; + + vacc = _mm256_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 7 * sizeof(float)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); + + const __m256 vi = _mm256_maskload_ps(input, vmask); + + const __m256 vx = _mm256_sub_ps(vi, vi_max); + + __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); + + const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); + + vn = _mm256_sub_ps(vn, vmagic_bias); + + __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); + + __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); + vp = _mm256_fmadd_ps(vp, vt, vc3); + vp = _mm256_fmadd_ps(vp, vt, vc2); + vp = _mm256_fmadd_ps(vp, vt, vc1); + + vt = _mm256_mul_ps(vt, vs); + __m256 vf = _mm256_fmadd_ps(vt, vp, vs); + + vf 
= _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); + + __m128 vf_lo = _mm256_castps256_ps128(vf); + if (batch & (4 * sizeof(float))) { + _mm_storeu_ps(output, vf_lo); + vf_lo = _mm256_extractf128_ps(vf, 1); + output += 4; + } + if (batch & (2 * sizeof(float))) { + _mm_storel_pi((__m64*) output, vf_lo); + vf_lo = _mm_movehl_ps(vf_lo, vf_lo); + output += 2; + } + if (batch & (1 * sizeof(float))) { + _mm_store_ss(output, vf_lo); + } + + vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); + } + __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); + vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); + vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); + _mm_store_ss(sum, vacc_lo); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc2.c deleted file mode 100644 index 3e63283c08d..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc2.c +++ /dev/null @@ -1,306 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2 = _mm256_set1_ps(-0x1.62E430p-1f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - __m256 vacc1 = _mm256_setzero_ps(); - for (; batch >= 80 * sizeof(float); batch -= 80 * sizeof(float)) { - const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - const __m256 vi8 = 
_mm256_loadu_ps(input + 64); - const __m256 vi9 = _mm256_loadu_ps(input + 72); - input += 80; - - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); - const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); - - __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias); - __m256 vn1 = _mm256_fmadd_ps(vx1, vlog2e, vmagic_bias); - __m256 vn2 = _mm256_fmadd_ps(vx2, vlog2e, vmagic_bias); - __m256 vn3 = _mm256_fmadd_ps(vx3, vlog2e, vmagic_bias); - __m256 vn4 = _mm256_fmadd_ps(vx4, vlog2e, vmagic_bias); - __m256 vn5 = _mm256_fmadd_ps(vx5, vlog2e, vmagic_bias); - __m256 vn6 = _mm256_fmadd_ps(vx6, vlog2e, vmagic_bias); - __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); - __m256 vn8 = _mm256_fmadd_ps(vx8, vlog2e, vmagic_bias); - __m256 vn9 = _mm256_fmadd_ps(vx9, vlog2e, vmagic_bias); - - const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); - const 
__m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); - - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - vn8 = _mm256_sub_ps(vn8, vmagic_bias); - vn9 = _mm256_sub_ps(vn9, vmagic_bias); - - __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0); - __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1); - __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2); - __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3); - __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4); - __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5); - __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6); - __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7); - __m256 vt8 = _mm256_fmadd_ps(vn8, vminus_ln2, vx8); - __m256 vt9 = _mm256_fmadd_ps(vn9, vminus_ln2, vx9); - - __m256 vp0 = _mm256_fmadd_ps(vc5, vt0, vc4); - __m256 vp1 = _mm256_fmadd_ps(vc5, vt1, vc4); - __m256 vp2 = _mm256_fmadd_ps(vc5, vt2, vc4); - __m256 vp3 = _mm256_fmadd_ps(vc5, vt3, vc4); - __m256 vp4 = _mm256_fmadd_ps(vc5, vt4, vc4); - __m256 vp5 = _mm256_fmadd_ps(vc5, vt5, vc4); - __m256 vp6 = _mm256_fmadd_ps(vc5, vt6, vc4); - __m256 vp7 = _mm256_fmadd_ps(vc5, vt7, vc4); - __m256 vp8 = _mm256_fmadd_ps(vc5, vt8, vc4); - __m256 vp9 = _mm256_fmadd_ps(vc5, vt9, vc4); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc3); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc3); - vp9 = _mm256_fmadd_ps(vp9, vt9, vc3); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc2); - vp2 
= _mm256_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc2); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc2); - vp9 = _mm256_fmadd_ps(vp9, vt9, vc2); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc1); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc1); - vp9 = _mm256_fmadd_ps(vp9, vt9, vc1); - - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - vt8 = _mm256_mul_ps(vt8, vs8); - vt9 = _mm256_mul_ps(vt9, vs9); - - __m256 vf0 = _mm256_fmadd_ps(vt0, vp0, vs0); - __m256 vf1 = _mm256_fmadd_ps(vt1, vp1, vs1); - __m256 vf2 = _mm256_fmadd_ps(vt2, vp2, vs2); - __m256 vf3 = _mm256_fmadd_ps(vt3, vp3, vs3); - __m256 vf4 = _mm256_fmadd_ps(vt4, vp4, vs4); - __m256 vf5 = _mm256_fmadd_ps(vt5, vp5, vs5); - __m256 vf6 = _mm256_fmadd_ps(vt6, vp6, vs6); - __m256 vf7 = _mm256_fmadd_ps(vt7, vp7, vs7); - __m256 vf8 = _mm256_fmadd_ps(vt8, vp8, vs8); - __m256 vf9 = _mm256_fmadd_ps(vt9, vp9, vs9); - - vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - 
vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); - vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); - - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - _mm256_storeu_ps(output + 64, vf8); - _mm256_storeu_ps(output + 72, vf9); - output += 80; - - vacc0 = _mm256_add_ps(vacc0, vf0); - vacc1 = _mm256_add_ps(vacc1, vf1); - vacc0 = _mm256_add_ps(vacc0, vf2); - vacc1 = _mm256_add_ps(vacc1, vf3); - vacc0 = _mm256_add_ps(vacc0, vf4); - vacc1 = _mm256_add_ps(vacc1, vf5); - vacc0 = _mm256_add_ps(vacc0, vf6); - vacc1 = _mm256_add_ps(vacc1, vf7); - vacc0 = _mm256_add_ps(vacc0, vf8); - vacc1 = _mm256_add_ps(vacc1, vf9); - } - vacc0 = _mm256_add_ps(vacc0, vacc1); - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - _mm256_storeu_ps(output, vf); - output += 8; - - vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch 
>= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc5.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc5.c deleted file mode 100644 index cab22fc7376..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80-acc5.c +++ /dev/null @@ -1,312 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80_acc5( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2 = _mm256_set1_ps(-0x1.62E430p-1f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - __m256 vacc1 = _mm256_setzero_ps(); - __m256 vacc2 = _mm256_setzero_ps(); - __m256 vacc3 = _mm256_setzero_ps(); - __m256 vacc4 = _mm256_setzero_ps(); - for (; batch >= 80 * sizeof(float); batch -= 80 * sizeof(float)) { - const __m256 vi0 = _mm256_loadu_ps(input); - const 
__m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - const __m256 vi8 = _mm256_loadu_ps(input + 64); - const __m256 vi9 = _mm256_loadu_ps(input + 72); - input += 80; - - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); - const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); - - __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias); - __m256 vn1 = _mm256_fmadd_ps(vx1, vlog2e, vmagic_bias); - __m256 vn2 = _mm256_fmadd_ps(vx2, vlog2e, vmagic_bias); - __m256 vn3 = _mm256_fmadd_ps(vx3, vlog2e, vmagic_bias); - __m256 vn4 = _mm256_fmadd_ps(vx4, vlog2e, vmagic_bias); - __m256 vn5 = _mm256_fmadd_ps(vx5, vlog2e, vmagic_bias); - __m256 vn6 = _mm256_fmadd_ps(vx6, vlog2e, vmagic_bias); - __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); - __m256 vn8 = _mm256_fmadd_ps(vx8, vlog2e, vmagic_bias); - __m256 vn9 = _mm256_fmadd_ps(vx9, vlog2e, vmagic_bias); - - const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const 
__m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); - const __m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); - - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - vn8 = _mm256_sub_ps(vn8, vmagic_bias); - vn9 = _mm256_sub_ps(vn9, vmagic_bias); - - __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0); - __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1); - __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2); - __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3); - __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4); - __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5); - __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6); - __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7); - __m256 vt8 = _mm256_fmadd_ps(vn8, vminus_ln2, vx8); - __m256 vt9 = _mm256_fmadd_ps(vn9, vminus_ln2, vx9); - - __m256 vp0 = _mm256_fmadd_ps(vc5, vt0, vc4); - __m256 vp1 = _mm256_fmadd_ps(vc5, vt1, vc4); - __m256 vp2 = _mm256_fmadd_ps(vc5, vt2, vc4); - __m256 vp3 = _mm256_fmadd_ps(vc5, vt3, vc4); - __m256 vp4 = _mm256_fmadd_ps(vc5, vt4, vc4); - __m256 vp5 = _mm256_fmadd_ps(vc5, vt5, vc4); - __m256 vp6 = _mm256_fmadd_ps(vc5, vt6, vc4); - __m256 vp7 = _mm256_fmadd_ps(vc5, vt7, vc4); - __m256 vp8 = _mm256_fmadd_ps(vc5, vt8, vc4); - __m256 vp9 = _mm256_fmadd_ps(vc5, vt9, vc4); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc3); - 
vp3 = _mm256_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc3); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc3); - vp9 = _mm256_fmadd_ps(vp9, vt9, vc3); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc2); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc2); - vp9 = _mm256_fmadd_ps(vp9, vt9, vc2); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc1); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc1); - vp9 = _mm256_fmadd_ps(vp9, vt9, vc1); - - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - vt8 = _mm256_mul_ps(vt8, vs8); - vt9 = _mm256_mul_ps(vt9, vs9); - - __m256 vf0 = _mm256_fmadd_ps(vt0, vp0, vs0); - __m256 vf1 = _mm256_fmadd_ps(vt1, vp1, vs1); - __m256 vf2 = _mm256_fmadd_ps(vt2, vp2, vs2); - __m256 vf3 = _mm256_fmadd_ps(vt3, vp3, vs3); - __m256 vf4 = _mm256_fmadd_ps(vt4, vp4, vs4); - __m256 vf5 = _mm256_fmadd_ps(vt5, vp5, vs5); - __m256 vf6 = _mm256_fmadd_ps(vt6, vp6, vs6); - __m256 vf7 = _mm256_fmadd_ps(vt7, vp7, vs7); - __m256 vf8 = _mm256_fmadd_ps(vt8, vp8, vs8); - __m256 vf9 = _mm256_fmadd_ps(vt9, vp9, vs9); - - vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = 
_mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); - vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); - - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - _mm256_storeu_ps(output + 64, vf8); - _mm256_storeu_ps(output + 72, vf9); - output += 80; - - vacc0 = _mm256_add_ps(vacc0, vf0); - vacc1 = _mm256_add_ps(vacc1, vf1); - vacc2 = _mm256_add_ps(vacc2, vf2); - vacc3 = _mm256_add_ps(vacc3, vf3); - vacc4 = _mm256_add_ps(vacc4, vf4); - vacc0 = _mm256_add_ps(vacc0, vf5); - vacc1 = _mm256_add_ps(vacc1, vf6); - vacc2 = _mm256_add_ps(vacc2, vf7); - vacc3 = _mm256_add_ps(vacc3, vf8); - vacc4 = _mm256_add_ps(vacc4, vf9); - } - vacc0 = _mm256_add_ps(vacc0, vacc1); - vacc2 = _mm256_add_ps(vacc2, vacc3); - vacc0 = _mm256_add_ps(vacc0, vacc2); - vacc0 = _mm256_add_ps(vacc0, vacc4); - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = 
_mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - _mm256_storeu_ps(output, vf); - output += 8; - - vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, 
_mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80.c deleted file mode 100644 index 82abac46a59..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u80.c +++ /dev/null @@ -1,304 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2 = _mm256_set1_ps(-0x1.62E430p-1f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - 
XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - for (; batch >= 80 * sizeof(float); batch -= 80 * sizeof(float)) { - const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - const __m256 vi8 = _mm256_loadu_ps(input + 64); - const __m256 vi9 = _mm256_loadu_ps(input + 72); - input += 80; - - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); - const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); - - __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias); - __m256 vn1 = _mm256_fmadd_ps(vx1, vlog2e, vmagic_bias); - __m256 vn2 = _mm256_fmadd_ps(vx2, vlog2e, vmagic_bias); - __m256 vn3 = _mm256_fmadd_ps(vx3, vlog2e, vmagic_bias); - __m256 vn4 = _mm256_fmadd_ps(vx4, vlog2e, vmagic_bias); - __m256 vn5 = _mm256_fmadd_ps(vx5, vlog2e, vmagic_bias); - __m256 vn6 = _mm256_fmadd_ps(vx6, vlog2e, vmagic_bias); - __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); - __m256 vn8 = _mm256_fmadd_ps(vx8, vlog2e, vmagic_bias); - __m256 vn9 = _mm256_fmadd_ps(vx9, vlog2e, vmagic_bias); - - const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - 
const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); - const __m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); - - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - vn8 = _mm256_sub_ps(vn8, vmagic_bias); - vn9 = _mm256_sub_ps(vn9, vmagic_bias); - - __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0); - __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1); - __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2); - __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3); - __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4); - __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5); - __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6); - __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7); - __m256 vt8 = _mm256_fmadd_ps(vn8, vminus_ln2, vx8); - __m256 vt9 = _mm256_fmadd_ps(vn9, vminus_ln2, vx9); - - __m256 vp0 = _mm256_fmadd_ps(vc5, vt0, vc4); - __m256 vp1 = _mm256_fmadd_ps(vc5, vt1, vc4); - __m256 vp2 = _mm256_fmadd_ps(vc5, vt2, vc4); - __m256 vp3 = _mm256_fmadd_ps(vc5, vt3, vc4); - __m256 vp4 = _mm256_fmadd_ps(vc5, vt4, vc4); - __m256 vp5 = _mm256_fmadd_ps(vc5, vt5, vc4); - __m256 vp6 = _mm256_fmadd_ps(vc5, 
vt6, vc4); - __m256 vp7 = _mm256_fmadd_ps(vc5, vt7, vc4); - __m256 vp8 = _mm256_fmadd_ps(vc5, vt8, vc4); - __m256 vp9 = _mm256_fmadd_ps(vc5, vt9, vc4); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc3); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc3); - vp9 = _mm256_fmadd_ps(vp9, vt9, vc3); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc2); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc2); - vp9 = _mm256_fmadd_ps(vp9, vt9, vc2); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc1); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc1); - vp9 = _mm256_fmadd_ps(vp9, vt9, vc1); - - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - vt8 = _mm256_mul_ps(vt8, vs8); - vt9 = _mm256_mul_ps(vt9, vs9); - - __m256 vf0 = _mm256_fmadd_ps(vt0, vp0, vs0); - __m256 vf1 = _mm256_fmadd_ps(vt1, vp1, vs1); - __m256 vf2 = _mm256_fmadd_ps(vt2, vp2, vs2); - __m256 vf3 = _mm256_fmadd_ps(vt3, vp3, vs3); - __m256 vf4 = _mm256_fmadd_ps(vt4, vp4, vs4); - __m256 vf5 = _mm256_fmadd_ps(vt5, vp5, vs5); - __m256 vf6 = 
_mm256_fmadd_ps(vt6, vp6, vs6); - __m256 vf7 = _mm256_fmadd_ps(vt7, vp7, vs7); - __m256 vf8 = _mm256_fmadd_ps(vt8, vp8, vs8); - __m256 vf9 = _mm256_fmadd_ps(vt9, vp9, vs9); - - vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); - vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); - - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - _mm256_storeu_ps(output + 64, vf8); - _mm256_storeu_ps(output + 72, vf9); - output += 80; - - vacc0 = _mm256_add_ps(vacc0, vf0); - vacc0 = _mm256_add_ps(vacc0, vf1); - vacc0 = _mm256_add_ps(vacc0, vf2); - vacc0 = _mm256_add_ps(vacc0, vf3); - vacc0 = _mm256_add_ps(vacc0, vf4); - vacc0 = _mm256_add_ps(vacc0, vf5); - vacc0 = _mm256_add_ps(vacc0, vf6); - vacc0 = _mm256_add_ps(vacc0, vf7); - vacc0 = _mm256_add_ps(vacc0, vf8); - vacc0 = _mm256_add_ps(vacc0, vf9); - } - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const 
__m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - _mm256_storeu_ps(output, vf); - output += 8; - - vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = 
_mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc2.c deleted file mode 100644 index 5bc716258d9..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc2.c +++ /dev/null @@ -1,336 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2 = _mm256_set1_ps(-0x1.62E430p-1f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2); - 
XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - __m256 vacc1 = _mm256_setzero_ps(); - for (; batch >= 96 * sizeof(float); batch -= 96 * sizeof(float)) { - const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - const __m256 vi8 = _mm256_loadu_ps(input + 64); - const __m256 vi9 = _mm256_loadu_ps(input + 72); - const __m256 vi10 = _mm256_loadu_ps(input + 80); - const __m256 vi11 = _mm256_loadu_ps(input + 88); - input += 96; - - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); - const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); - const __m256 vx10 = _mm256_sub_ps(vi10, vi_max); - const __m256 vx11 = _mm256_sub_ps(vi11, vi_max); - - __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias); - __m256 vn1 = _mm256_fmadd_ps(vx1, vlog2e, vmagic_bias); - __m256 vn2 = _mm256_fmadd_ps(vx2, vlog2e, vmagic_bias); - __m256 vn3 = _mm256_fmadd_ps(vx3, vlog2e, vmagic_bias); - __m256 vn4 = _mm256_fmadd_ps(vx4, vlog2e, vmagic_bias); - __m256 vn5 = _mm256_fmadd_ps(vx5, vlog2e, vmagic_bias); - __m256 vn6 = _mm256_fmadd_ps(vx6, vlog2e, 
vmagic_bias); - __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); - __m256 vn8 = _mm256_fmadd_ps(vx8, vlog2e, vmagic_bias); - __m256 vn9 = _mm256_fmadd_ps(vx9, vlog2e, vmagic_bias); - __m256 vn10 = _mm256_fmadd_ps(vx10, vlog2e, vmagic_bias); - __m256 vn11 = _mm256_fmadd_ps(vx11, vlog2e, vmagic_bias); - - const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); - const __m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); - const __m256 vs10 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn10), 23)); - const __m256 vs11 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn11), 23)); - - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - vn8 = _mm256_sub_ps(vn8, vmagic_bias); - vn9 = _mm256_sub_ps(vn9, vmagic_bias); - vn10 = _mm256_sub_ps(vn10, vmagic_bias); - vn11 = _mm256_sub_ps(vn11, vmagic_bias); - - __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0); - __m256 vt1 = _mm256_fmadd_ps(vn1, 
vminus_ln2, vx1); - __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2); - __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3); - __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4); - __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5); - __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6); - __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7); - __m256 vt8 = _mm256_fmadd_ps(vn8, vminus_ln2, vx8); - __m256 vt9 = _mm256_fmadd_ps(vn9, vminus_ln2, vx9); - __m256 vt10 = _mm256_fmadd_ps(vn10, vminus_ln2, vx10); - __m256 vt11 = _mm256_fmadd_ps(vn11, vminus_ln2, vx11); - - __m256 vp0 = _mm256_fmadd_ps(vc5, vt0, vc4); - __m256 vp1 = _mm256_fmadd_ps(vc5, vt1, vc4); - __m256 vp2 = _mm256_fmadd_ps(vc5, vt2, vc4); - __m256 vp3 = _mm256_fmadd_ps(vc5, vt3, vc4); - __m256 vp4 = _mm256_fmadd_ps(vc5, vt4, vc4); - __m256 vp5 = _mm256_fmadd_ps(vc5, vt5, vc4); - __m256 vp6 = _mm256_fmadd_ps(vc5, vt6, vc4); - __m256 vp7 = _mm256_fmadd_ps(vc5, vt7, vc4); - __m256 vp8 = _mm256_fmadd_ps(vc5, vt8, vc4); - __m256 vp9 = _mm256_fmadd_ps(vc5, vt9, vc4); - __m256 vp10 = _mm256_fmadd_ps(vc5, vt10, vc4); - __m256 vp11 = _mm256_fmadd_ps(vc5, vt11, vc4); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc3); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc3); - vp9 = _mm256_fmadd_ps(vp9, vt9, vc3); - vp10 = _mm256_fmadd_ps(vp10, vt10, vc3); - vp11 = _mm256_fmadd_ps(vp11, vt11, vc3); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc2); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc2); - vp9 = 
_mm256_fmadd_ps(vp9, vt9, vc2); - vp10 = _mm256_fmadd_ps(vp10, vt10, vc2); - vp11 = _mm256_fmadd_ps(vp11, vt11, vc2); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc1); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc1); - vp9 = _mm256_fmadd_ps(vp9, vt9, vc1); - vp10 = _mm256_fmadd_ps(vp10, vt10, vc1); - vp11 = _mm256_fmadd_ps(vp11, vt11, vc1); - - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - vt8 = _mm256_mul_ps(vt8, vs8); - vt9 = _mm256_mul_ps(vt9, vs9); - vt10 = _mm256_mul_ps(vt10, vs10); - vt11 = _mm256_mul_ps(vt11, vs11); - - __m256 vf0 = _mm256_fmadd_ps(vt0, vp0, vs0); - __m256 vf1 = _mm256_fmadd_ps(vt1, vp1, vs1); - __m256 vf2 = _mm256_fmadd_ps(vt2, vp2, vs2); - __m256 vf3 = _mm256_fmadd_ps(vt3, vp3, vs3); - __m256 vf4 = _mm256_fmadd_ps(vt4, vp4, vs4); - __m256 vf5 = _mm256_fmadd_ps(vt5, vp5, vs5); - __m256 vf6 = _mm256_fmadd_ps(vt6, vp6, vs6); - __m256 vf7 = _mm256_fmadd_ps(vt7, vp7, vs7); - __m256 vf8 = _mm256_fmadd_ps(vt8, vp8, vs8); - __m256 vf9 = _mm256_fmadd_ps(vt9, vp9, vs9); - __m256 vf10 = _mm256_fmadd_ps(vt10, vp10, vs10); - __m256 vf11 = _mm256_fmadd_ps(vt11, vp11, vs11); - - vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = 
_mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); - vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); - vf10 = _mm256_andnot_ps(_mm256_cmp_ps(vx10, vdenorm_cutoff, _CMP_LT_OS), vf10); - vf11 = _mm256_andnot_ps(_mm256_cmp_ps(vx11, vdenorm_cutoff, _CMP_LT_OS), vf11); - - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - _mm256_storeu_ps(output + 64, vf8); - _mm256_storeu_ps(output + 72, vf9); - _mm256_storeu_ps(output + 80, vf10); - _mm256_storeu_ps(output + 88, vf11); - output += 96; - - vacc0 = _mm256_add_ps(vacc0, vf0); - vacc1 = _mm256_add_ps(vacc1, vf1); - vacc0 = _mm256_add_ps(vacc0, vf2); - vacc1 = _mm256_add_ps(vacc1, vf3); - vacc0 = _mm256_add_ps(vacc0, vf4); - vacc1 = _mm256_add_ps(vacc1, vf5); - vacc0 = _mm256_add_ps(vacc0, vf6); - vacc1 = _mm256_add_ps(vacc1, vf7); - vacc0 = _mm256_add_ps(vacc0, vf8); - vacc1 = _mm256_add_ps(vacc1, vf9); - vacc0 = _mm256_add_ps(vacc0, vf10); - vacc1 = _mm256_add_ps(vacc1, vf11); - } - vacc0 = _mm256_add_ps(vacc0, vacc1); - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, 
vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - _mm256_storeu_ps(output, vf); - output += 8; - - vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git 
a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc3.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc3.c deleted file mode 100644 index f9340744efd..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc3.c +++ /dev/null @@ -1,338 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc3( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2 = _mm256_set1_ps(-0x1.62E430p-1f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - 
XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - __m256 vacc1 = _mm256_setzero_ps(); - __m256 vacc2 = _mm256_setzero_ps(); - for (; batch >= 96 * sizeof(float); batch -= 96 * sizeof(float)) { - const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - const __m256 vi8 = _mm256_loadu_ps(input + 64); - const __m256 vi9 = _mm256_loadu_ps(input + 72); - const __m256 vi10 = _mm256_loadu_ps(input + 80); - const __m256 vi11 = _mm256_loadu_ps(input + 88); - input += 96; - - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); - const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); - const __m256 vx10 = _mm256_sub_ps(vi10, vi_max); - const __m256 vx11 = _mm256_sub_ps(vi11, vi_max); - - __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias); - __m256 vn1 = _mm256_fmadd_ps(vx1, vlog2e, vmagic_bias); - __m256 vn2 = _mm256_fmadd_ps(vx2, vlog2e, vmagic_bias); - __m256 vn3 = _mm256_fmadd_ps(vx3, vlog2e, vmagic_bias); - __m256 vn4 = _mm256_fmadd_ps(vx4, vlog2e, vmagic_bias); - __m256 vn5 = _mm256_fmadd_ps(vx5, vlog2e, vmagic_bias); - __m256 vn6 = _mm256_fmadd_ps(vx6, vlog2e, vmagic_bias); - __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); - __m256 vn8 = _mm256_fmadd_ps(vx8, vlog2e, 
vmagic_bias); - __m256 vn9 = _mm256_fmadd_ps(vx9, vlog2e, vmagic_bias); - __m256 vn10 = _mm256_fmadd_ps(vx10, vlog2e, vmagic_bias); - __m256 vn11 = _mm256_fmadd_ps(vx11, vlog2e, vmagic_bias); - - const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); - const __m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); - const __m256 vs10 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn10), 23)); - const __m256 vs11 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn11), 23)); - - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - vn8 = _mm256_sub_ps(vn8, vmagic_bias); - vn9 = _mm256_sub_ps(vn9, vmagic_bias); - vn10 = _mm256_sub_ps(vn10, vmagic_bias); - vn11 = _mm256_sub_ps(vn11, vmagic_bias); - - __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0); - __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1); - __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2); - __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, 
vx3); - __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4); - __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5); - __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6); - __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7); - __m256 vt8 = _mm256_fmadd_ps(vn8, vminus_ln2, vx8); - __m256 vt9 = _mm256_fmadd_ps(vn9, vminus_ln2, vx9); - __m256 vt10 = _mm256_fmadd_ps(vn10, vminus_ln2, vx10); - __m256 vt11 = _mm256_fmadd_ps(vn11, vminus_ln2, vx11); - - __m256 vp0 = _mm256_fmadd_ps(vc5, vt0, vc4); - __m256 vp1 = _mm256_fmadd_ps(vc5, vt1, vc4); - __m256 vp2 = _mm256_fmadd_ps(vc5, vt2, vc4); - __m256 vp3 = _mm256_fmadd_ps(vc5, vt3, vc4); - __m256 vp4 = _mm256_fmadd_ps(vc5, vt4, vc4); - __m256 vp5 = _mm256_fmadd_ps(vc5, vt5, vc4); - __m256 vp6 = _mm256_fmadd_ps(vc5, vt6, vc4); - __m256 vp7 = _mm256_fmadd_ps(vc5, vt7, vc4); - __m256 vp8 = _mm256_fmadd_ps(vc5, vt8, vc4); - __m256 vp9 = _mm256_fmadd_ps(vc5, vt9, vc4); - __m256 vp10 = _mm256_fmadd_ps(vc5, vt10, vc4); - __m256 vp11 = _mm256_fmadd_ps(vc5, vt11, vc4); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc3); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc3); - vp9 = _mm256_fmadd_ps(vp9, vt9, vc3); - vp10 = _mm256_fmadd_ps(vp10, vt10, vc3); - vp11 = _mm256_fmadd_ps(vp11, vt11, vc3); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc2); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc2); - vp9 = _mm256_fmadd_ps(vp9, vt9, vc2); - vp10 = _mm256_fmadd_ps(vp10, vt10, vc2); - vp11 = _mm256_fmadd_ps(vp11, vt11, vc2); - - vp0 = 
_mm256_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc1); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc1); - vp9 = _mm256_fmadd_ps(vp9, vt9, vc1); - vp10 = _mm256_fmadd_ps(vp10, vt10, vc1); - vp11 = _mm256_fmadd_ps(vp11, vt11, vc1); - - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - vt8 = _mm256_mul_ps(vt8, vs8); - vt9 = _mm256_mul_ps(vt9, vs9); - vt10 = _mm256_mul_ps(vt10, vs10); - vt11 = _mm256_mul_ps(vt11, vs11); - - __m256 vf0 = _mm256_fmadd_ps(vt0, vp0, vs0); - __m256 vf1 = _mm256_fmadd_ps(vt1, vp1, vs1); - __m256 vf2 = _mm256_fmadd_ps(vt2, vp2, vs2); - __m256 vf3 = _mm256_fmadd_ps(vt3, vp3, vs3); - __m256 vf4 = _mm256_fmadd_ps(vt4, vp4, vs4); - __m256 vf5 = _mm256_fmadd_ps(vt5, vp5, vs5); - __m256 vf6 = _mm256_fmadd_ps(vt6, vp6, vs6); - __m256 vf7 = _mm256_fmadd_ps(vt7, vp7, vs7); - __m256 vf8 = _mm256_fmadd_ps(vt8, vp8, vs8); - __m256 vf9 = _mm256_fmadd_ps(vt9, vp9, vs9); - __m256 vf10 = _mm256_fmadd_ps(vt10, vp10, vs10); - __m256 vf11 = _mm256_fmadd_ps(vt11, vp11, vs11); - - vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, 
_CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); - vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); - vf10 = _mm256_andnot_ps(_mm256_cmp_ps(vx10, vdenorm_cutoff, _CMP_LT_OS), vf10); - vf11 = _mm256_andnot_ps(_mm256_cmp_ps(vx11, vdenorm_cutoff, _CMP_LT_OS), vf11); - - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - _mm256_storeu_ps(output + 64, vf8); - _mm256_storeu_ps(output + 72, vf9); - _mm256_storeu_ps(output + 80, vf10); - _mm256_storeu_ps(output + 88, vf11); - output += 96; - - vacc0 = _mm256_add_ps(vacc0, vf0); - vacc1 = _mm256_add_ps(vacc1, vf1); - vacc2 = _mm256_add_ps(vacc2, vf2); - vacc0 = _mm256_add_ps(vacc0, vf3); - vacc1 = _mm256_add_ps(vacc1, vf4); - vacc2 = _mm256_add_ps(vacc2, vf5); - vacc0 = _mm256_add_ps(vacc0, vf6); - vacc1 = _mm256_add_ps(vacc1, vf7); - vacc2 = _mm256_add_ps(vacc2, vf8); - vacc0 = _mm256_add_ps(vacc0, vf9); - vacc1 = _mm256_add_ps(vacc1, vf10); - vacc2 = _mm256_add_ps(vacc2, vf11); - } - vacc0 = _mm256_add_ps(vacc0, vacc1); - vacc0 = _mm256_add_ps(vacc0, vacc2); - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = 
_mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - _mm256_storeu_ps(output, vf); - output += 8; - - vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc6.c 
b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc6.c deleted file mode 100644 index 89c609fe60e..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96-acc6.c +++ /dev/null @@ -1,344 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc6( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2 = _mm256_set1_ps(-0x1.62E430p-1f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 
= _mm256_setzero_ps(); - __m256 vacc1 = _mm256_setzero_ps(); - __m256 vacc2 = _mm256_setzero_ps(); - __m256 vacc3 = _mm256_setzero_ps(); - __m256 vacc4 = _mm256_setzero_ps(); - __m256 vacc5 = _mm256_setzero_ps(); - for (; batch >= 96 * sizeof(float); batch -= 96 * sizeof(float)) { - const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - const __m256 vi8 = _mm256_loadu_ps(input + 64); - const __m256 vi9 = _mm256_loadu_ps(input + 72); - const __m256 vi10 = _mm256_loadu_ps(input + 80); - const __m256 vi11 = _mm256_loadu_ps(input + 88); - input += 96; - - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); - const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); - const __m256 vx10 = _mm256_sub_ps(vi10, vi_max); - const __m256 vx11 = _mm256_sub_ps(vi11, vi_max); - - __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias); - __m256 vn1 = _mm256_fmadd_ps(vx1, vlog2e, vmagic_bias); - __m256 vn2 = _mm256_fmadd_ps(vx2, vlog2e, vmagic_bias); - __m256 vn3 = _mm256_fmadd_ps(vx3, vlog2e, vmagic_bias); - __m256 vn4 = _mm256_fmadd_ps(vx4, vlog2e, vmagic_bias); - __m256 vn5 = _mm256_fmadd_ps(vx5, vlog2e, vmagic_bias); - __m256 vn6 = _mm256_fmadd_ps(vx6, vlog2e, vmagic_bias); - __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); - __m256 vn8 = _mm256_fmadd_ps(vx8, vlog2e, 
vmagic_bias); - __m256 vn9 = _mm256_fmadd_ps(vx9, vlog2e, vmagic_bias); - __m256 vn10 = _mm256_fmadd_ps(vx10, vlog2e, vmagic_bias); - __m256 vn11 = _mm256_fmadd_ps(vx11, vlog2e, vmagic_bias); - - const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); - const __m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); - const __m256 vs10 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn10), 23)); - const __m256 vs11 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn11), 23)); - - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - vn8 = _mm256_sub_ps(vn8, vmagic_bias); - vn9 = _mm256_sub_ps(vn9, vmagic_bias); - vn10 = _mm256_sub_ps(vn10, vmagic_bias); - vn11 = _mm256_sub_ps(vn11, vmagic_bias); - - __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0); - __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1); - __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2); - __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, 
vx3); - __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4); - __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5); - __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6); - __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7); - __m256 vt8 = _mm256_fmadd_ps(vn8, vminus_ln2, vx8); - __m256 vt9 = _mm256_fmadd_ps(vn9, vminus_ln2, vx9); - __m256 vt10 = _mm256_fmadd_ps(vn10, vminus_ln2, vx10); - __m256 vt11 = _mm256_fmadd_ps(vn11, vminus_ln2, vx11); - - __m256 vp0 = _mm256_fmadd_ps(vc5, vt0, vc4); - __m256 vp1 = _mm256_fmadd_ps(vc5, vt1, vc4); - __m256 vp2 = _mm256_fmadd_ps(vc5, vt2, vc4); - __m256 vp3 = _mm256_fmadd_ps(vc5, vt3, vc4); - __m256 vp4 = _mm256_fmadd_ps(vc5, vt4, vc4); - __m256 vp5 = _mm256_fmadd_ps(vc5, vt5, vc4); - __m256 vp6 = _mm256_fmadd_ps(vc5, vt6, vc4); - __m256 vp7 = _mm256_fmadd_ps(vc5, vt7, vc4); - __m256 vp8 = _mm256_fmadd_ps(vc5, vt8, vc4); - __m256 vp9 = _mm256_fmadd_ps(vc5, vt9, vc4); - __m256 vp10 = _mm256_fmadd_ps(vc5, vt10, vc4); - __m256 vp11 = _mm256_fmadd_ps(vc5, vt11, vc4); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc3); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc3); - vp9 = _mm256_fmadd_ps(vp9, vt9, vc3); - vp10 = _mm256_fmadd_ps(vp10, vt10, vc3); - vp11 = _mm256_fmadd_ps(vp11, vt11, vc3); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc2); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc2); - vp9 = _mm256_fmadd_ps(vp9, vt9, vc2); - vp10 = _mm256_fmadd_ps(vp10, vt10, vc2); - vp11 = _mm256_fmadd_ps(vp11, vt11, vc2); - - vp0 = 
_mm256_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc1); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc1); - vp9 = _mm256_fmadd_ps(vp9, vt9, vc1); - vp10 = _mm256_fmadd_ps(vp10, vt10, vc1); - vp11 = _mm256_fmadd_ps(vp11, vt11, vc1); - - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - vt8 = _mm256_mul_ps(vt8, vs8); - vt9 = _mm256_mul_ps(vt9, vs9); - vt10 = _mm256_mul_ps(vt10, vs10); - vt11 = _mm256_mul_ps(vt11, vs11); - - __m256 vf0 = _mm256_fmadd_ps(vt0, vp0, vs0); - __m256 vf1 = _mm256_fmadd_ps(vt1, vp1, vs1); - __m256 vf2 = _mm256_fmadd_ps(vt2, vp2, vs2); - __m256 vf3 = _mm256_fmadd_ps(vt3, vp3, vs3); - __m256 vf4 = _mm256_fmadd_ps(vt4, vp4, vs4); - __m256 vf5 = _mm256_fmadd_ps(vt5, vp5, vs5); - __m256 vf6 = _mm256_fmadd_ps(vt6, vp6, vs6); - __m256 vf7 = _mm256_fmadd_ps(vt7, vp7, vs7); - __m256 vf8 = _mm256_fmadd_ps(vt8, vp8, vs8); - __m256 vf9 = _mm256_fmadd_ps(vt9, vp9, vs9); - __m256 vf10 = _mm256_fmadd_ps(vt10, vp10, vs10); - __m256 vf11 = _mm256_fmadd_ps(vt11, vp11, vs11); - - vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, 
_CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); - vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); - vf10 = _mm256_andnot_ps(_mm256_cmp_ps(vx10, vdenorm_cutoff, _CMP_LT_OS), vf10); - vf11 = _mm256_andnot_ps(_mm256_cmp_ps(vx11, vdenorm_cutoff, _CMP_LT_OS), vf11); - - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - _mm256_storeu_ps(output + 64, vf8); - _mm256_storeu_ps(output + 72, vf9); - _mm256_storeu_ps(output + 80, vf10); - _mm256_storeu_ps(output + 88, vf11); - output += 96; - - vacc0 = _mm256_add_ps(vacc0, vf0); - vacc1 = _mm256_add_ps(vacc1, vf1); - vacc2 = _mm256_add_ps(vacc2, vf2); - vacc3 = _mm256_add_ps(vacc3, vf3); - vacc4 = _mm256_add_ps(vacc4, vf4); - vacc5 = _mm256_add_ps(vacc5, vf5); - vacc0 = _mm256_add_ps(vacc0, vf6); - vacc1 = _mm256_add_ps(vacc1, vf7); - vacc2 = _mm256_add_ps(vacc2, vf8); - vacc3 = _mm256_add_ps(vacc3, vf9); - vacc4 = _mm256_add_ps(vacc4, vf10); - vacc5 = _mm256_add_ps(vacc5, vf11); - } - vacc0 = _mm256_add_ps(vacc0, vacc1); - vacc2 = _mm256_add_ps(vacc2, vacc3); - vacc4 = _mm256_add_ps(vacc4, vacc5); - vacc0 = _mm256_add_ps(vacc0, vacc2); - vacc0 = _mm256_add_ps(vacc0, vacc4); - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = 
_mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - _mm256_storeu_ps(output, vf); - output += 8; - - vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git 
a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96.c deleted file mode 100644 index 8060155fe95..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-u96.c +++ /dev/null @@ -1,334 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx2-rr1-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2 = _mm256_set1_ps(-0x1.62E430p-1f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - 
const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - for (; batch >= 96 * sizeof(float); batch -= 96 * sizeof(float)) { - const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - const __m256 vi8 = _mm256_loadu_ps(input + 64); - const __m256 vi9 = _mm256_loadu_ps(input + 72); - const __m256 vi10 = _mm256_loadu_ps(input + 80); - const __m256 vi11 = _mm256_loadu_ps(input + 88); - input += 96; - - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); - const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); - const __m256 vx10 = _mm256_sub_ps(vi10, vi_max); - const __m256 vx11 = _mm256_sub_ps(vi11, vi_max); - - __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias); - __m256 vn1 = _mm256_fmadd_ps(vx1, vlog2e, vmagic_bias); - __m256 vn2 = _mm256_fmadd_ps(vx2, vlog2e, vmagic_bias); - __m256 vn3 = _mm256_fmadd_ps(vx3, vlog2e, vmagic_bias); - __m256 vn4 = _mm256_fmadd_ps(vx4, vlog2e, vmagic_bias); - __m256 vn5 = _mm256_fmadd_ps(vx5, vlog2e, vmagic_bias); - __m256 vn6 = _mm256_fmadd_ps(vx6, vlog2e, vmagic_bias); - __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); - __m256 vn8 = _mm256_fmadd_ps(vx8, vlog2e, vmagic_bias); - __m256 vn9 = _mm256_fmadd_ps(vx9, vlog2e, vmagic_bias); - __m256 vn10 = _mm256_fmadd_ps(vx10, vlog2e, 
vmagic_bias); - __m256 vn11 = _mm256_fmadd_ps(vx11, vlog2e, vmagic_bias); - - const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); - const __m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); - const __m256 vs10 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn10), 23)); - const __m256 vs11 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn11), 23)); - - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - vn8 = _mm256_sub_ps(vn8, vmagic_bias); - vn9 = _mm256_sub_ps(vn9, vmagic_bias); - vn10 = _mm256_sub_ps(vn10, vmagic_bias); - vn11 = _mm256_sub_ps(vn11, vmagic_bias); - - __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0); - __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1); - __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2); - __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3); - __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4); - __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5); - 
__m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6); - __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7); - __m256 vt8 = _mm256_fmadd_ps(vn8, vminus_ln2, vx8); - __m256 vt9 = _mm256_fmadd_ps(vn9, vminus_ln2, vx9); - __m256 vt10 = _mm256_fmadd_ps(vn10, vminus_ln2, vx10); - __m256 vt11 = _mm256_fmadd_ps(vn11, vminus_ln2, vx11); - - __m256 vp0 = _mm256_fmadd_ps(vc5, vt0, vc4); - __m256 vp1 = _mm256_fmadd_ps(vc5, vt1, vc4); - __m256 vp2 = _mm256_fmadd_ps(vc5, vt2, vc4); - __m256 vp3 = _mm256_fmadd_ps(vc5, vt3, vc4); - __m256 vp4 = _mm256_fmadd_ps(vc5, vt4, vc4); - __m256 vp5 = _mm256_fmadd_ps(vc5, vt5, vc4); - __m256 vp6 = _mm256_fmadd_ps(vc5, vt6, vc4); - __m256 vp7 = _mm256_fmadd_ps(vc5, vt7, vc4); - __m256 vp8 = _mm256_fmadd_ps(vc5, vt8, vc4); - __m256 vp9 = _mm256_fmadd_ps(vc5, vt9, vc4); - __m256 vp10 = _mm256_fmadd_ps(vc5, vt10, vc4); - __m256 vp11 = _mm256_fmadd_ps(vc5, vt11, vc4); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc3); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc3); - vp9 = _mm256_fmadd_ps(vp9, vt9, vc3); - vp10 = _mm256_fmadd_ps(vp10, vt10, vc3); - vp11 = _mm256_fmadd_ps(vp11, vt11, vc3); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm256_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc2); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc2); - vp9 = _mm256_fmadd_ps(vp9, vt9, vc2); - vp10 = _mm256_fmadd_ps(vp10, vt10, vc2); - vp11 = _mm256_fmadd_ps(vp11, vt11, vc2); - - vp0 = _mm256_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm256_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm256_fmadd_ps(vp2, vt2, vc1); - 
vp3 = _mm256_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm256_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm256_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm256_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm256_fmadd_ps(vp7, vt7, vc1); - vp8 = _mm256_fmadd_ps(vp8, vt8, vc1); - vp9 = _mm256_fmadd_ps(vp9, vt9, vc1); - vp10 = _mm256_fmadd_ps(vp10, vt10, vc1); - vp11 = _mm256_fmadd_ps(vp11, vt11, vc1); - - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - vt8 = _mm256_mul_ps(vt8, vs8); - vt9 = _mm256_mul_ps(vt9, vs9); - vt10 = _mm256_mul_ps(vt10, vs10); - vt11 = _mm256_mul_ps(vt11, vs11); - - __m256 vf0 = _mm256_fmadd_ps(vt0, vp0, vs0); - __m256 vf1 = _mm256_fmadd_ps(vt1, vp1, vs1); - __m256 vf2 = _mm256_fmadd_ps(vt2, vp2, vs2); - __m256 vf3 = _mm256_fmadd_ps(vt3, vp3, vs3); - __m256 vf4 = _mm256_fmadd_ps(vt4, vp4, vs4); - __m256 vf5 = _mm256_fmadd_ps(vt5, vp5, vs5); - __m256 vf6 = _mm256_fmadd_ps(vt6, vp6, vs6); - __m256 vf7 = _mm256_fmadd_ps(vt7, vp7, vs7); - __m256 vf8 = _mm256_fmadd_ps(vt8, vp8, vs8); - __m256 vf9 = _mm256_fmadd_ps(vt9, vp9, vs9); - __m256 vf10 = _mm256_fmadd_ps(vt10, vp10, vs10); - __m256 vf11 = _mm256_fmadd_ps(vt11, vp11, vs11); - - vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - vf8 = 
_mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); - vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); - vf10 = _mm256_andnot_ps(_mm256_cmp_ps(vx10, vdenorm_cutoff, _CMP_LT_OS), vf10); - vf11 = _mm256_andnot_ps(_mm256_cmp_ps(vx11, vdenorm_cutoff, _CMP_LT_OS), vf11); - - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - _mm256_storeu_ps(output + 64, vf8); - _mm256_storeu_ps(output + 72, vf9); - _mm256_storeu_ps(output + 80, vf10); - _mm256_storeu_ps(output + 88, vf11); - output += 96; - - vacc0 = _mm256_add_ps(vacc0, vf0); - vacc0 = _mm256_add_ps(vacc0, vf1); - vacc0 = _mm256_add_ps(vacc0, vf2); - vacc0 = _mm256_add_ps(vacc0, vf3); - vacc0 = _mm256_add_ps(vacc0, vf4); - vacc0 = _mm256_add_ps(vacc0, vf5); - vacc0 = _mm256_add_ps(vacc0, vf6); - vacc0 = _mm256_add_ps(vacc0, vf7); - vacc0 = _mm256_add_ps(vacc0, vf8); - vacc0 = _mm256_add_ps(vacc0, vf9); - vacc0 = _mm256_add_ps(vacc0, vf10); - vacc0 = _mm256_add_ps(vacc0, vf11); - } - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - 
_mm256_storeu_ps(output, vf); - output += 8; - - vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias); - - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - vn = _mm256_sub_ps(vn, vmagic_bias); - - __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx); - - __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4); - vp = _mm256_fmadd_ps(vp, vt, vc3); - vp = _mm256_fmadd_ps(vp, vt, vc2); - vp = _mm256_fmadd_ps(vp, vt, vc1); - - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_fmadd_ps(vt, vp, vs); - - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u16-acc2.c similarity index 81% rename from src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32.c rename to 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u16-acc2.c index b671d6b1833..e1ab507b2ed 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u16-acc2.c @@ -11,10 +11,11 @@ #include +#include "xnnpack/intrinsics-polyfill.h" #include "xnnpack/raddstoreexpminusmax.h" -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32( +void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u16_acc2( size_t batch, const float* input, const float* max, @@ -56,71 +57,50 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32( const __m256 vi_max = _mm256_broadcast_ss(max); __m256 vacc0 = _mm256_setzero_ps(); - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { - // Load 32 (4x4) inputs at a time. + __m256 vacc1 = _mm256_setzero_ps(); + for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { + // Load 16 (2x8) inputs at a time. const __m256 vi0 = _mm256_loadu_ps(input); const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - input += 32; + input += 16; // Subtract maximum input x := i - i_max. This implies x <= 0. const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); // Compute reduced argument batch := round(x / log(2)). __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); - __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); - __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); // Subtract the large number back to get final batch := round(x / log(2)). vn0 = _mm256_sub_ps(vn0, vmagic_bias); vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); // Compute reduced argument t := x - batch * log(2). // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); - __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); - __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
__m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); - __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); - __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); // Reconstruct the final f value: // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) @@ -128,34 +108,26 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32( // = s + (t * s) * p vt0 = _mm256_mul_ps(vt0, vs0); vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); - __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); - __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); // For inputs below zero cutoff, replace output with +0.0f. // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - // Store 32 (4x4) outputs at a time. + // Store 16 (2x8) outputs at a time. 
_mm256_storeu_ps(output, vf0); _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - output += 32; + output += 16; // Accumulate computed exponents. vacc0 = _mm256_add_ps(vacc0, vf0); - vacc0 = _mm256_add_ps(vacc0, vf1); - vacc0 = _mm256_add_ps(vacc0, vf2); - vacc0 = _mm256_add_ps(vacc0, vf3); + vacc1 = _mm256_add_ps(vacc1, vf1); } + // Add up all accumulators to vacc0 + vacc0 = _mm256_add_ps(vacc0, vacc1); __m256 vacc = vacc0; for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc2.c index 623755988c5..0ff8ed91671 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc2.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc2.c @@ -11,6 +11,7 @@ #include +#include "xnnpack/intrinsics-polyfill.h" #include "xnnpack/raddstoreexpminusmax.h" @@ -58,7 +59,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2( __m256 vacc0 = _mm256_setzero_ps(); __m256 vacc1 = _mm256_setzero_ps(); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { - // Load 32 (4x4) inputs at a time. + // Load 32 (4x8) inputs at a time. const __m256 vi0 = _mm256_loadu_ps(input); const __m256 vi1 = _mm256_loadu_ps(input + 8); const __m256 vi2 = _mm256_loadu_ps(input + 16); @@ -144,7 +145,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2( vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - // Store 32 (4x4) outputs at a time. + // Store 32 (4x8) outputs at a time. 
_mm256_storeu_ps(output, vf0); _mm256_storeu_ps(output + 8, vf1); _mm256_storeu_ps(output + 16, vf2); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc4.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc4.c index 376339e1a2a..082a7e1a409 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc4.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u32-acc4.c @@ -11,6 +11,7 @@ #include +#include "xnnpack/intrinsics-polyfill.h" #include "xnnpack/raddstoreexpminusmax.h" @@ -60,7 +61,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4( __m256 vacc2 = _mm256_setzero_ps(); __m256 vacc3 = _mm256_setzero_ps(); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { - // Load 32 (4x4) inputs at a time. + // Load 32 (4x8) inputs at a time. const __m256 vi0 = _mm256_loadu_ps(input); const __m256 vi1 = _mm256_loadu_ps(input + 8); const __m256 vi2 = _mm256_loadu_ps(input + 16); @@ -146,7 +147,7 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4( vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - // Store 32 (4x4) outputs at a time. + // Store 32 (4x8) outputs at a time. _mm256_storeu_ps(output, vf0); _mm256_storeu_ps(output + 8, vf1); _mm256_storeu_ps(output + 16, vf2); diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc2.c deleted file mode 100644 index bba69d82e40..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc2.c +++ /dev/null @@ -1,338 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f); - const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - __m256 vacc1 = _mm256_setzero_ps(); - for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { - // Load 64 (8x4) inputs at a time. 
- const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - input += 64; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); - __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); - __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); - __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); - __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); - __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); - __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); - __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); - __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); - __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); - __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); - __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); - __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); - __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); - __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); - - vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); - vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); - vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); - vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); - vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); - __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); - __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); - __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); - __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); - __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); - __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); - __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = 
_mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - - __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); - __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); - __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); - __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); - __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); - __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); - __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); - __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - - // Store 64 (8x4) outputs at a time. - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - output += 64; - - // Accumulate computed exponents. 
- vacc0 = _mm256_add_ps(vacc0, vf0); - vacc1 = _mm256_add_ps(vacc1, vf1); - vacc0 = _mm256_add_ps(vacc0, vf2); - vacc1 = _mm256_add_ps(vacc1, vf3); - vacc0 = _mm256_add_ps(vacc0, vf4); - vacc1 = _mm256_add_ps(vacc1, vf5); - vacc0 = _mm256_add_ps(vacc0, vf6); - vacc1 = _mm256_add_ps(vacc1, vf7); - } - // Add up all accumulators to vacc0 - vacc0 = _mm256_add_ps(vacc0, vacc1); - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - // Load 8 inputs at a time. - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - // Store 8 outputs at a time. - _mm256_storeu_ps(output, vf); - output += 8; - - // Accumulate computed exponents. - vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc4.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc4.c deleted file mode 100644 index fd50ec677c8..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64-acc4.c +++ /dev/null @@ -1,342 
+0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc4( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f); - const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - __m256 vacc1 = _mm256_setzero_ps(); - __m256 vacc2 = _mm256_setzero_ps(); - __m256 vacc3 = _mm256_setzero_ps(); - 
for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { - // Load 64 (8x4) inputs at a time. - const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - input += 64; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); - __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); - __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); - __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); - __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); - __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); - __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); - __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); - __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); - __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); - __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); - __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); - __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); - __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); - __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); - - vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); - vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); - vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); - vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); - vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); - __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); - __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); - __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); - __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); - __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); - __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); - __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = 
_mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - - __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); - __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); - __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); - __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); - __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); - __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); - __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); - __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - - // Store 64 (8x4) outputs at a time. - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - output += 64; - - // Accumulate computed exponents. 
- vacc0 = _mm256_add_ps(vacc0, vf0); - vacc1 = _mm256_add_ps(vacc1, vf1); - vacc2 = _mm256_add_ps(vacc2, vf2); - vacc3 = _mm256_add_ps(vacc3, vf3); - vacc0 = _mm256_add_ps(vacc0, vf4); - vacc1 = _mm256_add_ps(vacc1, vf5); - vacc2 = _mm256_add_ps(vacc2, vf6); - vacc3 = _mm256_add_ps(vacc3, vf7); - } - // Add up all accumulators to vacc0 - vacc0 = _mm256_add_ps(vacc0, vacc1); - vacc2 = _mm256_add_ps(vacc2, vacc3); - vacc0 = _mm256_add_ps(vacc0, vacc2); - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - // Load 8 inputs at a time. - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - // Store 8 outputs at a time. - _mm256_storeu_ps(output, vf); - output += 8; - - // Accumulate computed exponents. - vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72-acc3.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72-acc3.c deleted file mode 100644 index 83632b4397b..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72-acc3.c +++ /dev/null @@ -1,356 
+0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72_acc3( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f); - const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - __m256 vacc1 = _mm256_setzero_ps(); - __m256 vacc2 = _mm256_setzero_ps(); - for (; batch >= 72 * sizeof(float); 
batch -= 72 * sizeof(float)) { - // Load 72 (9x4) inputs at a time. - const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - const __m256 vi8 = _mm256_loadu_ps(input + 64); - input += 72; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); - __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); - __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); - __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); - __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); - __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); - __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); - __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); - __m256 vn8 = _mm256_add_ps(_mm256_mul_ps(vx8, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - vn8 = _mm256_sub_ps(vn8, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); - __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); - __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); - __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); - __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); - __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); - __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); - __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); - __m256 vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_hi), vx8); - - vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); - vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); - vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); - vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); - vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); - vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_lo), vt8); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); - __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); - __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); - __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); - __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); - __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); - __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); - __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); - __m256 vp8 = _mm256_add_ps(_mm256_mul_ps(vc5, vt8), vc4); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc3); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc2); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc1); - - // Reconstruct the 
final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - vt8 = _mm256_mul_ps(vt8, vs8); - - __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); - __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); - __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); - __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); - __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); - __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); - __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); - __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); - __m256 vf8 = _mm256_add_ps(_mm256_mul_ps(vt8, vp8), vs8); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); - - // Store 72 (9x4) outputs at a time. 
- _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - _mm256_storeu_ps(output + 64, vf8); - output += 72; - - // Accumulate computed exponents. - vacc0 = _mm256_add_ps(vacc0, vf0); - vacc1 = _mm256_add_ps(vacc1, vf1); - vacc2 = _mm256_add_ps(vacc2, vf2); - vacc0 = _mm256_add_ps(vacc0, vf3); - vacc1 = _mm256_add_ps(vacc1, vf4); - vacc2 = _mm256_add_ps(vacc2, vf5); - vacc0 = _mm256_add_ps(vacc0, vf6); - vacc1 = _mm256_add_ps(vacc1, vf7); - vacc2 = _mm256_add_ps(vacc2, vf8); - } - // Add up all accumulators to vacc0 - vacc0 = _mm256_add_ps(vacc0, vacc1); - vacc0 = _mm256_add_ps(vacc0, vacc2); - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - // Load 8 inputs at a time. - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - // Store 8 outputs at a time. - _mm256_storeu_ps(output, vf); - output += 8; - - // Accumulate computed exponents. - vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). 
- vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
- vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72.c deleted file mode 100644 index 640e31a234e..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u72.c +++ /dev/null @@ -1,351 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f); - const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - for (; batch >= 72 * sizeof(float); batch -= 72 * sizeof(float)) { - // Load 72 (9x4) inputs at a time. 
- const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - const __m256 vi8 = _mm256_loadu_ps(input + 64); - input += 72; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); - __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); - __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); - __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); - __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); - __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); - __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); - __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); - __m256 vn8 = _mm256_add_ps(_mm256_mul_ps(vx8, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - vn8 = _mm256_sub_ps(vn8, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); - __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); - __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); - __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); - __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); - __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); - __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); - __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); - __m256 vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_hi), vx8); - - vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); - vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); - vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); - vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); - vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); - vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_lo), vt8); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); - __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); - __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); - __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); - __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); - __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); - __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); - __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); - __m256 vp8 = _mm256_add_ps(_mm256_mul_ps(vc5, vt8), vc4); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc3); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc2); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc1); - - // Reconstruct the 
final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - vt8 = _mm256_mul_ps(vt8, vs8); - - __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); - __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); - __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); - __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); - __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); - __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); - __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); - __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); - __m256 vf8 = _mm256_add_ps(_mm256_mul_ps(vt8, vp8), vs8); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); - - // Store 72 (9x4) outputs at a time. 
- _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - _mm256_storeu_ps(output + 64, vf8); - output += 72; - - // Accumulate computed exponents. - vacc0 = _mm256_add_ps(vacc0, vf0); - vacc0 = _mm256_add_ps(vacc0, vf1); - vacc0 = _mm256_add_ps(vacc0, vf2); - vacc0 = _mm256_add_ps(vacc0, vf3); - vacc0 = _mm256_add_ps(vacc0, vf4); - vacc0 = _mm256_add_ps(vacc0, vf5); - vacc0 = _mm256_add_ps(vacc0, vf6); - vacc0 = _mm256_add_ps(vacc0, vf7); - vacc0 = _mm256_add_ps(vacc0, vf8); - } - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - // Load 8 inputs at a time. - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - // Store 8 outputs at a time. - _mm256_storeu_ps(output, vf); - output += 8; - - // Accumulate computed exponents. - vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u8.c similarity index 56% rename from src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64.c rename to 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u8.c index d79849dc12a..1ac88f7b9f1 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u64.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u8.c @@ -11,10 +11,11 @@ #include +#include "xnnpack/intrinsics-polyfill.h" #include "xnnpack/raddstoreexpminusmax.h" -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64( +void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u8( size_t batch, const float* input, const float* max, @@ -56,169 +57,57 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64( const __m256 vi_max = _mm256_broadcast_ss(max); __m256 vacc0 = _mm256_setzero_ps(); - for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { - // Load 64 (8x4) inputs at a time. + for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { + // Load 8 (1x8) inputs at a time. const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - input += 64; + input += 8; // Subtract maximum input x := i - i_max. This implies x <= 0. const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); // Compute reduced argument batch := round(x / log(2)). 
__m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); - __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); - __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); - __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); - __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); - __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); - __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); - __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); // Subtract the large number back to get final batch := round(x / log(2)). vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); // Compute reduced argument t := x - batch * log(2). 
// Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); - __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); - __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); - __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); - __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); - __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); - __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); - __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); - vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); - vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); - vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); - vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
__m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); - __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); - __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); - __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); - __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); - __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); - __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); - __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); // Reconstruct the final f value: // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) // = s + (t * s) * p vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - 
vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); - __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); - __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); - __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); - __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); - __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); - __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); - __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); // For inputs below zero cutoff, replace output with +0.0f. // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - - // Store 64 (8x4) outputs at a time. + + // Store 8 (1x8) outputs at a time. _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - output += 64; + output += 8; // Accumulate computed exponents. 
vacc0 = _mm256_add_ps(vacc0, vf0); - vacc0 = _mm256_add_ps(vacc0, vf1); - vacc0 = _mm256_add_ps(vacc0, vf2); - vacc0 = _mm256_add_ps(vacc0, vf3); - vacc0 = _mm256_add_ps(vacc0, vf4); - vacc0 = _mm256_add_ps(vacc0, vf5); - vacc0 = _mm256_add_ps(vacc0, vf6); - vacc0 = _mm256_add_ps(vacc0, vf7); } __m256 vacc = vacc0; diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc2.c deleted file mode 100644 index c674de9489d..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc2.c +++ /dev/null @@ -1,370 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f); - const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const 
__m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - __m256 vacc1 = _mm256_setzero_ps(); - for (; batch >= 80 * sizeof(float); batch -= 80 * sizeof(float)) { - // Load 80 (10x4) inputs at a time. - const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - const __m256 vi8 = _mm256_loadu_ps(input + 64); - const __m256 vi9 = _mm256_loadu_ps(input + 72); - input += 80; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); - const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); - - // Compute reduced argument batch := round(x / log(2)). 
- __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); - __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); - __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); - __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); - __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); - __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); - __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); - __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); - __m256 vn8 = _mm256_add_ps(_mm256_mul_ps(vx8, vlog2e), vmagic_bias); - __m256 vn9 = _mm256_add_ps(_mm256_mul_ps(vx9, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); - const __m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). 
- vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - vn8 = _mm256_sub_ps(vn8, vmagic_bias); - vn9 = _mm256_sub_ps(vn9, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); - __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); - __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); - __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); - __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); - __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); - __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); - __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); - __m256 vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_hi), vx8); - __m256 vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_hi), vx9); - - vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); - vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); - vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); - vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); - vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); - vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_lo), vt8); - vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_lo), vt9); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); - __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); - __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); - __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); - __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); - __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); - __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); - __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); - __m256 vp8 = _mm256_add_ps(_mm256_mul_ps(vc5, vt8), vc4); - __m256 vp9 = _mm256_add_ps(_mm256_mul_ps(vc5, vt9), vc4); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc3); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc3); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc2); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc2); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); - vp6 = 
_mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc1); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - vt8 = _mm256_mul_ps(vt8, vs8); - vt9 = _mm256_mul_ps(vt9, vs9); - - __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); - __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); - __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); - __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); - __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); - __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); - __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); - __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); - __m256 vf8 = _mm256_add_ps(_mm256_mul_ps(vt8, vp8), vs8); - __m256 vf9 = _mm256_add_ps(_mm256_mul_ps(vt9, vp9), vs9); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
- vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); - vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); - - // Store 80 (10x4) outputs at a time. - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - _mm256_storeu_ps(output + 64, vf8); - _mm256_storeu_ps(output + 72, vf9); - output += 80; - - // Accumulate computed exponents. - vacc0 = _mm256_add_ps(vacc0, vf0); - vacc1 = _mm256_add_ps(vacc1, vf1); - vacc0 = _mm256_add_ps(vacc0, vf2); - vacc1 = _mm256_add_ps(vacc1, vf3); - vacc0 = _mm256_add_ps(vacc0, vf4); - vacc1 = _mm256_add_ps(vacc1, vf5); - vacc0 = _mm256_add_ps(vacc0, vf6); - vacc1 = _mm256_add_ps(vacc1, vf7); - vacc0 = _mm256_add_ps(vacc0, vf8); - vacc1 = _mm256_add_ps(vacc1, vf9); - } - // Add up all accumulators to vacc0 - vacc0 = _mm256_add_ps(vacc0, vacc1); - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - // Load 8 inputs at a time. - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - // Subtract maximum input x := i - i_max. This implies x <= 0. 
- const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - // Store 8 outputs at a time. - _mm256_storeu_ps(output, vf); - output += 8; - - // Accumulate computed exponents. 
- vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
- vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc5.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc5.c deleted file mode 100644 index afc39c1c9da..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80-acc5.c +++ /dev/null @@ -1,376 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc5( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f); - const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - __m256 vacc1 = _mm256_setzero_ps(); - __m256 vacc2 = _mm256_setzero_ps(); - __m256 vacc3 = _mm256_setzero_ps(); - __m256 vacc4 = _mm256_setzero_ps(); - for (; batch >= 80 * sizeof(float); batch -= 80 * sizeof(float)) { - // Load 80 (10x4) inputs at a time. 
- const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - const __m256 vi8 = _mm256_loadu_ps(input + 64); - const __m256 vi9 = _mm256_loadu_ps(input + 72); - input += 80; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); - const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); - __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); - __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); - __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); - __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); - __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); - __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); - __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); - __m256 vn8 = _mm256_add_ps(_mm256_mul_ps(vx8, vlog2e), vmagic_bias); - __m256 vn9 = _mm256_add_ps(_mm256_mul_ps(vx9, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. 
- // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); - const __m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - vn8 = _mm256_sub_ps(vn8, vmagic_bias); - vn9 = _mm256_sub_ps(vn9, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); - __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); - __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); - __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); - __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); - __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); - __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); - __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); - __m256 vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_hi), vx8); - __m256 vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_hi), vx9); - - vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); - vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); - vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); - vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); - vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); - vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_lo), vt8); - vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_lo), vt9); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); - __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); - __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); - __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); - __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); - __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); - __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); - __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); - __m256 vp8 = _mm256_add_ps(_mm256_mul_ps(vc5, vt8), vc4); - __m256 vp9 = _mm256_add_ps(_mm256_mul_ps(vc5, vt9), vc4); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc3); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc3); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc2); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc2); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); - vp6 = 
_mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc1); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - vt8 = _mm256_mul_ps(vt8, vs8); - vt9 = _mm256_mul_ps(vt9, vs9); - - __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); - __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); - __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); - __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); - __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); - __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); - __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); - __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); - __m256 vf8 = _mm256_add_ps(_mm256_mul_ps(vt8, vp8), vs8); - __m256 vf9 = _mm256_add_ps(_mm256_mul_ps(vt9, vp9), vs9); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
- vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); - vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); - - // Store 80 (10x4) outputs at a time. - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - _mm256_storeu_ps(output + 64, vf8); - _mm256_storeu_ps(output + 72, vf9); - output += 80; - - // Accumulate computed exponents. - vacc0 = _mm256_add_ps(vacc0, vf0); - vacc1 = _mm256_add_ps(vacc1, vf1); - vacc2 = _mm256_add_ps(vacc2, vf2); - vacc3 = _mm256_add_ps(vacc3, vf3); - vacc4 = _mm256_add_ps(vacc4, vf4); - vacc0 = _mm256_add_ps(vacc0, vf5); - vacc1 = _mm256_add_ps(vacc1, vf6); - vacc2 = _mm256_add_ps(vacc2, vf7); - vacc3 = _mm256_add_ps(vacc3, vf8); - vacc4 = _mm256_add_ps(vacc4, vf9); - } - // Add up all accumulators to vacc0 - vacc0 = _mm256_add_ps(vacc0, vacc1); - vacc2 = _mm256_add_ps(vacc2, vacc3); - vacc0 = _mm256_add_ps(vacc0, vacc2); - vacc0 = _mm256_add_ps(vacc0, vacc4); - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - // Load 8 inputs at a time. 
- const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - // Store 8 outputs at a time. - _mm256_storeu_ps(output, vf); - output += 8; - - // Accumulate computed exponents. 
- vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
- vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80.c deleted file mode 100644 index 8642c6df381..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u80.c +++ /dev/null @@ -1,367 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f); - const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - for (; batch >= 80 * sizeof(float); batch -= 80 * sizeof(float)) { - // Load 80 (10x4) inputs at a time. 
- const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - const __m256 vi8 = _mm256_loadu_ps(input + 64); - const __m256 vi9 = _mm256_loadu_ps(input + 72); - input += 80; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); - const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); - __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); - __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); - __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); - __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); - __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); - __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); - __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); - __m256 vn8 = _mm256_add_ps(_mm256_mul_ps(vx8, vlog2e), vmagic_bias); - __m256 vn9 = _mm256_add_ps(_mm256_mul_ps(vx9, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. 
- // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); - const __m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - vn8 = _mm256_sub_ps(vn8, vmagic_bias); - vn9 = _mm256_sub_ps(vn9, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); - __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); - __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); - __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); - __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); - __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); - __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); - __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); - __m256 vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_hi), vx8); - __m256 vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_hi), vx9); - - vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); - vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); - vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); - vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); - vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); - vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_lo), vt8); - vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_lo), vt9); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); - __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); - __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); - __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); - __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); - __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); - __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); - __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); - __m256 vp8 = _mm256_add_ps(_mm256_mul_ps(vc5, vt8), vc4); - __m256 vp9 = _mm256_add_ps(_mm256_mul_ps(vc5, vt9), vc4); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc3); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc3); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc2); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc2); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); - vp6 = 
_mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc1); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - vt8 = _mm256_mul_ps(vt8, vs8); - vt9 = _mm256_mul_ps(vt9, vs9); - - __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); - __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); - __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); - __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); - __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); - __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); - __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); - __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); - __m256 vf8 = _mm256_add_ps(_mm256_mul_ps(vt8, vp8), vs8); - __m256 vf9 = _mm256_add_ps(_mm256_mul_ps(vt9, vp9), vs9); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
- vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); - vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); - - // Store 80 (10x4) outputs at a time. - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - _mm256_storeu_ps(output + 64, vf8); - _mm256_storeu_ps(output + 72, vf9); - output += 80; - - // Accumulate computed exponents. - vacc0 = _mm256_add_ps(vacc0, vf0); - vacc0 = _mm256_add_ps(vacc0, vf1); - vacc0 = _mm256_add_ps(vacc0, vf2); - vacc0 = _mm256_add_ps(vacc0, vf3); - vacc0 = _mm256_add_ps(vacc0, vf4); - vacc0 = _mm256_add_ps(vacc0, vf5); - vacc0 = _mm256_add_ps(vacc0, vf6); - vacc0 = _mm256_add_ps(vacc0, vf7); - vacc0 = _mm256_add_ps(vacc0, vf8); - vacc0 = _mm256_add_ps(vacc0, vf9); - } - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - // Load 8 inputs at a time. - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). 
- __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - // Store 8 outputs at a time. - _mm256_storeu_ps(output, vf); - output += 8; - - // Accumulate computed exponents. 
- vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
- vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc2.c deleted file mode 100644 index 590e5f5ccbf..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc2.c +++ /dev/null @@ -1,402 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f); - const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - __m256 vacc1 = _mm256_setzero_ps(); - for (; batch >= 96 * sizeof(float); batch -= 96 * sizeof(float)) { - // Load 96 (12x4) inputs at a time. 
- const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - const __m256 vi8 = _mm256_loadu_ps(input + 64); - const __m256 vi9 = _mm256_loadu_ps(input + 72); - const __m256 vi10 = _mm256_loadu_ps(input + 80); - const __m256 vi11 = _mm256_loadu_ps(input + 88); - input += 96; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); - const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); - const __m256 vx10 = _mm256_sub_ps(vi10, vi_max); - const __m256 vx11 = _mm256_sub_ps(vi11, vi_max); - - // Compute reduced argument batch := round(x / log(2)). 
- __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); - __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); - __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); - __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); - __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); - __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); - __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); - __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); - __m256 vn8 = _mm256_add_ps(_mm256_mul_ps(vx8, vlog2e), vmagic_bias); - __m256 vn9 = _mm256_add_ps(_mm256_mul_ps(vx9, vlog2e), vmagic_bias); - __m256 vn10 = _mm256_add_ps(_mm256_mul_ps(vx10, vlog2e), vmagic_bias); - __m256 vn11 = _mm256_add_ps(_mm256_mul_ps(vx11, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); - const __m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); - const __m256 vs10 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn10), 23)); - const __m256 vs11 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn11), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - vn8 = _mm256_sub_ps(vn8, vmagic_bias); - vn9 = _mm256_sub_ps(vn9, vmagic_bias); - vn10 = _mm256_sub_ps(vn10, vmagic_bias); - vn11 = _mm256_sub_ps(vn11, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); - __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); - __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); - __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); - __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); - __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); - __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); - __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); - __m256 vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_hi), vx8); - __m256 vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_hi), vx9); - __m256 vt10 = _mm256_add_ps(_mm256_mul_ps(vn10, vminus_ln2_hi), vx10); - __m256 vt11 = _mm256_add_ps(_mm256_mul_ps(vn11, vminus_ln2_hi), vx11); - - vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); - vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); - vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); - vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); - vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); - vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_lo), vt8); - vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_lo), vt9); - vt10 = _mm256_add_ps(_mm256_mul_ps(vn10, vminus_ln2_lo), vt10); - vt11 = _mm256_add_ps(_mm256_mul_ps(vn11, vminus_ln2_lo), vt11); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); - __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); - __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); - __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); - __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); - __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); - __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); - __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); - __m256 vp8 = _mm256_add_ps(_mm256_mul_ps(vc5, vt8), vc4); - __m256 vp9 = _mm256_add_ps(_mm256_mul_ps(vc5, vt9), vc4); - __m256 vp10 = _mm256_add_ps(_mm256_mul_ps(vc5, vt10), vc4); - __m256 vp11 = _mm256_add_ps(_mm256_mul_ps(vc5, vt11), vc4); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc3); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc3); - vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc3); - vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, vt11), vc3); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc2); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc2); - vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc2); - vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, 
vt11), vc2); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc1); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc1); - vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc1); - vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, vt11), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - vt8 = _mm256_mul_ps(vt8, vs8); - vt9 = _mm256_mul_ps(vt9, vs9); - vt10 = _mm256_mul_ps(vt10, vs10); - vt11 = _mm256_mul_ps(vt11, vs11); - - __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); - __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); - __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); - __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); - __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); - __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); - __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); - __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); - __m256 vf8 = _mm256_add_ps(_mm256_mul_ps(vt8, vp8), vs8); - __m256 vf9 = _mm256_add_ps(_mm256_mul_ps(vt9, vp9), vs9); - __m256 vf10 = _mm256_add_ps(_mm256_mul_ps(vt10, vp10), vs10); - __m256 vf11 = _mm256_add_ps(_mm256_mul_ps(vt11, vp11), vs11); - - // For 
inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); - vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); - vf10 = _mm256_andnot_ps(_mm256_cmp_ps(vx10, vdenorm_cutoff, _CMP_LT_OS), vf10); - vf11 = _mm256_andnot_ps(_mm256_cmp_ps(vx11, vdenorm_cutoff, _CMP_LT_OS), vf11); - - // Store 96 (12x4) outputs at a time. - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - _mm256_storeu_ps(output + 64, vf8); - _mm256_storeu_ps(output + 72, vf9); - _mm256_storeu_ps(output + 80, vf10); - _mm256_storeu_ps(output + 88, vf11); - output += 96; - - // Accumulate computed exponents. 
- vacc0 = _mm256_add_ps(vacc0, vf0); - vacc1 = _mm256_add_ps(vacc1, vf1); - vacc0 = _mm256_add_ps(vacc0, vf2); - vacc1 = _mm256_add_ps(vacc1, vf3); - vacc0 = _mm256_add_ps(vacc0, vf4); - vacc1 = _mm256_add_ps(vacc1, vf5); - vacc0 = _mm256_add_ps(vacc0, vf6); - vacc1 = _mm256_add_ps(vacc1, vf7); - vacc0 = _mm256_add_ps(vacc0, vf8); - vacc1 = _mm256_add_ps(vacc1, vf9); - vacc0 = _mm256_add_ps(vacc0, vf10); - vacc1 = _mm256_add_ps(vacc1, vf11); - } - // Add up all accumulators to vacc0 - vacc0 = _mm256_add_ps(vacc0, vacc1); - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - // Load 8 inputs at a time. - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - // Store 8 outputs at a time. - _mm256_storeu_ps(output, vf); - output += 8; - - // Accumulate computed exponents. - vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc3.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc3.c deleted file mode 100644 index d5650025e2a..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc3.c +++ /dev/null @@ -1,404 
+0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc3( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f); - const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - __m256 vacc1 = _mm256_setzero_ps(); - __m256 vacc2 = _mm256_setzero_ps(); - for (; batch >= 96 * sizeof(float); 
batch -= 96 * sizeof(float)) { - // Load 96 (12x4) inputs at a time. - const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - const __m256 vi8 = _mm256_loadu_ps(input + 64); - const __m256 vi9 = _mm256_loadu_ps(input + 72); - const __m256 vi10 = _mm256_loadu_ps(input + 80); - const __m256 vi11 = _mm256_loadu_ps(input + 88); - input += 96; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); - const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); - const __m256 vx10 = _mm256_sub_ps(vi10, vi_max); - const __m256 vx11 = _mm256_sub_ps(vi11, vi_max); - - // Compute reduced argument batch := round(x / log(2)). 
- __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); - __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); - __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); - __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); - __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); - __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); - __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); - __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); - __m256 vn8 = _mm256_add_ps(_mm256_mul_ps(vx8, vlog2e), vmagic_bias); - __m256 vn9 = _mm256_add_ps(_mm256_mul_ps(vx9, vlog2e), vmagic_bias); - __m256 vn10 = _mm256_add_ps(_mm256_mul_ps(vx10, vlog2e), vmagic_bias); - __m256 vn11 = _mm256_add_ps(_mm256_mul_ps(vx11, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); - const __m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); - const __m256 vs10 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn10), 23)); - const __m256 vs11 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn11), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - vn8 = _mm256_sub_ps(vn8, vmagic_bias); - vn9 = _mm256_sub_ps(vn9, vmagic_bias); - vn10 = _mm256_sub_ps(vn10, vmagic_bias); - vn11 = _mm256_sub_ps(vn11, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); - __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); - __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); - __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); - __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); - __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); - __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); - __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); - __m256 vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_hi), vx8); - __m256 vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_hi), vx9); - __m256 vt10 = _mm256_add_ps(_mm256_mul_ps(vn10, vminus_ln2_hi), vx10); - __m256 vt11 = _mm256_add_ps(_mm256_mul_ps(vn11, vminus_ln2_hi), vx11); - - vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); - vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); - vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); - vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); - vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); - vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_lo), vt8); - vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_lo), vt9); - vt10 = _mm256_add_ps(_mm256_mul_ps(vn10, vminus_ln2_lo), vt10); - vt11 = _mm256_add_ps(_mm256_mul_ps(vn11, vminus_ln2_lo), vt11); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); - __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); - __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); - __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); - __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); - __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); - __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); - __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); - __m256 vp8 = _mm256_add_ps(_mm256_mul_ps(vc5, vt8), vc4); - __m256 vp9 = _mm256_add_ps(_mm256_mul_ps(vc5, vt9), vc4); - __m256 vp10 = _mm256_add_ps(_mm256_mul_ps(vc5, vt10), vc4); - __m256 vp11 = _mm256_add_ps(_mm256_mul_ps(vc5, vt11), vc4); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc3); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc3); - vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc3); - vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, vt11), vc3); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc2); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc2); - vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc2); - vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, 
vt11), vc2); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc1); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc1); - vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc1); - vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, vt11), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - vt8 = _mm256_mul_ps(vt8, vs8); - vt9 = _mm256_mul_ps(vt9, vs9); - vt10 = _mm256_mul_ps(vt10, vs10); - vt11 = _mm256_mul_ps(vt11, vs11); - - __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); - __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); - __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); - __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); - __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); - __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); - __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); - __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); - __m256 vf8 = _mm256_add_ps(_mm256_mul_ps(vt8, vp8), vs8); - __m256 vf9 = _mm256_add_ps(_mm256_mul_ps(vt9, vp9), vs9); - __m256 vf10 = _mm256_add_ps(_mm256_mul_ps(vt10, vp10), vs10); - __m256 vf11 = _mm256_add_ps(_mm256_mul_ps(vt11, vp11), vs11); - - // For 
inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); - vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); - vf10 = _mm256_andnot_ps(_mm256_cmp_ps(vx10, vdenorm_cutoff, _CMP_LT_OS), vf10); - vf11 = _mm256_andnot_ps(_mm256_cmp_ps(vx11, vdenorm_cutoff, _CMP_LT_OS), vf11); - - // Store 96 (12x4) outputs at a time. - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - _mm256_storeu_ps(output + 64, vf8); - _mm256_storeu_ps(output + 72, vf9); - _mm256_storeu_ps(output + 80, vf10); - _mm256_storeu_ps(output + 88, vf11); - output += 96; - - // Accumulate computed exponents. 
- vacc0 = _mm256_add_ps(vacc0, vf0); - vacc1 = _mm256_add_ps(vacc1, vf1); - vacc2 = _mm256_add_ps(vacc2, vf2); - vacc0 = _mm256_add_ps(vacc0, vf3); - vacc1 = _mm256_add_ps(vacc1, vf4); - vacc2 = _mm256_add_ps(vacc2, vf5); - vacc0 = _mm256_add_ps(vacc0, vf6); - vacc1 = _mm256_add_ps(vacc1, vf7); - vacc2 = _mm256_add_ps(vacc2, vf8); - vacc0 = _mm256_add_ps(vacc0, vf9); - vacc1 = _mm256_add_ps(vacc1, vf10); - vacc2 = _mm256_add_ps(vacc2, vf11); - } - // Add up all accumulators to vacc0 - vacc0 = _mm256_add_ps(vacc0, vacc1); - vacc0 = _mm256_add_ps(vacc0, vacc2); - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - // Load 8 inputs at a time. - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - // Store 8 outputs at a time. - _mm256_storeu_ps(output, vf); - output += 8; - - // Accumulate computed exponents. - vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc6.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc6.c deleted file mode 100644 index 3b0891d92a7..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96-acc6.c +++ /dev/null @@ -1,410 
+0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc6( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f); - const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - __m256 vacc1 = _mm256_setzero_ps(); - __m256 vacc2 = _mm256_setzero_ps(); - __m256 vacc3 = _mm256_setzero_ps(); - 
__m256 vacc4 = _mm256_setzero_ps(); - __m256 vacc5 = _mm256_setzero_ps(); - for (; batch >= 96 * sizeof(float); batch -= 96 * sizeof(float)) { - // Load 96 (12x4) inputs at a time. - const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - const __m256 vi8 = _mm256_loadu_ps(input + 64); - const __m256 vi9 = _mm256_loadu_ps(input + 72); - const __m256 vi10 = _mm256_loadu_ps(input + 80); - const __m256 vi11 = _mm256_loadu_ps(input + 88); - input += 96; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); - const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); - const __m256 vx10 = _mm256_sub_ps(vi10, vi_max); - const __m256 vx11 = _mm256_sub_ps(vi11, vi_max); - - // Compute reduced argument batch := round(x / log(2)). 
- __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); - __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); - __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); - __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); - __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); - __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); - __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); - __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); - __m256 vn8 = _mm256_add_ps(_mm256_mul_ps(vx8, vlog2e), vmagic_bias); - __m256 vn9 = _mm256_add_ps(_mm256_mul_ps(vx9, vlog2e), vmagic_bias); - __m256 vn10 = _mm256_add_ps(_mm256_mul_ps(vx10, vlog2e), vmagic_bias); - __m256 vn11 = _mm256_add_ps(_mm256_mul_ps(vx11, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); - const __m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); - const __m256 vs10 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn10), 23)); - const __m256 vs11 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn11), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - vn8 = _mm256_sub_ps(vn8, vmagic_bias); - vn9 = _mm256_sub_ps(vn9, vmagic_bias); - vn10 = _mm256_sub_ps(vn10, vmagic_bias); - vn11 = _mm256_sub_ps(vn11, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); - __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); - __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); - __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); - __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); - __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); - __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); - __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); - __m256 vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_hi), vx8); - __m256 vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_hi), vx9); - __m256 vt10 = _mm256_add_ps(_mm256_mul_ps(vn10, vminus_ln2_hi), vx10); - __m256 vt11 = _mm256_add_ps(_mm256_mul_ps(vn11, vminus_ln2_hi), vx11); - - vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); - vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); - vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); - vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); - vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); - vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_lo), vt8); - vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_lo), vt9); - vt10 = _mm256_add_ps(_mm256_mul_ps(vn10, vminus_ln2_lo), vt10); - vt11 = _mm256_add_ps(_mm256_mul_ps(vn11, vminus_ln2_lo), vt11); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); - __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); - __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); - __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); - __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); - __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); - __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); - __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); - __m256 vp8 = _mm256_add_ps(_mm256_mul_ps(vc5, vt8), vc4); - __m256 vp9 = _mm256_add_ps(_mm256_mul_ps(vc5, vt9), vc4); - __m256 vp10 = _mm256_add_ps(_mm256_mul_ps(vc5, vt10), vc4); - __m256 vp11 = _mm256_add_ps(_mm256_mul_ps(vc5, vt11), vc4); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc3); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc3); - vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc3); - vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, vt11), vc3); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc2); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc2); - vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc2); - vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, 
vt11), vc2); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc1); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc1); - vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc1); - vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, vt11), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - vt8 = _mm256_mul_ps(vt8, vs8); - vt9 = _mm256_mul_ps(vt9, vs9); - vt10 = _mm256_mul_ps(vt10, vs10); - vt11 = _mm256_mul_ps(vt11, vs11); - - __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); - __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); - __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); - __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); - __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); - __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); - __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); - __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); - __m256 vf8 = _mm256_add_ps(_mm256_mul_ps(vt8, vp8), vs8); - __m256 vf9 = _mm256_add_ps(_mm256_mul_ps(vt9, vp9), vs9); - __m256 vf10 = _mm256_add_ps(_mm256_mul_ps(vt10, vp10), vs10); - __m256 vf11 = _mm256_add_ps(_mm256_mul_ps(vt11, vp11), vs11); - - // For 
inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); - vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); - vf10 = _mm256_andnot_ps(_mm256_cmp_ps(vx10, vdenorm_cutoff, _CMP_LT_OS), vf10); - vf11 = _mm256_andnot_ps(_mm256_cmp_ps(vx11, vdenorm_cutoff, _CMP_LT_OS), vf11); - - // Store 96 (12x4) outputs at a time. - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - _mm256_storeu_ps(output + 64, vf8); - _mm256_storeu_ps(output + 72, vf9); - _mm256_storeu_ps(output + 80, vf10); - _mm256_storeu_ps(output + 88, vf11); - output += 96; - - // Accumulate computed exponents. 
- vacc0 = _mm256_add_ps(vacc0, vf0); - vacc1 = _mm256_add_ps(vacc1, vf1); - vacc2 = _mm256_add_ps(vacc2, vf2); - vacc3 = _mm256_add_ps(vacc3, vf3); - vacc4 = _mm256_add_ps(vacc4, vf4); - vacc5 = _mm256_add_ps(vacc5, vf5); - vacc0 = _mm256_add_ps(vacc0, vf6); - vacc1 = _mm256_add_ps(vacc1, vf7); - vacc2 = _mm256_add_ps(vacc2, vf8); - vacc3 = _mm256_add_ps(vacc3, vf9); - vacc4 = _mm256_add_ps(vacc4, vf10); - vacc5 = _mm256_add_ps(vacc5, vf11); - } - // Add up all accumulators to vacc0 - vacc0 = _mm256_add_ps(vacc0, vacc1); - vacc2 = _mm256_add_ps(vacc2, vacc3); - vacc4 = _mm256_add_ps(vacc4, vacc5); - vacc0 = _mm256_add_ps(vacc0, vacc2); - vacc0 = _mm256_add_ps(vacc0, vacc4); - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - // Load 8 inputs at a time. - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - // Store 8 outputs at a time. - _mm256_storeu_ps(output, vf); - output += 8; - - // Accumulate computed exponents. - vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96.c deleted file mode 100644 index af744681554..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr2-p5-u96.c +++ /dev/null @@ -1,399 +0,0 @@ -// 
Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f); - const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f); - const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E400p-1f); - const __m256 vminus_ln2_lo = _mm256_set1_ps(-0x1.7F7D1Cp-20f); - const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f); - const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f); - const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f); - const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f); - const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f); - const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m256 vi_max = _mm256_broadcast_ss(max); - - __m256 vacc0 = _mm256_setzero_ps(); - for (; batch >= 96 * sizeof(float); batch -= 96 * sizeof(float)) { - // Load 96 (12x4) inputs at a time. 
- const __m256 vi0 = _mm256_loadu_ps(input); - const __m256 vi1 = _mm256_loadu_ps(input + 8); - const __m256 vi2 = _mm256_loadu_ps(input + 16); - const __m256 vi3 = _mm256_loadu_ps(input + 24); - const __m256 vi4 = _mm256_loadu_ps(input + 32); - const __m256 vi5 = _mm256_loadu_ps(input + 40); - const __m256 vi6 = _mm256_loadu_ps(input + 48); - const __m256 vi7 = _mm256_loadu_ps(input + 56); - const __m256 vi8 = _mm256_loadu_ps(input + 64); - const __m256 vi9 = _mm256_loadu_ps(input + 72); - const __m256 vi10 = _mm256_loadu_ps(input + 80); - const __m256 vi11 = _mm256_loadu_ps(input + 88); - input += 96; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx0 = _mm256_sub_ps(vi0, vi_max); - const __m256 vx1 = _mm256_sub_ps(vi1, vi_max); - const __m256 vx2 = _mm256_sub_ps(vi2, vi_max); - const __m256 vx3 = _mm256_sub_ps(vi3, vi_max); - const __m256 vx4 = _mm256_sub_ps(vi4, vi_max); - const __m256 vx5 = _mm256_sub_ps(vi5, vi_max); - const __m256 vx6 = _mm256_sub_ps(vi6, vi_max); - const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); - const __m256 vx8 = _mm256_sub_ps(vi8, vi_max); - const __m256 vx9 = _mm256_sub_ps(vi9, vi_max); - const __m256 vx10 = _mm256_sub_ps(vi10, vi_max); - const __m256 vx11 = _mm256_sub_ps(vi11, vi_max); - - // Compute reduced argument batch := round(x / log(2)). 
- __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vx0, vlog2e), vmagic_bias); - __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vx1, vlog2e), vmagic_bias); - __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vx2, vlog2e), vmagic_bias); - __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vx3, vlog2e), vmagic_bias); - __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vx4, vlog2e), vmagic_bias); - __m256 vn5 = _mm256_add_ps(_mm256_mul_ps(vx5, vlog2e), vmagic_bias); - __m256 vn6 = _mm256_add_ps(_mm256_mul_ps(vx6, vlog2e), vmagic_bias); - __m256 vn7 = _mm256_add_ps(_mm256_mul_ps(vx7, vlog2e), vmagic_bias); - __m256 vn8 = _mm256_add_ps(_mm256_mul_ps(vx8, vlog2e), vmagic_bias); - __m256 vn9 = _mm256_add_ps(_mm256_mul_ps(vx9, vlog2e), vmagic_bias); - __m256 vn10 = _mm256_add_ps(_mm256_mul_ps(vx10, vlog2e), vmagic_bias); - __m256 vn11 = _mm256_add_ps(_mm256_mul_ps(vx11, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23)); - const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23)); - const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23)); - const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23)); - const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23)); - const __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23)); - const __m256 vs6 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn6), 23)); - const __m256 vs7 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn7), 23)); - const __m256 vs8 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn8), 23)); - const __m256 vs9 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn9), 23)); - const __m256 vs10 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn10), 23)); - const __m256 vs11 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn11), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm256_sub_ps(vn0, vmagic_bias); - vn1 = _mm256_sub_ps(vn1, vmagic_bias); - vn2 = _mm256_sub_ps(vn2, vmagic_bias); - vn3 = _mm256_sub_ps(vn3, vmagic_bias); - vn4 = _mm256_sub_ps(vn4, vmagic_bias); - vn5 = _mm256_sub_ps(vn5, vmagic_bias); - vn6 = _mm256_sub_ps(vn6, vmagic_bias); - vn7 = _mm256_sub_ps(vn7, vmagic_bias); - vn8 = _mm256_sub_ps(vn8, vmagic_bias); - vn9 = _mm256_sub_ps(vn9, vmagic_bias); - vn10 = _mm256_sub_ps(vn10, vmagic_bias); - vn11 = _mm256_sub_ps(vn11, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vx0); - __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vx1); - __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vx2); - __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vx3); - __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vx4); - __m256 vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_hi), vx5); - __m256 vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_hi), vx6); - __m256 vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_hi), vx7); - __m256 vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_hi), vx8); - __m256 vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_hi), vx9); - __m256 vt10 = _mm256_add_ps(_mm256_mul_ps(vn10, vminus_ln2_hi), vx10); - __m256 vt11 = _mm256_add_ps(_mm256_mul_ps(vn11, vminus_ln2_hi), vx11); - - vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3); - vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4); - vt5 = _mm256_add_ps(_mm256_mul_ps(vn5, vminus_ln2_lo), vt5); - vt6 = _mm256_add_ps(_mm256_mul_ps(vn6, vminus_ln2_lo), vt6); - vt7 = _mm256_add_ps(_mm256_mul_ps(vn7, vminus_ln2_lo), vt7); - vt8 = _mm256_add_ps(_mm256_mul_ps(vn8, vminus_ln2_lo), vt8); - vt9 = _mm256_add_ps(_mm256_mul_ps(vn9, vminus_ln2_lo), vt9); - vt10 = _mm256_add_ps(_mm256_mul_ps(vn10, vminus_ln2_lo), vt10); - vt11 = _mm256_add_ps(_mm256_mul_ps(vn11, vminus_ln2_lo), vt11); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4); - __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4); - __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4); - __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4); - __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4); - __m256 vp5 = _mm256_add_ps(_mm256_mul_ps(vc5, vt5), vc4); - __m256 vp6 = _mm256_add_ps(_mm256_mul_ps(vc5, vt6), vc4); - __m256 vp7 = _mm256_add_ps(_mm256_mul_ps(vc5, vt7), vc4); - __m256 vp8 = _mm256_add_ps(_mm256_mul_ps(vc5, vt8), vc4); - __m256 vp9 = _mm256_add_ps(_mm256_mul_ps(vc5, vt9), vc4); - __m256 vp10 = _mm256_add_ps(_mm256_mul_ps(vc5, vt10), vc4); - __m256 vp11 = _mm256_add_ps(_mm256_mul_ps(vc5, vt11), vc4); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc3); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc3); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc3); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc3); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc3); - vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc3); - vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, vt11), vc3); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc2); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc2); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc2); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc2); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc2); - vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc2); - vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, 
vt11), vc2); - - vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1); - vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1); - vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1); - vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1); - vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1); - vp5 = _mm256_add_ps(_mm256_mul_ps(vp5, vt5), vc1); - vp6 = _mm256_add_ps(_mm256_mul_ps(vp6, vt6), vc1); - vp7 = _mm256_add_ps(_mm256_mul_ps(vp7, vt7), vc1); - vp8 = _mm256_add_ps(_mm256_mul_ps(vp8, vt8), vc1); - vp9 = _mm256_add_ps(_mm256_mul_ps(vp9, vt9), vc1); - vp10 = _mm256_add_ps(_mm256_mul_ps(vp10, vt10), vc1); - vp11 = _mm256_add_ps(_mm256_mul_ps(vp11, vt11), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm256_mul_ps(vt0, vs0); - vt1 = _mm256_mul_ps(vt1, vs1); - vt2 = _mm256_mul_ps(vt2, vs2); - vt3 = _mm256_mul_ps(vt3, vs3); - vt4 = _mm256_mul_ps(vt4, vs4); - vt5 = _mm256_mul_ps(vt5, vs5); - vt6 = _mm256_mul_ps(vt6, vs6); - vt7 = _mm256_mul_ps(vt7, vs7); - vt8 = _mm256_mul_ps(vt8, vs8); - vt9 = _mm256_mul_ps(vt9, vs9); - vt10 = _mm256_mul_ps(vt10, vs10); - vt11 = _mm256_mul_ps(vt11, vs11); - - __m256 vf0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0); - __m256 vf1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1); - __m256 vf2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2); - __m256 vf3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3); - __m256 vf4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4); - __m256 vf5 = _mm256_add_ps(_mm256_mul_ps(vt5, vp5), vs5); - __m256 vf6 = _mm256_add_ps(_mm256_mul_ps(vt6, vp6), vs6); - __m256 vf7 = _mm256_add_ps(_mm256_mul_ps(vt7, vp7), vs7); - __m256 vf8 = _mm256_add_ps(_mm256_mul_ps(vt8, vp8), vs8); - __m256 vf9 = _mm256_add_ps(_mm256_mul_ps(vt9, vp9), vs9); - __m256 vf10 = _mm256_add_ps(_mm256_mul_ps(vt10, vp10), vs10); - __m256 vf11 = _mm256_add_ps(_mm256_mul_ps(vt11, vp11), vs11); - - // For 
inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0); - vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1); - vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2); - vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3); - vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4); - vf5 = _mm256_andnot_ps(_mm256_cmp_ps(vx5, vdenorm_cutoff, _CMP_LT_OS), vf5); - vf6 = _mm256_andnot_ps(_mm256_cmp_ps(vx6, vdenorm_cutoff, _CMP_LT_OS), vf6); - vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); - vf8 = _mm256_andnot_ps(_mm256_cmp_ps(vx8, vdenorm_cutoff, _CMP_LT_OS), vf8); - vf9 = _mm256_andnot_ps(_mm256_cmp_ps(vx9, vdenorm_cutoff, _CMP_LT_OS), vf9); - vf10 = _mm256_andnot_ps(_mm256_cmp_ps(vx10, vdenorm_cutoff, _CMP_LT_OS), vf10); - vf11 = _mm256_andnot_ps(_mm256_cmp_ps(vx11, vdenorm_cutoff, _CMP_LT_OS), vf11); - - // Store 96 (12x4) outputs at a time. - _mm256_storeu_ps(output, vf0); - _mm256_storeu_ps(output + 8, vf1); - _mm256_storeu_ps(output + 16, vf2); - _mm256_storeu_ps(output + 24, vf3); - _mm256_storeu_ps(output + 32, vf4); - _mm256_storeu_ps(output + 40, vf5); - _mm256_storeu_ps(output + 48, vf6); - _mm256_storeu_ps(output + 56, vf7); - _mm256_storeu_ps(output + 64, vf8); - _mm256_storeu_ps(output + 72, vf9); - _mm256_storeu_ps(output + 80, vf10); - _mm256_storeu_ps(output + 88, vf11); - output += 96; - - // Accumulate computed exponents. 
- vacc0 = _mm256_add_ps(vacc0, vf0); - vacc0 = _mm256_add_ps(vacc0, vf1); - vacc0 = _mm256_add_ps(vacc0, vf2); - vacc0 = _mm256_add_ps(vacc0, vf3); - vacc0 = _mm256_add_ps(vacc0, vf4); - vacc0 = _mm256_add_ps(vacc0, vf5); - vacc0 = _mm256_add_ps(vacc0, vf6); - vacc0 = _mm256_add_ps(vacc0, vf7); - vacc0 = _mm256_add_ps(vacc0, vf8); - vacc0 = _mm256_add_ps(vacc0, vf9); - vacc0 = _mm256_add_ps(vacc0, vf10); - vacc0 = _mm256_add_ps(vacc0, vf11); - } - - __m256 vacc = vacc0; - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - // Load 8 inputs at a time. - const __m256 vi = _mm256_loadu_ps(input); - input += 8; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - // Store 8 outputs at a time. - _mm256_storeu_ps(output, vf); - output += 8; - - // Accumulate computed exponents. - vacc = _mm256_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[8] - batch)); - - const __m256 vi = _mm256_maskload_ps(input, vmask); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m256 vx = _mm256_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m256 vn = _mm256_add_ps(_mm256_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm256_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2); - vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm256_mul_ps(vt, vs); - __m256 vf = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf); - - __m128 vf_lo = _mm256_castps256_ps128(vf); - if (batch & (4 * sizeof(float))) { - _mm_storeu_ps(output, vf_lo); - vf_lo = _mm256_extractf128_ps(vf, 1); - output += 4; - } - if (batch & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) output, vf_lo); - vf_lo = _mm_movehl_ps(vf_lo, vf_lo); - output += 2; - } - if (batch & (1 * sizeof(float))) { - _mm_store_ss(output, vf_lo); - } - - vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask))); - } - __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1)); - vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo)); - vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo)); - _mm_store_ss(sum, vacc_lo); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128-acc2.c deleted file mode 100644 index 0cc8c9aee6a..00000000000 --- 
a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128-acc2.c +++ /dev/null @@ -1,218 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f); - const __m512 vminus_ln2 = _mm512_set1_ps(-0x1.62E430p-1f); - const __m512 vc5 = _mm512_set1_ps(0x1.0F9F9Cp-7f); - const __m512 vc4 = _mm512_set1_ps(0x1.573A1Ap-5f); - const __m512 vc3 = _mm512_set1_ps(0x1.555A80p-3f); - const __m512 vc2 = _mm512_set1_ps(0x1.FFFDC6p-2f); - const __m512 vc1 = _mm512_set1_ps(0x1.FFFFF6p-1f); - const __m512 vc0 = _mm512_set1_ps(1.0f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vc0); - - const __m512 vi_max = _mm512_set1_ps(*max); - - __m512 vacc0 = _mm512_setzero_ps(); - __m512 vacc1 = _mm512_setzero_ps(); - for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) { - const __m512 vi0 = _mm512_loadu_ps(input); - const __m512 vi1 = _mm512_loadu_ps(input + 16); - const __m512 vi2 = _mm512_loadu_ps(input + 32); - const __m512 vi3 = 
_mm512_loadu_ps(input + 48); - const __m512 vi4 = _mm512_loadu_ps(input + 64); - const __m512 vi5 = _mm512_loadu_ps(input + 80); - const __m512 vi6 = _mm512_loadu_ps(input + 96); - const __m512 vi7 = _mm512_loadu_ps(input + 112); - input += 128; - - const __m512 vx0 = _mm512_sub_ps(vi0, vi_max); - const __m512 vx1 = _mm512_sub_ps(vi1, vi_max); - const __m512 vx2 = _mm512_sub_ps(vi2, vi_max); - const __m512 vx3 = _mm512_sub_ps(vi3, vi_max); - const __m512 vx4 = _mm512_sub_ps(vi4, vi_max); - const __m512 vx5 = _mm512_sub_ps(vi5, vi_max); - const __m512 vx6 = _mm512_sub_ps(vi6, vi_max); - const __m512 vx7 = _mm512_sub_ps(vi7, vi_max); - - const __m512 vn0 = _mm512_roundscale_ps(_mm512_mul_ps(vx0, vlog2e), 0); - const __m512 vn1 = _mm512_roundscale_ps(_mm512_mul_ps(vx1, vlog2e), 0); - const __m512 vn2 = _mm512_roundscale_ps(_mm512_mul_ps(vx2, vlog2e), 0); - const __m512 vn3 = _mm512_roundscale_ps(_mm512_mul_ps(vx3, vlog2e), 0); - const __m512 vn4 = _mm512_roundscale_ps(_mm512_mul_ps(vx4, vlog2e), 0); - const __m512 vn5 = _mm512_roundscale_ps(_mm512_mul_ps(vx5, vlog2e), 0); - const __m512 vn6 = _mm512_roundscale_ps(_mm512_mul_ps(vx6, vlog2e), 0); - const __m512 vn7 = _mm512_roundscale_ps(_mm512_mul_ps(vx7, vlog2e), 0); - - const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0); - const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1); - const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2); - const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3); - const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4); - const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5); - const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6); - const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7); - - __m512 vp0 = _mm512_fmadd_ps(vc5, vt0, vc4); - __m512 vp1 = _mm512_fmadd_ps(vc5, vt1, vc4); - __m512 vp2 = _mm512_fmadd_ps(vc5, vt2, vc4); - __m512 vp3 = _mm512_fmadd_ps(vc5, vt3, vc4); - __m512 vp4 = _mm512_fmadd_ps(vc5, vt4, vc4); - __m512 vp5 = _mm512_fmadd_ps(vc5, vt5, vc4); 
- __m512 vp6 = _mm512_fmadd_ps(vc5, vt6, vc4); - __m512 vp7 = _mm512_fmadd_ps(vc5, vt7, vc4); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc3); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc2); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc1); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc0); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc0); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc0); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc0); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc0); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc0); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc0); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc0); - - const __m512 vf0 = _mm512_scalef_ps(vp0, vn0); - const __m512 vf1 = _mm512_scalef_ps(vp1, vn1); - const __m512 vf2 = _mm512_scalef_ps(vp2, vn2); - const __m512 vf3 = _mm512_scalef_ps(vp3, vn3); - const __m512 vf4 = _mm512_scalef_ps(vp4, vn4); - const __m512 vf5 = _mm512_scalef_ps(vp5, vn5); - const __m512 vf6 = _mm512_scalef_ps(vp6, vn6); - const __m512 vf7 = _mm512_scalef_ps(vp7, vn7); - - _mm512_storeu_ps(output, vf0); - _mm512_storeu_ps(output + 16, vf1); - _mm512_storeu_ps(output + 32, vf2); - _mm512_storeu_ps(output + 48, vf3); - _mm512_storeu_ps(output + 64, vf4); - _mm512_storeu_ps(output + 80, 
vf5); - _mm512_storeu_ps(output + 96, vf6); - _mm512_storeu_ps(output + 112, vf7); - output += 128; - - vacc0 = _mm512_add_ps(vacc0, vf0); - vacc1 = _mm512_add_ps(vacc1, vf1); - vacc0 = _mm512_add_ps(vacc0, vf2); - vacc1 = _mm512_add_ps(vacc1, vf3); - vacc0 = _mm512_add_ps(vacc0, vf4); - vacc1 = _mm512_add_ps(vacc1, vf5); - vacc0 = _mm512_add_ps(vacc0, vf6); - vacc1 = _mm512_add_ps(vacc1, vf7); - } - vacc0 = _mm512_add_ps(vacc0, vacc1); - - __m512 vacc = vacc0; - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const __m512 vi = _mm512_loadu_ps(input); - input += 16; - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_storeu_ps(output, vf); - output += 16; - - vacc = _mm512_add_ps(vacc, vf); - } - if (batch != 0) { - // Prepare mask for valid 32-bit batch (depends on batch). 
- batch >>= XNN_LOG2_SIZEOF_FLOAT; - const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - const __m512 vi = _mm512_maskz_loadu_ps(vmask, input); - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_mask_storeu_ps(output, vmask, vf); - - vacc = _mm512_mask_add_ps(vacc, vmask, vacc, vf); - } - *sum = _mm512_reduce_add_ps(vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128-acc4.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128-acc4.c deleted file mode 100644 index 6027eb7e559..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128-acc4.c +++ /dev/null @@ -1,222 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128_acc4( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f); - const __m512 vminus_ln2 = _mm512_set1_ps(-0x1.62E430p-1f); - const __m512 vc5 = _mm512_set1_ps(0x1.0F9F9Cp-7f); - const __m512 vc4 = _mm512_set1_ps(0x1.573A1Ap-5f); - const __m512 vc3 = _mm512_set1_ps(0x1.555A80p-3f); - const __m512 vc2 = _mm512_set1_ps(0x1.FFFDC6p-2f); - const __m512 vc1 = _mm512_set1_ps(0x1.FFFFF6p-1f); - const __m512 vc0 = _mm512_set1_ps(1.0f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vc0); - - const __m512 vi_max = _mm512_set1_ps(*max); - - __m512 vacc0 = _mm512_setzero_ps(); - __m512 vacc1 = _mm512_setzero_ps(); - __m512 vacc2 = _mm512_setzero_ps(); - __m512 vacc3 = _mm512_setzero_ps(); - for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) { - const __m512 vi0 = _mm512_loadu_ps(input); - const __m512 vi1 = _mm512_loadu_ps(input + 16); - const __m512 vi2 = _mm512_loadu_ps(input + 32); - const __m512 vi3 = _mm512_loadu_ps(input + 48); - const __m512 vi4 = _mm512_loadu_ps(input + 64); - const __m512 vi5 = _mm512_loadu_ps(input + 80); - const __m512 vi6 = _mm512_loadu_ps(input + 96); - const __m512 vi7 = _mm512_loadu_ps(input + 112); - input += 128; - - const __m512 vx0 = _mm512_sub_ps(vi0, vi_max); - const __m512 vx1 = _mm512_sub_ps(vi1, vi_max); - const __m512 
vx2 = _mm512_sub_ps(vi2, vi_max); - const __m512 vx3 = _mm512_sub_ps(vi3, vi_max); - const __m512 vx4 = _mm512_sub_ps(vi4, vi_max); - const __m512 vx5 = _mm512_sub_ps(vi5, vi_max); - const __m512 vx6 = _mm512_sub_ps(vi6, vi_max); - const __m512 vx7 = _mm512_sub_ps(vi7, vi_max); - - const __m512 vn0 = _mm512_roundscale_ps(_mm512_mul_ps(vx0, vlog2e), 0); - const __m512 vn1 = _mm512_roundscale_ps(_mm512_mul_ps(vx1, vlog2e), 0); - const __m512 vn2 = _mm512_roundscale_ps(_mm512_mul_ps(vx2, vlog2e), 0); - const __m512 vn3 = _mm512_roundscale_ps(_mm512_mul_ps(vx3, vlog2e), 0); - const __m512 vn4 = _mm512_roundscale_ps(_mm512_mul_ps(vx4, vlog2e), 0); - const __m512 vn5 = _mm512_roundscale_ps(_mm512_mul_ps(vx5, vlog2e), 0); - const __m512 vn6 = _mm512_roundscale_ps(_mm512_mul_ps(vx6, vlog2e), 0); - const __m512 vn7 = _mm512_roundscale_ps(_mm512_mul_ps(vx7, vlog2e), 0); - - const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0); - const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1); - const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2); - const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3); - const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4); - const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5); - const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6); - const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7); - - __m512 vp0 = _mm512_fmadd_ps(vc5, vt0, vc4); - __m512 vp1 = _mm512_fmadd_ps(vc5, vt1, vc4); - __m512 vp2 = _mm512_fmadd_ps(vc5, vt2, vc4); - __m512 vp3 = _mm512_fmadd_ps(vc5, vt3, vc4); - __m512 vp4 = _mm512_fmadd_ps(vc5, vt4, vc4); - __m512 vp5 = _mm512_fmadd_ps(vc5, vt5, vc4); - __m512 vp6 = _mm512_fmadd_ps(vc5, vt6, vc4); - __m512 vp7 = _mm512_fmadd_ps(vc5, vt7, vc4); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc3); - vp6 = 
_mm512_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc3); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc2); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc1); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc0); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc0); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc0); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc0); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc0); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc0); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc0); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc0); - - const __m512 vf0 = _mm512_scalef_ps(vp0, vn0); - const __m512 vf1 = _mm512_scalef_ps(vp1, vn1); - const __m512 vf2 = _mm512_scalef_ps(vp2, vn2); - const __m512 vf3 = _mm512_scalef_ps(vp3, vn3); - const __m512 vf4 = _mm512_scalef_ps(vp4, vn4); - const __m512 vf5 = _mm512_scalef_ps(vp5, vn5); - const __m512 vf6 = _mm512_scalef_ps(vp6, vn6); - const __m512 vf7 = _mm512_scalef_ps(vp7, vn7); - - _mm512_storeu_ps(output, vf0); - _mm512_storeu_ps(output + 16, vf1); - _mm512_storeu_ps(output + 32, vf2); - _mm512_storeu_ps(output + 48, vf3); - _mm512_storeu_ps(output + 64, vf4); - _mm512_storeu_ps(output + 80, vf5); - _mm512_storeu_ps(output + 96, vf6); - _mm512_storeu_ps(output + 112, vf7); - output += 128; - - vacc0 = _mm512_add_ps(vacc0, vf0); - vacc1 = _mm512_add_ps(vacc1, vf1); - vacc2 = _mm512_add_ps(vacc2, vf2); - vacc3 = _mm512_add_ps(vacc3, vf3); - vacc0 = _mm512_add_ps(vacc0, vf4); - vacc1 = _mm512_add_ps(vacc1, vf5); - vacc2 = 
_mm512_add_ps(vacc2, vf6); - vacc3 = _mm512_add_ps(vacc3, vf7); - } - vacc0 = _mm512_add_ps(vacc0, vacc1); - vacc2 = _mm512_add_ps(vacc2, vacc3); - vacc0 = _mm512_add_ps(vacc0, vacc2); - - __m512 vacc = vacc0; - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const __m512 vi = _mm512_loadu_ps(input); - input += 16; - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_storeu_ps(output, vf); - output += 16; - - vacc = _mm512_add_ps(vacc, vf); - } - if (batch != 0) { - // Prepare mask for valid 32-bit batch (depends on batch). - batch >>= XNN_LOG2_SIZEOF_FLOAT; - const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - const __m512 vi = _mm512_maskz_loadu_ps(vmask, input); - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_mask_storeu_ps(output, vmask, vf); - - vacc = _mm512_mask_add_ps(vacc, vmask, vacc, vf); - } - *sum = _mm512_reduce_add_ps(vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128.c deleted file mode 100644 index 56bb0e8adec..00000000000 --- 
a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u128.c +++ /dev/null @@ -1,216 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f); - const __m512 vminus_ln2 = _mm512_set1_ps(-0x1.62E430p-1f); - const __m512 vc5 = _mm512_set1_ps(0x1.0F9F9Cp-7f); - const __m512 vc4 = _mm512_set1_ps(0x1.573A1Ap-5f); - const __m512 vc3 = _mm512_set1_ps(0x1.555A80p-3f); - const __m512 vc2 = _mm512_set1_ps(0x1.FFFDC6p-2f); - const __m512 vc1 = _mm512_set1_ps(0x1.FFFFF6p-1f); - const __m512 vc0 = _mm512_set1_ps(1.0f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vc0); - - const __m512 vi_max = _mm512_set1_ps(*max); - - __m512 vacc0 = _mm512_setzero_ps(); - for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) { - const __m512 vi0 = _mm512_loadu_ps(input); - const __m512 vi1 = _mm512_loadu_ps(input + 16); - const __m512 vi2 = _mm512_loadu_ps(input + 32); - const __m512 vi3 = _mm512_loadu_ps(input + 48); - const __m512 vi4 = 
_mm512_loadu_ps(input + 64); - const __m512 vi5 = _mm512_loadu_ps(input + 80); - const __m512 vi6 = _mm512_loadu_ps(input + 96); - const __m512 vi7 = _mm512_loadu_ps(input + 112); - input += 128; - - const __m512 vx0 = _mm512_sub_ps(vi0, vi_max); - const __m512 vx1 = _mm512_sub_ps(vi1, vi_max); - const __m512 vx2 = _mm512_sub_ps(vi2, vi_max); - const __m512 vx3 = _mm512_sub_ps(vi3, vi_max); - const __m512 vx4 = _mm512_sub_ps(vi4, vi_max); - const __m512 vx5 = _mm512_sub_ps(vi5, vi_max); - const __m512 vx6 = _mm512_sub_ps(vi6, vi_max); - const __m512 vx7 = _mm512_sub_ps(vi7, vi_max); - - const __m512 vn0 = _mm512_roundscale_ps(_mm512_mul_ps(vx0, vlog2e), 0); - const __m512 vn1 = _mm512_roundscale_ps(_mm512_mul_ps(vx1, vlog2e), 0); - const __m512 vn2 = _mm512_roundscale_ps(_mm512_mul_ps(vx2, vlog2e), 0); - const __m512 vn3 = _mm512_roundscale_ps(_mm512_mul_ps(vx3, vlog2e), 0); - const __m512 vn4 = _mm512_roundscale_ps(_mm512_mul_ps(vx4, vlog2e), 0); - const __m512 vn5 = _mm512_roundscale_ps(_mm512_mul_ps(vx5, vlog2e), 0); - const __m512 vn6 = _mm512_roundscale_ps(_mm512_mul_ps(vx6, vlog2e), 0); - const __m512 vn7 = _mm512_roundscale_ps(_mm512_mul_ps(vx7, vlog2e), 0); - - const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0); - const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1); - const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2); - const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3); - const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4); - const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5); - const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6); - const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7); - - __m512 vp0 = _mm512_fmadd_ps(vc5, vt0, vc4); - __m512 vp1 = _mm512_fmadd_ps(vc5, vt1, vc4); - __m512 vp2 = _mm512_fmadd_ps(vc5, vt2, vc4); - __m512 vp3 = _mm512_fmadd_ps(vc5, vt3, vc4); - __m512 vp4 = _mm512_fmadd_ps(vc5, vt4, vc4); - __m512 vp5 = _mm512_fmadd_ps(vc5, vt5, vc4); - __m512 vp6 = _mm512_fmadd_ps(vc5, vt6, vc4); - 
__m512 vp7 = _mm512_fmadd_ps(vc5, vt7, vc4); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc3); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc2); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc1); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc0); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc0); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc0); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc0); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc0); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc0); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc0); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc0); - - const __m512 vf0 = _mm512_scalef_ps(vp0, vn0); - const __m512 vf1 = _mm512_scalef_ps(vp1, vn1); - const __m512 vf2 = _mm512_scalef_ps(vp2, vn2); - const __m512 vf3 = _mm512_scalef_ps(vp3, vn3); - const __m512 vf4 = _mm512_scalef_ps(vp4, vn4); - const __m512 vf5 = _mm512_scalef_ps(vp5, vn5); - const __m512 vf6 = _mm512_scalef_ps(vp6, vn6); - const __m512 vf7 = _mm512_scalef_ps(vp7, vn7); - - _mm512_storeu_ps(output, vf0); - _mm512_storeu_ps(output + 16, vf1); - _mm512_storeu_ps(output + 32, vf2); - _mm512_storeu_ps(output + 48, vf3); - _mm512_storeu_ps(output + 64, vf4); - _mm512_storeu_ps(output + 80, vf5); - _mm512_storeu_ps(output + 96, vf6); - 
_mm512_storeu_ps(output + 112, vf7); - output += 128; - - vacc0 = _mm512_add_ps(vacc0, vf0); - vacc0 = _mm512_add_ps(vacc0, vf1); - vacc0 = _mm512_add_ps(vacc0, vf2); - vacc0 = _mm512_add_ps(vacc0, vf3); - vacc0 = _mm512_add_ps(vacc0, vf4); - vacc0 = _mm512_add_ps(vacc0, vf5); - vacc0 = _mm512_add_ps(vacc0, vf6); - vacc0 = _mm512_add_ps(vacc0, vf7); - } - - __m512 vacc = vacc0; - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const __m512 vi = _mm512_loadu_ps(input); - input += 16; - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_storeu_ps(output, vf); - output += 16; - - vacc = _mm512_add_ps(vacc, vf); - } - if (batch != 0) { - // Prepare mask for valid 32-bit batch (depends on batch). 
- batch >>= XNN_LOG2_SIZEOF_FLOAT; - const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - const __m512 vi = _mm512_maskz_loadu_ps(vmask, input); - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_mask_storeu_ps(output, vmask, vf); - - vacc = _mm512_mask_add_ps(vacc, vmask, vacc, vf); - } - *sum = _mm512_reduce_add_ps(vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u144-acc3.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u144-acc3.c deleted file mode 100644 index 520cc4c233b..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u144-acc3.c +++ /dev/null @@ -1,232 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u144_acc3( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f); - const __m512 vminus_ln2 = _mm512_set1_ps(-0x1.62E430p-1f); - const __m512 vc5 = _mm512_set1_ps(0x1.0F9F9Cp-7f); - const __m512 vc4 = _mm512_set1_ps(0x1.573A1Ap-5f); - const __m512 vc3 = _mm512_set1_ps(0x1.555A80p-3f); - const __m512 vc2 = _mm512_set1_ps(0x1.FFFDC6p-2f); - const __m512 vc1 = _mm512_set1_ps(0x1.FFFFF6p-1f); - const __m512 vc0 = _mm512_set1_ps(1.0f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vc0); - - const __m512 vi_max = _mm512_set1_ps(*max); - - __m512 vacc0 = _mm512_setzero_ps(); - __m512 vacc1 = _mm512_setzero_ps(); - __m512 vacc2 = _mm512_setzero_ps(); - for (; batch >= 144 * sizeof(float); batch -= 144 * sizeof(float)) { - const __m512 vi0 = _mm512_loadu_ps(input); - const __m512 vi1 = _mm512_loadu_ps(input + 16); - const __m512 vi2 = _mm512_loadu_ps(input + 32); - const __m512 vi3 = _mm512_loadu_ps(input + 48); - const __m512 vi4 = _mm512_loadu_ps(input + 64); - const __m512 vi5 = _mm512_loadu_ps(input + 80); - const __m512 vi6 = _mm512_loadu_ps(input + 96); - const __m512 vi7 = _mm512_loadu_ps(input + 112); - const __m512 vi8 = _mm512_loadu_ps(input + 128); - input += 144; - - const __m512 vx0 = _mm512_sub_ps(vi0, vi_max); - const __m512 vx1 = _mm512_sub_ps(vi1, vi_max); - 
const __m512 vx2 = _mm512_sub_ps(vi2, vi_max); - const __m512 vx3 = _mm512_sub_ps(vi3, vi_max); - const __m512 vx4 = _mm512_sub_ps(vi4, vi_max); - const __m512 vx5 = _mm512_sub_ps(vi5, vi_max); - const __m512 vx6 = _mm512_sub_ps(vi6, vi_max); - const __m512 vx7 = _mm512_sub_ps(vi7, vi_max); - const __m512 vx8 = _mm512_sub_ps(vi8, vi_max); - - const __m512 vn0 = _mm512_roundscale_ps(_mm512_mul_ps(vx0, vlog2e), 0); - const __m512 vn1 = _mm512_roundscale_ps(_mm512_mul_ps(vx1, vlog2e), 0); - const __m512 vn2 = _mm512_roundscale_ps(_mm512_mul_ps(vx2, vlog2e), 0); - const __m512 vn3 = _mm512_roundscale_ps(_mm512_mul_ps(vx3, vlog2e), 0); - const __m512 vn4 = _mm512_roundscale_ps(_mm512_mul_ps(vx4, vlog2e), 0); - const __m512 vn5 = _mm512_roundscale_ps(_mm512_mul_ps(vx5, vlog2e), 0); - const __m512 vn6 = _mm512_roundscale_ps(_mm512_mul_ps(vx6, vlog2e), 0); - const __m512 vn7 = _mm512_roundscale_ps(_mm512_mul_ps(vx7, vlog2e), 0); - const __m512 vn8 = _mm512_roundscale_ps(_mm512_mul_ps(vx8, vlog2e), 0); - - const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0); - const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1); - const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2); - const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3); - const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4); - const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5); - const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6); - const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7); - const __m512 vt8 = _mm512_fmadd_ps(vn8, vminus_ln2, vx8); - - __m512 vp0 = _mm512_fmadd_ps(vc5, vt0, vc4); - __m512 vp1 = _mm512_fmadd_ps(vc5, vt1, vc4); - __m512 vp2 = _mm512_fmadd_ps(vc5, vt2, vc4); - __m512 vp3 = _mm512_fmadd_ps(vc5, vt3, vc4); - __m512 vp4 = _mm512_fmadd_ps(vc5, vt4, vc4); - __m512 vp5 = _mm512_fmadd_ps(vc5, vt5, vc4); - __m512 vp6 = _mm512_fmadd_ps(vc5, vt6, vc4); - __m512 vp7 = _mm512_fmadd_ps(vc5, vt7, vc4); - __m512 vp8 = _mm512_fmadd_ps(vc5, vt8, vc4); - - vp0 = 
_mm512_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc3); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc3); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc2); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc2); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc1); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc1); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc0); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc0); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc0); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc0); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc0); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc0); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc0); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc0); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc0); - - const __m512 vf0 = _mm512_scalef_ps(vp0, vn0); - const __m512 vf1 = _mm512_scalef_ps(vp1, vn1); - const __m512 vf2 = _mm512_scalef_ps(vp2, vn2); - const __m512 vf3 = _mm512_scalef_ps(vp3, vn3); - const __m512 vf4 = _mm512_scalef_ps(vp4, vn4); - const __m512 vf5 = _mm512_scalef_ps(vp5, vn5); - const __m512 vf6 = _mm512_scalef_ps(vp6, vn6); - const __m512 vf7 = _mm512_scalef_ps(vp7, vn7); - const __m512 vf8 = _mm512_scalef_ps(vp8, vn8); - - _mm512_storeu_ps(output, vf0); - _mm512_storeu_ps(output + 16, vf1); - _mm512_storeu_ps(output + 32, vf2); - 
_mm512_storeu_ps(output + 48, vf3); - _mm512_storeu_ps(output + 64, vf4); - _mm512_storeu_ps(output + 80, vf5); - _mm512_storeu_ps(output + 96, vf6); - _mm512_storeu_ps(output + 112, vf7); - _mm512_storeu_ps(output + 128, vf8); - output += 144; - - vacc0 = _mm512_add_ps(vacc0, vf0); - vacc1 = _mm512_add_ps(vacc1, vf1); - vacc2 = _mm512_add_ps(vacc2, vf2); - vacc0 = _mm512_add_ps(vacc0, vf3); - vacc1 = _mm512_add_ps(vacc1, vf4); - vacc2 = _mm512_add_ps(vacc2, vf5); - vacc0 = _mm512_add_ps(vacc0, vf6); - vacc1 = _mm512_add_ps(vacc1, vf7); - vacc2 = _mm512_add_ps(vacc2, vf8); - } - vacc0 = _mm512_add_ps(vacc0, vacc1); - vacc0 = _mm512_add_ps(vacc0, vacc2); - - __m512 vacc = vacc0; - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const __m512 vi = _mm512_loadu_ps(input); - input += 16; - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_storeu_ps(output, vf); - output += 16; - - vacc = _mm512_add_ps(vacc, vf); - } - if (batch != 0) { - // Prepare mask for valid 32-bit batch (depends on batch). 
- batch >>= XNN_LOG2_SIZEOF_FLOAT; - const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - const __m512 vi = _mm512_maskz_loadu_ps(vmask, input); - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_mask_storeu_ps(output, vmask, vf); - - vacc = _mm512_mask_add_ps(vacc, vmask, vacc, vf); - } - *sum = _mm512_reduce_add_ps(vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u144.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u144.c deleted file mode 100644 index 3687dd4dd3a..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u144.c +++ /dev/null @@ -1,228 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u144( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f); - const __m512 vminus_ln2 = _mm512_set1_ps(-0x1.62E430p-1f); - const __m512 vc5 = _mm512_set1_ps(0x1.0F9F9Cp-7f); - const __m512 vc4 = _mm512_set1_ps(0x1.573A1Ap-5f); - const __m512 vc3 = _mm512_set1_ps(0x1.555A80p-3f); - const __m512 vc2 = _mm512_set1_ps(0x1.FFFDC6p-2f); - const __m512 vc1 = _mm512_set1_ps(0x1.FFFFF6p-1f); - const __m512 vc0 = _mm512_set1_ps(1.0f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vc0); - - const __m512 vi_max = _mm512_set1_ps(*max); - - __m512 vacc0 = _mm512_setzero_ps(); - for (; batch >= 144 * sizeof(float); batch -= 144 * sizeof(float)) { - const __m512 vi0 = _mm512_loadu_ps(input); - const __m512 vi1 = _mm512_loadu_ps(input + 16); - const __m512 vi2 = _mm512_loadu_ps(input + 32); - const __m512 vi3 = _mm512_loadu_ps(input + 48); - const __m512 vi4 = _mm512_loadu_ps(input + 64); - const __m512 vi5 = _mm512_loadu_ps(input + 80); - const __m512 vi6 = _mm512_loadu_ps(input + 96); - const __m512 vi7 = _mm512_loadu_ps(input + 112); - const __m512 vi8 = _mm512_loadu_ps(input + 128); - input += 144; - - const __m512 vx0 = _mm512_sub_ps(vi0, vi_max); - const __m512 vx1 = _mm512_sub_ps(vi1, vi_max); - const __m512 vx2 = _mm512_sub_ps(vi2, vi_max); - const __m512 vx3 = 
_mm512_sub_ps(vi3, vi_max); - const __m512 vx4 = _mm512_sub_ps(vi4, vi_max); - const __m512 vx5 = _mm512_sub_ps(vi5, vi_max); - const __m512 vx6 = _mm512_sub_ps(vi6, vi_max); - const __m512 vx7 = _mm512_sub_ps(vi7, vi_max); - const __m512 vx8 = _mm512_sub_ps(vi8, vi_max); - - const __m512 vn0 = _mm512_roundscale_ps(_mm512_mul_ps(vx0, vlog2e), 0); - const __m512 vn1 = _mm512_roundscale_ps(_mm512_mul_ps(vx1, vlog2e), 0); - const __m512 vn2 = _mm512_roundscale_ps(_mm512_mul_ps(vx2, vlog2e), 0); - const __m512 vn3 = _mm512_roundscale_ps(_mm512_mul_ps(vx3, vlog2e), 0); - const __m512 vn4 = _mm512_roundscale_ps(_mm512_mul_ps(vx4, vlog2e), 0); - const __m512 vn5 = _mm512_roundscale_ps(_mm512_mul_ps(vx5, vlog2e), 0); - const __m512 vn6 = _mm512_roundscale_ps(_mm512_mul_ps(vx6, vlog2e), 0); - const __m512 vn7 = _mm512_roundscale_ps(_mm512_mul_ps(vx7, vlog2e), 0); - const __m512 vn8 = _mm512_roundscale_ps(_mm512_mul_ps(vx8, vlog2e), 0); - - const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0); - const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1); - const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2); - const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3); - const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4); - const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5); - const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6); - const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7); - const __m512 vt8 = _mm512_fmadd_ps(vn8, vminus_ln2, vx8); - - __m512 vp0 = _mm512_fmadd_ps(vc5, vt0, vc4); - __m512 vp1 = _mm512_fmadd_ps(vc5, vt1, vc4); - __m512 vp2 = _mm512_fmadd_ps(vc5, vt2, vc4); - __m512 vp3 = _mm512_fmadd_ps(vc5, vt3, vc4); - __m512 vp4 = _mm512_fmadd_ps(vc5, vt4, vc4); - __m512 vp5 = _mm512_fmadd_ps(vc5, vt5, vc4); - __m512 vp6 = _mm512_fmadd_ps(vc5, vt6, vc4); - __m512 vp7 = _mm512_fmadd_ps(vc5, vt7, vc4); - __m512 vp8 = _mm512_fmadd_ps(vc5, vt8, vc4); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc3); - vp2 = 
_mm512_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc3); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc3); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc2); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc2); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc1); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc1); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc0); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc0); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc0); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc0); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc0); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc0); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc0); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc0); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc0); - - const __m512 vf0 = _mm512_scalef_ps(vp0, vn0); - const __m512 vf1 = _mm512_scalef_ps(vp1, vn1); - const __m512 vf2 = _mm512_scalef_ps(vp2, vn2); - const __m512 vf3 = _mm512_scalef_ps(vp3, vn3); - const __m512 vf4 = _mm512_scalef_ps(vp4, vn4); - const __m512 vf5 = _mm512_scalef_ps(vp5, vn5); - const __m512 vf6 = _mm512_scalef_ps(vp6, vn6); - const __m512 vf7 = _mm512_scalef_ps(vp7, vn7); - const __m512 vf8 = _mm512_scalef_ps(vp8, vn8); - - _mm512_storeu_ps(output, vf0); - _mm512_storeu_ps(output + 16, vf1); - _mm512_storeu_ps(output + 32, vf2); - _mm512_storeu_ps(output + 48, vf3); - _mm512_storeu_ps(output + 64, vf4); - 
_mm512_storeu_ps(output + 80, vf5); - _mm512_storeu_ps(output + 96, vf6); - _mm512_storeu_ps(output + 112, vf7); - _mm512_storeu_ps(output + 128, vf8); - output += 144; - - vacc0 = _mm512_add_ps(vacc0, vf0); - vacc0 = _mm512_add_ps(vacc0, vf1); - vacc0 = _mm512_add_ps(vacc0, vf2); - vacc0 = _mm512_add_ps(vacc0, vf3); - vacc0 = _mm512_add_ps(vacc0, vf4); - vacc0 = _mm512_add_ps(vacc0, vf5); - vacc0 = _mm512_add_ps(vacc0, vf6); - vacc0 = _mm512_add_ps(vacc0, vf7); - vacc0 = _mm512_add_ps(vacc0, vf8); - } - - __m512 vacc = vacc0; - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const __m512 vi = _mm512_loadu_ps(input); - input += 16; - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_storeu_ps(output, vf); - output += 16; - - vacc = _mm512_add_ps(vacc, vf); - } - if (batch != 0) { - // Prepare mask for valid 32-bit batch (depends on batch). 
- batch >>= XNN_LOG2_SIZEOF_FLOAT; - const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - const __m512 vi = _mm512_maskz_loadu_ps(vmask, input); - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_mask_storeu_ps(output, vmask, vf); - - vacc = _mm512_mask_add_ps(vacc, vmask, vacc, vf); - } - *sum = _mm512_reduce_add_ps(vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u16.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u16.c new file mode 100644 index 00000000000..647fc736ba6 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u16.c @@ -0,0 +1,132 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u16( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f); + const __m512 vminus_ln2 = _mm512_set1_ps(-0x1.62E430p-1f); + const __m512 vc5 = _mm512_set1_ps(0x1.0F9F9Cp-7f); + const __m512 vc4 = _mm512_set1_ps(0x1.573A1Ap-5f); + const __m512 vc3 = _mm512_set1_ps(0x1.555A80p-3f); + const __m512 vc2 = _mm512_set1_ps(0x1.FFFDC6p-2f); + const __m512 vc1 = _mm512_set1_ps(0x1.FFFFF6p-1f); + const __m512 vc0 = _mm512_set1_ps(1.0f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vminus_ln2); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vc0); + + const __m512 vi_max = _mm512_set1_ps(*max); + + __m512 vacc0 = _mm512_setzero_ps(); + for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { + const __m512 vi0 = _mm512_loadu_ps(input); + input += 16; + + const __m512 vx0 = _mm512_sub_ps(vi0, vi_max); + + const __m512 vn0 = _mm512_roundscale_ps(_mm512_mul_ps(vx0, vlog2e), 0); + + const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0); + + __m512 vp0 = _mm512_fmadd_ps(vc5, vt0, vc4); + + vp0 = _mm512_fmadd_ps(vp0, vt0, vc3); + + vp0 = _mm512_fmadd_ps(vp0, vt0, vc2); + + vp0 = _mm512_fmadd_ps(vp0, vt0, vc1); + + vp0 = _mm512_fmadd_ps(vp0, vt0, vc0); + + const __m512 vf0 = _mm512_scalef_ps(vp0, vn0); + + _mm512_storeu_ps(output, vf0); + output += 16; + + vacc0 = _mm512_add_ps(vacc0, vf0); + } + + __m512 vacc = vacc0; + for (; batch 
>= 16 * sizeof(float); batch -= 16 * sizeof(float)) { + const __m512 vi = _mm512_loadu_ps(input); + input += 16; + + const __m512 vx = _mm512_sub_ps(vi, vi_max); + + const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); + + const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); + + __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); + vp = _mm512_fmadd_ps(vp, vt, vc3); + vp = _mm512_fmadd_ps(vp, vt, vc2); + vp = _mm512_fmadd_ps(vp, vt, vc1); + vp = _mm512_fmadd_ps(vp, vt, vc0); + + const __m512 vf = _mm512_scalef_ps(vp, vn); + + _mm512_storeu_ps(output, vf); + output += 16; + + vacc = _mm512_add_ps(vacc, vf); + } + if (batch != 0) { + // Prepare mask for valid 32-bit batch (depends on batch). + batch >>= XNN_LOG2_SIZEOF_FLOAT; + const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + const __m512 vi = _mm512_maskz_loadu_ps(vmask, input); + + const __m512 vx = _mm512_sub_ps(vi, vi_max); + + const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); + + const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); + + __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); + vp = _mm512_fmadd_ps(vp, vt, vc3); + vp = _mm512_fmadd_ps(vp, vt, vc2); + vp = _mm512_fmadd_ps(vp, vt, vc1); + vp = _mm512_fmadd_ps(vp, vt, vc0); + + const __m512 vf = _mm512_scalef_ps(vp, vn); + + _mm512_mask_storeu_ps(output, vmask, vf); + + vacc = _mm512_mask_add_ps(vacc, vmask, vacc, vf); + } + *sum = _mm512_reduce_add_ps(vacc); +} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160-acc2.c deleted file mode 100644 index d242a390935..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160-acc2.c +++ /dev/null @@ -1,242 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f); - const __m512 vminus_ln2 = _mm512_set1_ps(-0x1.62E430p-1f); - const __m512 vc5 = _mm512_set1_ps(0x1.0F9F9Cp-7f); - const __m512 vc4 = _mm512_set1_ps(0x1.573A1Ap-5f); - const __m512 vc3 = _mm512_set1_ps(0x1.555A80p-3f); - const __m512 vc2 = _mm512_set1_ps(0x1.FFFDC6p-2f); - const __m512 vc1 = _mm512_set1_ps(0x1.FFFFF6p-1f); - const __m512 vc0 = _mm512_set1_ps(1.0f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vc0); - - const __m512 vi_max = _mm512_set1_ps(*max); - - __m512 vacc0 = _mm512_setzero_ps(); - __m512 vacc1 = _mm512_setzero_ps(); - for (; batch >= 160 * sizeof(float); batch -= 160 * sizeof(float)) { - const __m512 vi0 = _mm512_loadu_ps(input); - const __m512 vi1 = _mm512_loadu_ps(input + 16); - const __m512 vi2 = _mm512_loadu_ps(input + 32); - const __m512 vi3 = _mm512_loadu_ps(input + 48); - const __m512 vi4 = _mm512_loadu_ps(input + 64); - const __m512 vi5 = _mm512_loadu_ps(input + 80); - const __m512 vi6 = 
_mm512_loadu_ps(input + 96); - const __m512 vi7 = _mm512_loadu_ps(input + 112); - const __m512 vi8 = _mm512_loadu_ps(input + 128); - const __m512 vi9 = _mm512_loadu_ps(input + 144); - input += 160; - - const __m512 vx0 = _mm512_sub_ps(vi0, vi_max); - const __m512 vx1 = _mm512_sub_ps(vi1, vi_max); - const __m512 vx2 = _mm512_sub_ps(vi2, vi_max); - const __m512 vx3 = _mm512_sub_ps(vi3, vi_max); - const __m512 vx4 = _mm512_sub_ps(vi4, vi_max); - const __m512 vx5 = _mm512_sub_ps(vi5, vi_max); - const __m512 vx6 = _mm512_sub_ps(vi6, vi_max); - const __m512 vx7 = _mm512_sub_ps(vi7, vi_max); - const __m512 vx8 = _mm512_sub_ps(vi8, vi_max); - const __m512 vx9 = _mm512_sub_ps(vi9, vi_max); - - const __m512 vn0 = _mm512_roundscale_ps(_mm512_mul_ps(vx0, vlog2e), 0); - const __m512 vn1 = _mm512_roundscale_ps(_mm512_mul_ps(vx1, vlog2e), 0); - const __m512 vn2 = _mm512_roundscale_ps(_mm512_mul_ps(vx2, vlog2e), 0); - const __m512 vn3 = _mm512_roundscale_ps(_mm512_mul_ps(vx3, vlog2e), 0); - const __m512 vn4 = _mm512_roundscale_ps(_mm512_mul_ps(vx4, vlog2e), 0); - const __m512 vn5 = _mm512_roundscale_ps(_mm512_mul_ps(vx5, vlog2e), 0); - const __m512 vn6 = _mm512_roundscale_ps(_mm512_mul_ps(vx6, vlog2e), 0); - const __m512 vn7 = _mm512_roundscale_ps(_mm512_mul_ps(vx7, vlog2e), 0); - const __m512 vn8 = _mm512_roundscale_ps(_mm512_mul_ps(vx8, vlog2e), 0); - const __m512 vn9 = _mm512_roundscale_ps(_mm512_mul_ps(vx9, vlog2e), 0); - - const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0); - const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1); - const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2); - const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3); - const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4); - const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5); - const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6); - const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7); - const __m512 vt8 = _mm512_fmadd_ps(vn8, vminus_ln2, vx8); - const __m512 vt9 = 
_mm512_fmadd_ps(vn9, vminus_ln2, vx9); - - __m512 vp0 = _mm512_fmadd_ps(vc5, vt0, vc4); - __m512 vp1 = _mm512_fmadd_ps(vc5, vt1, vc4); - __m512 vp2 = _mm512_fmadd_ps(vc5, vt2, vc4); - __m512 vp3 = _mm512_fmadd_ps(vc5, vt3, vc4); - __m512 vp4 = _mm512_fmadd_ps(vc5, vt4, vc4); - __m512 vp5 = _mm512_fmadd_ps(vc5, vt5, vc4); - __m512 vp6 = _mm512_fmadd_ps(vc5, vt6, vc4); - __m512 vp7 = _mm512_fmadd_ps(vc5, vt7, vc4); - __m512 vp8 = _mm512_fmadd_ps(vc5, vt8, vc4); - __m512 vp9 = _mm512_fmadd_ps(vc5, vt9, vc4); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc3); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc3); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc3); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc2); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc2); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc2); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc1); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc1); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc1); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc0); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc0); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc0); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc0); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc0); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc0); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc0); 
- vp7 = _mm512_fmadd_ps(vp7, vt7, vc0); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc0); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc0); - - const __m512 vf0 = _mm512_scalef_ps(vp0, vn0); - const __m512 vf1 = _mm512_scalef_ps(vp1, vn1); - const __m512 vf2 = _mm512_scalef_ps(vp2, vn2); - const __m512 vf3 = _mm512_scalef_ps(vp3, vn3); - const __m512 vf4 = _mm512_scalef_ps(vp4, vn4); - const __m512 vf5 = _mm512_scalef_ps(vp5, vn5); - const __m512 vf6 = _mm512_scalef_ps(vp6, vn6); - const __m512 vf7 = _mm512_scalef_ps(vp7, vn7); - const __m512 vf8 = _mm512_scalef_ps(vp8, vn8); - const __m512 vf9 = _mm512_scalef_ps(vp9, vn9); - - _mm512_storeu_ps(output, vf0); - _mm512_storeu_ps(output + 16, vf1); - _mm512_storeu_ps(output + 32, vf2); - _mm512_storeu_ps(output + 48, vf3); - _mm512_storeu_ps(output + 64, vf4); - _mm512_storeu_ps(output + 80, vf5); - _mm512_storeu_ps(output + 96, vf6); - _mm512_storeu_ps(output + 112, vf7); - _mm512_storeu_ps(output + 128, vf8); - _mm512_storeu_ps(output + 144, vf9); - output += 160; - - vacc0 = _mm512_add_ps(vacc0, vf0); - vacc1 = _mm512_add_ps(vacc1, vf1); - vacc0 = _mm512_add_ps(vacc0, vf2); - vacc1 = _mm512_add_ps(vacc1, vf3); - vacc0 = _mm512_add_ps(vacc0, vf4); - vacc1 = _mm512_add_ps(vacc1, vf5); - vacc0 = _mm512_add_ps(vacc0, vf6); - vacc1 = _mm512_add_ps(vacc1, vf7); - vacc0 = _mm512_add_ps(vacc0, vf8); - vacc1 = _mm512_add_ps(vacc1, vf9); - } - vacc0 = _mm512_add_ps(vacc0, vacc1); - - __m512 vacc = vacc0; - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const __m512 vi = _mm512_loadu_ps(input); - input += 16; - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, 
vn); - - _mm512_storeu_ps(output, vf); - output += 16; - - vacc = _mm512_add_ps(vacc, vf); - } - if (batch != 0) { - // Prepare mask for valid 32-bit batch (depends on batch). - batch >>= XNN_LOG2_SIZEOF_FLOAT; - const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - const __m512 vi = _mm512_maskz_loadu_ps(vmask, input); - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_mask_storeu_ps(output, vmask, vf); - - vacc = _mm512_mask_add_ps(vacc, vmask, vacc, vf); - } - *sum = _mm512_reduce_add_ps(vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160-acc5.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160-acc5.c deleted file mode 100644 index 9f0afe6b0b3..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160-acc5.c +++ /dev/null @@ -1,248 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160_acc5( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f); - const __m512 vminus_ln2 = _mm512_set1_ps(-0x1.62E430p-1f); - const __m512 vc5 = _mm512_set1_ps(0x1.0F9F9Cp-7f); - const __m512 vc4 = _mm512_set1_ps(0x1.573A1Ap-5f); - const __m512 vc3 = _mm512_set1_ps(0x1.555A80p-3f); - const __m512 vc2 = _mm512_set1_ps(0x1.FFFDC6p-2f); - const __m512 vc1 = _mm512_set1_ps(0x1.FFFFF6p-1f); - const __m512 vc0 = _mm512_set1_ps(1.0f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vc0); - - const __m512 vi_max = _mm512_set1_ps(*max); - - __m512 vacc0 = _mm512_setzero_ps(); - __m512 vacc1 = _mm512_setzero_ps(); - __m512 vacc2 = _mm512_setzero_ps(); - __m512 vacc3 = _mm512_setzero_ps(); - __m512 vacc4 = _mm512_setzero_ps(); - for (; batch >= 160 * sizeof(float); batch -= 160 * sizeof(float)) { - const __m512 vi0 = _mm512_loadu_ps(input); - const __m512 vi1 = _mm512_loadu_ps(input + 16); - const __m512 vi2 = _mm512_loadu_ps(input + 32); - const __m512 vi3 = _mm512_loadu_ps(input + 48); - const __m512 vi4 = _mm512_loadu_ps(input + 64); - const __m512 vi5 = _mm512_loadu_ps(input + 80); - const __m512 vi6 = _mm512_loadu_ps(input + 96); - const __m512 vi7 = _mm512_loadu_ps(input + 112); - const __m512 vi8 = _mm512_loadu_ps(input + 128); - const __m512 vi9 = _mm512_loadu_ps(input + 
144); - input += 160; - - const __m512 vx0 = _mm512_sub_ps(vi0, vi_max); - const __m512 vx1 = _mm512_sub_ps(vi1, vi_max); - const __m512 vx2 = _mm512_sub_ps(vi2, vi_max); - const __m512 vx3 = _mm512_sub_ps(vi3, vi_max); - const __m512 vx4 = _mm512_sub_ps(vi4, vi_max); - const __m512 vx5 = _mm512_sub_ps(vi5, vi_max); - const __m512 vx6 = _mm512_sub_ps(vi6, vi_max); - const __m512 vx7 = _mm512_sub_ps(vi7, vi_max); - const __m512 vx8 = _mm512_sub_ps(vi8, vi_max); - const __m512 vx9 = _mm512_sub_ps(vi9, vi_max); - - const __m512 vn0 = _mm512_roundscale_ps(_mm512_mul_ps(vx0, vlog2e), 0); - const __m512 vn1 = _mm512_roundscale_ps(_mm512_mul_ps(vx1, vlog2e), 0); - const __m512 vn2 = _mm512_roundscale_ps(_mm512_mul_ps(vx2, vlog2e), 0); - const __m512 vn3 = _mm512_roundscale_ps(_mm512_mul_ps(vx3, vlog2e), 0); - const __m512 vn4 = _mm512_roundscale_ps(_mm512_mul_ps(vx4, vlog2e), 0); - const __m512 vn5 = _mm512_roundscale_ps(_mm512_mul_ps(vx5, vlog2e), 0); - const __m512 vn6 = _mm512_roundscale_ps(_mm512_mul_ps(vx6, vlog2e), 0); - const __m512 vn7 = _mm512_roundscale_ps(_mm512_mul_ps(vx7, vlog2e), 0); - const __m512 vn8 = _mm512_roundscale_ps(_mm512_mul_ps(vx8, vlog2e), 0); - const __m512 vn9 = _mm512_roundscale_ps(_mm512_mul_ps(vx9, vlog2e), 0); - - const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0); - const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1); - const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2); - const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3); - const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4); - const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5); - const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6); - const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7); - const __m512 vt8 = _mm512_fmadd_ps(vn8, vminus_ln2, vx8); - const __m512 vt9 = _mm512_fmadd_ps(vn9, vminus_ln2, vx9); - - __m512 vp0 = _mm512_fmadd_ps(vc5, vt0, vc4); - __m512 vp1 = _mm512_fmadd_ps(vc5, vt1, vc4); - __m512 vp2 = _mm512_fmadd_ps(vc5, vt2, 
vc4); - __m512 vp3 = _mm512_fmadd_ps(vc5, vt3, vc4); - __m512 vp4 = _mm512_fmadd_ps(vc5, vt4, vc4); - __m512 vp5 = _mm512_fmadd_ps(vc5, vt5, vc4); - __m512 vp6 = _mm512_fmadd_ps(vc5, vt6, vc4); - __m512 vp7 = _mm512_fmadd_ps(vc5, vt7, vc4); - __m512 vp8 = _mm512_fmadd_ps(vc5, vt8, vc4); - __m512 vp9 = _mm512_fmadd_ps(vc5, vt9, vc4); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc3); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc3); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc3); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc2); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc2); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc2); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc1); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc1); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc1); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc0); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc0); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc0); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc0); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc0); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc0); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc0); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc0); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc0); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc0); - - const __m512 vf0 = _mm512_scalef_ps(vp0, vn0); - 
const __m512 vf1 = _mm512_scalef_ps(vp1, vn1); - const __m512 vf2 = _mm512_scalef_ps(vp2, vn2); - const __m512 vf3 = _mm512_scalef_ps(vp3, vn3); - const __m512 vf4 = _mm512_scalef_ps(vp4, vn4); - const __m512 vf5 = _mm512_scalef_ps(vp5, vn5); - const __m512 vf6 = _mm512_scalef_ps(vp6, vn6); - const __m512 vf7 = _mm512_scalef_ps(vp7, vn7); - const __m512 vf8 = _mm512_scalef_ps(vp8, vn8); - const __m512 vf9 = _mm512_scalef_ps(vp9, vn9); - - _mm512_storeu_ps(output, vf0); - _mm512_storeu_ps(output + 16, vf1); - _mm512_storeu_ps(output + 32, vf2); - _mm512_storeu_ps(output + 48, vf3); - _mm512_storeu_ps(output + 64, vf4); - _mm512_storeu_ps(output + 80, vf5); - _mm512_storeu_ps(output + 96, vf6); - _mm512_storeu_ps(output + 112, vf7); - _mm512_storeu_ps(output + 128, vf8); - _mm512_storeu_ps(output + 144, vf9); - output += 160; - - vacc0 = _mm512_add_ps(vacc0, vf0); - vacc1 = _mm512_add_ps(vacc1, vf1); - vacc2 = _mm512_add_ps(vacc2, vf2); - vacc3 = _mm512_add_ps(vacc3, vf3); - vacc4 = _mm512_add_ps(vacc4, vf4); - vacc0 = _mm512_add_ps(vacc0, vf5); - vacc1 = _mm512_add_ps(vacc1, vf6); - vacc2 = _mm512_add_ps(vacc2, vf7); - vacc3 = _mm512_add_ps(vacc3, vf8); - vacc4 = _mm512_add_ps(vacc4, vf9); - } - vacc0 = _mm512_add_ps(vacc0, vacc1); - vacc2 = _mm512_add_ps(vacc2, vacc3); - vacc0 = _mm512_add_ps(vacc0, vacc2); - vacc0 = _mm512_add_ps(vacc0, vacc4); - - __m512 vacc = vacc0; - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const __m512 vi = _mm512_loadu_ps(input); - input += 16; - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_storeu_ps(output, vf); - output += 16; - - 
vacc = _mm512_add_ps(vacc, vf); - } - if (batch != 0) { - // Prepare mask for valid 32-bit batch (depends on batch). - batch >>= XNN_LOG2_SIZEOF_FLOAT; - const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - const __m512 vi = _mm512_maskz_loadu_ps(vmask, input); - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_mask_storeu_ps(output, vmask, vf); - - vacc = _mm512_mask_add_ps(vacc, vmask, vacc, vf); - } - *sum = _mm512_reduce_add_ps(vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160.c deleted file mode 100644 index 38a35fc01f1..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u160.c +++ /dev/null @@ -1,240 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f); - const __m512 vminus_ln2 = _mm512_set1_ps(-0x1.62E430p-1f); - const __m512 vc5 = _mm512_set1_ps(0x1.0F9F9Cp-7f); - const __m512 vc4 = _mm512_set1_ps(0x1.573A1Ap-5f); - const __m512 vc3 = _mm512_set1_ps(0x1.555A80p-3f); - const __m512 vc2 = _mm512_set1_ps(0x1.FFFDC6p-2f); - const __m512 vc1 = _mm512_set1_ps(0x1.FFFFF6p-1f); - const __m512 vc0 = _mm512_set1_ps(1.0f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vc0); - - const __m512 vi_max = _mm512_set1_ps(*max); - - __m512 vacc0 = _mm512_setzero_ps(); - for (; batch >= 160 * sizeof(float); batch -= 160 * sizeof(float)) { - const __m512 vi0 = _mm512_loadu_ps(input); - const __m512 vi1 = _mm512_loadu_ps(input + 16); - const __m512 vi2 = _mm512_loadu_ps(input + 32); - const __m512 vi3 = _mm512_loadu_ps(input + 48); - const __m512 vi4 = _mm512_loadu_ps(input + 64); - const __m512 vi5 = _mm512_loadu_ps(input + 80); - const __m512 vi6 = _mm512_loadu_ps(input + 96); - const __m512 vi7 = _mm512_loadu_ps(input + 112); - const __m512 vi8 = _mm512_loadu_ps(input + 128); - const __m512 vi9 = _mm512_loadu_ps(input + 144); - input += 160; - - const __m512 vx0 = _mm512_sub_ps(vi0, vi_max); - const __m512 vx1 = _mm512_sub_ps(vi1, vi_max); - const __m512 vx2 = 
_mm512_sub_ps(vi2, vi_max); - const __m512 vx3 = _mm512_sub_ps(vi3, vi_max); - const __m512 vx4 = _mm512_sub_ps(vi4, vi_max); - const __m512 vx5 = _mm512_sub_ps(vi5, vi_max); - const __m512 vx6 = _mm512_sub_ps(vi6, vi_max); - const __m512 vx7 = _mm512_sub_ps(vi7, vi_max); - const __m512 vx8 = _mm512_sub_ps(vi8, vi_max); - const __m512 vx9 = _mm512_sub_ps(vi9, vi_max); - - const __m512 vn0 = _mm512_roundscale_ps(_mm512_mul_ps(vx0, vlog2e), 0); - const __m512 vn1 = _mm512_roundscale_ps(_mm512_mul_ps(vx1, vlog2e), 0); - const __m512 vn2 = _mm512_roundscale_ps(_mm512_mul_ps(vx2, vlog2e), 0); - const __m512 vn3 = _mm512_roundscale_ps(_mm512_mul_ps(vx3, vlog2e), 0); - const __m512 vn4 = _mm512_roundscale_ps(_mm512_mul_ps(vx4, vlog2e), 0); - const __m512 vn5 = _mm512_roundscale_ps(_mm512_mul_ps(vx5, vlog2e), 0); - const __m512 vn6 = _mm512_roundscale_ps(_mm512_mul_ps(vx6, vlog2e), 0); - const __m512 vn7 = _mm512_roundscale_ps(_mm512_mul_ps(vx7, vlog2e), 0); - const __m512 vn8 = _mm512_roundscale_ps(_mm512_mul_ps(vx8, vlog2e), 0); - const __m512 vn9 = _mm512_roundscale_ps(_mm512_mul_ps(vx9, vlog2e), 0); - - const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0); - const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1); - const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2); - const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3); - const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4); - const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5); - const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6); - const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7); - const __m512 vt8 = _mm512_fmadd_ps(vn8, vminus_ln2, vx8); - const __m512 vt9 = _mm512_fmadd_ps(vn9, vminus_ln2, vx9); - - __m512 vp0 = _mm512_fmadd_ps(vc5, vt0, vc4); - __m512 vp1 = _mm512_fmadd_ps(vc5, vt1, vc4); - __m512 vp2 = _mm512_fmadd_ps(vc5, vt2, vc4); - __m512 vp3 = _mm512_fmadd_ps(vc5, vt3, vc4); - __m512 vp4 = _mm512_fmadd_ps(vc5, vt4, vc4); - __m512 vp5 = _mm512_fmadd_ps(vc5, vt5, vc4); 
- __m512 vp6 = _mm512_fmadd_ps(vc5, vt6, vc4); - __m512 vp7 = _mm512_fmadd_ps(vc5, vt7, vc4); - __m512 vp8 = _mm512_fmadd_ps(vc5, vt8, vc4); - __m512 vp9 = _mm512_fmadd_ps(vc5, vt9, vc4); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc3); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc3); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc3); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc2); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc2); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc2); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc1); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc1); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc1); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc0); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc0); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc0); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc0); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc0); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc0); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc0); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc0); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc0); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc0); - - const __m512 vf0 = _mm512_scalef_ps(vp0, vn0); - const __m512 vf1 = _mm512_scalef_ps(vp1, vn1); - const __m512 vf2 = _mm512_scalef_ps(vp2, vn2); - const __m512 vf3 = _mm512_scalef_ps(vp3, vn3); - 
const __m512 vf4 = _mm512_scalef_ps(vp4, vn4); - const __m512 vf5 = _mm512_scalef_ps(vp5, vn5); - const __m512 vf6 = _mm512_scalef_ps(vp6, vn6); - const __m512 vf7 = _mm512_scalef_ps(vp7, vn7); - const __m512 vf8 = _mm512_scalef_ps(vp8, vn8); - const __m512 vf9 = _mm512_scalef_ps(vp9, vn9); - - _mm512_storeu_ps(output, vf0); - _mm512_storeu_ps(output + 16, vf1); - _mm512_storeu_ps(output + 32, vf2); - _mm512_storeu_ps(output + 48, vf3); - _mm512_storeu_ps(output + 64, vf4); - _mm512_storeu_ps(output + 80, vf5); - _mm512_storeu_ps(output + 96, vf6); - _mm512_storeu_ps(output + 112, vf7); - _mm512_storeu_ps(output + 128, vf8); - _mm512_storeu_ps(output + 144, vf9); - output += 160; - - vacc0 = _mm512_add_ps(vacc0, vf0); - vacc0 = _mm512_add_ps(vacc0, vf1); - vacc0 = _mm512_add_ps(vacc0, vf2); - vacc0 = _mm512_add_ps(vacc0, vf3); - vacc0 = _mm512_add_ps(vacc0, vf4); - vacc0 = _mm512_add_ps(vacc0, vf5); - vacc0 = _mm512_add_ps(vacc0, vf6); - vacc0 = _mm512_add_ps(vacc0, vf7); - vacc0 = _mm512_add_ps(vacc0, vf8); - vacc0 = _mm512_add_ps(vacc0, vf9); - } - - __m512 vacc = vacc0; - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const __m512 vi = _mm512_loadu_ps(input); - input += 16; - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_storeu_ps(output, vf); - output += 16; - - vacc = _mm512_add_ps(vacc, vf); - } - if (batch != 0) { - // Prepare mask for valid 32-bit batch (depends on batch). 
- batch >>= XNN_LOG2_SIZEOF_FLOAT; - const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - const __m512 vi = _mm512_maskz_loadu_ps(vmask, input); - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_mask_storeu_ps(output, vmask, vf); - - vacc = _mm512_mask_add_ps(vacc, vmask, vacc, vf); - } - *sum = _mm512_reduce_add_ps(vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc2.c deleted file mode 100644 index 328228324bc..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc2.c +++ /dev/null @@ -1,266 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f); - const __m512 vminus_ln2 = _mm512_set1_ps(-0x1.62E430p-1f); - const __m512 vc5 = _mm512_set1_ps(0x1.0F9F9Cp-7f); - const __m512 vc4 = _mm512_set1_ps(0x1.573A1Ap-5f); - const __m512 vc3 = _mm512_set1_ps(0x1.555A80p-3f); - const __m512 vc2 = _mm512_set1_ps(0x1.FFFDC6p-2f); - const __m512 vc1 = _mm512_set1_ps(0x1.FFFFF6p-1f); - const __m512 vc0 = _mm512_set1_ps(1.0f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vc0); - - const __m512 vi_max = _mm512_set1_ps(*max); - - __m512 vacc0 = _mm512_setzero_ps(); - __m512 vacc1 = _mm512_setzero_ps(); - for (; batch >= 192 * sizeof(float); batch -= 192 * sizeof(float)) { - const __m512 vi0 = _mm512_loadu_ps(input); - const __m512 vi1 = _mm512_loadu_ps(input + 16); - const __m512 vi2 = _mm512_loadu_ps(input + 32); - const __m512 vi3 = _mm512_loadu_ps(input + 48); - const __m512 vi4 = _mm512_loadu_ps(input + 64); - const __m512 vi5 = _mm512_loadu_ps(input + 80); - const __m512 vi6 = _mm512_loadu_ps(input + 96); - const __m512 vi7 = _mm512_loadu_ps(input + 112); - const __m512 vi8 = _mm512_loadu_ps(input + 128); - const __m512 vi9 = _mm512_loadu_ps(input + 144); - const __m512 vi10 = _mm512_loadu_ps(input + 160); - const __m512 vi11 = _mm512_loadu_ps(input + 176); - 
input += 192; - - const __m512 vx0 = _mm512_sub_ps(vi0, vi_max); - const __m512 vx1 = _mm512_sub_ps(vi1, vi_max); - const __m512 vx2 = _mm512_sub_ps(vi2, vi_max); - const __m512 vx3 = _mm512_sub_ps(vi3, vi_max); - const __m512 vx4 = _mm512_sub_ps(vi4, vi_max); - const __m512 vx5 = _mm512_sub_ps(vi5, vi_max); - const __m512 vx6 = _mm512_sub_ps(vi6, vi_max); - const __m512 vx7 = _mm512_sub_ps(vi7, vi_max); - const __m512 vx8 = _mm512_sub_ps(vi8, vi_max); - const __m512 vx9 = _mm512_sub_ps(vi9, vi_max); - const __m512 vx10 = _mm512_sub_ps(vi10, vi_max); - const __m512 vx11 = _mm512_sub_ps(vi11, vi_max); - - const __m512 vn0 = _mm512_roundscale_ps(_mm512_mul_ps(vx0, vlog2e), 0); - const __m512 vn1 = _mm512_roundscale_ps(_mm512_mul_ps(vx1, vlog2e), 0); - const __m512 vn2 = _mm512_roundscale_ps(_mm512_mul_ps(vx2, vlog2e), 0); - const __m512 vn3 = _mm512_roundscale_ps(_mm512_mul_ps(vx3, vlog2e), 0); - const __m512 vn4 = _mm512_roundscale_ps(_mm512_mul_ps(vx4, vlog2e), 0); - const __m512 vn5 = _mm512_roundscale_ps(_mm512_mul_ps(vx5, vlog2e), 0); - const __m512 vn6 = _mm512_roundscale_ps(_mm512_mul_ps(vx6, vlog2e), 0); - const __m512 vn7 = _mm512_roundscale_ps(_mm512_mul_ps(vx7, vlog2e), 0); - const __m512 vn8 = _mm512_roundscale_ps(_mm512_mul_ps(vx8, vlog2e), 0); - const __m512 vn9 = _mm512_roundscale_ps(_mm512_mul_ps(vx9, vlog2e), 0); - const __m512 vn10 = _mm512_roundscale_ps(_mm512_mul_ps(vx10, vlog2e), 0); - const __m512 vn11 = _mm512_roundscale_ps(_mm512_mul_ps(vx11, vlog2e), 0); - - const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0); - const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1); - const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2); - const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3); - const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4); - const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5); - const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6); - const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7); - const __m512 
vt8 = _mm512_fmadd_ps(vn8, vminus_ln2, vx8); - const __m512 vt9 = _mm512_fmadd_ps(vn9, vminus_ln2, vx9); - const __m512 vt10 = _mm512_fmadd_ps(vn10, vminus_ln2, vx10); - const __m512 vt11 = _mm512_fmadd_ps(vn11, vminus_ln2, vx11); - - __m512 vp0 = _mm512_fmadd_ps(vc5, vt0, vc4); - __m512 vp1 = _mm512_fmadd_ps(vc5, vt1, vc4); - __m512 vp2 = _mm512_fmadd_ps(vc5, vt2, vc4); - __m512 vp3 = _mm512_fmadd_ps(vc5, vt3, vc4); - __m512 vp4 = _mm512_fmadd_ps(vc5, vt4, vc4); - __m512 vp5 = _mm512_fmadd_ps(vc5, vt5, vc4); - __m512 vp6 = _mm512_fmadd_ps(vc5, vt6, vc4); - __m512 vp7 = _mm512_fmadd_ps(vc5, vt7, vc4); - __m512 vp8 = _mm512_fmadd_ps(vc5, vt8, vc4); - __m512 vp9 = _mm512_fmadd_ps(vc5, vt9, vc4); - __m512 vp10 = _mm512_fmadd_ps(vc5, vt10, vc4); - __m512 vp11 = _mm512_fmadd_ps(vc5, vt11, vc4); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc3); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc3); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc3); - vp10 = _mm512_fmadd_ps(vp10, vt10, vc3); - vp11 = _mm512_fmadd_ps(vp11, vt11, vc3); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc2); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc2); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc2); - vp10 = _mm512_fmadd_ps(vp10, vt10, vc2); - vp11 = _mm512_fmadd_ps(vp11, vt11, vc2); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc1); - vp5 = 
_mm512_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc1); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc1); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc1); - vp10 = _mm512_fmadd_ps(vp10, vt10, vc1); - vp11 = _mm512_fmadd_ps(vp11, vt11, vc1); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc0); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc0); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc0); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc0); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc0); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc0); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc0); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc0); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc0); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc0); - vp10 = _mm512_fmadd_ps(vp10, vt10, vc0); - vp11 = _mm512_fmadd_ps(vp11, vt11, vc0); - - const __m512 vf0 = _mm512_scalef_ps(vp0, vn0); - const __m512 vf1 = _mm512_scalef_ps(vp1, vn1); - const __m512 vf2 = _mm512_scalef_ps(vp2, vn2); - const __m512 vf3 = _mm512_scalef_ps(vp3, vn3); - const __m512 vf4 = _mm512_scalef_ps(vp4, vn4); - const __m512 vf5 = _mm512_scalef_ps(vp5, vn5); - const __m512 vf6 = _mm512_scalef_ps(vp6, vn6); - const __m512 vf7 = _mm512_scalef_ps(vp7, vn7); - const __m512 vf8 = _mm512_scalef_ps(vp8, vn8); - const __m512 vf9 = _mm512_scalef_ps(vp9, vn9); - const __m512 vf10 = _mm512_scalef_ps(vp10, vn10); - const __m512 vf11 = _mm512_scalef_ps(vp11, vn11); - - _mm512_storeu_ps(output, vf0); - _mm512_storeu_ps(output + 16, vf1); - _mm512_storeu_ps(output + 32, vf2); - _mm512_storeu_ps(output + 48, vf3); - _mm512_storeu_ps(output + 64, vf4); - _mm512_storeu_ps(output + 80, vf5); - _mm512_storeu_ps(output + 96, vf6); - _mm512_storeu_ps(output + 112, vf7); - _mm512_storeu_ps(output + 128, vf8); - _mm512_storeu_ps(output + 144, vf9); - _mm512_storeu_ps(output + 160, vf10); - _mm512_storeu_ps(output + 176, vf11); - output += 192; - - vacc0 = _mm512_add_ps(vacc0, vf0); - vacc1 = _mm512_add_ps(vacc1, vf1); - vacc0 = _mm512_add_ps(vacc0, vf2); - vacc1 = _mm512_add_ps(vacc1, vf3); - vacc0 = 
_mm512_add_ps(vacc0, vf4); - vacc1 = _mm512_add_ps(vacc1, vf5); - vacc0 = _mm512_add_ps(vacc0, vf6); - vacc1 = _mm512_add_ps(vacc1, vf7); - vacc0 = _mm512_add_ps(vacc0, vf8); - vacc1 = _mm512_add_ps(vacc1, vf9); - vacc0 = _mm512_add_ps(vacc0, vf10); - vacc1 = _mm512_add_ps(vacc1, vf11); - } - vacc0 = _mm512_add_ps(vacc0, vacc1); - - __m512 vacc = vacc0; - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const __m512 vi = _mm512_loadu_ps(input); - input += 16; - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_storeu_ps(output, vf); - output += 16; - - vacc = _mm512_add_ps(vacc, vf); - } - if (batch != 0) { - // Prepare mask for valid 32-bit batch (depends on batch). 
- batch >>= XNN_LOG2_SIZEOF_FLOAT; - const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - const __m512 vi = _mm512_maskz_loadu_ps(vmask, input); - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_mask_storeu_ps(output, vmask, vf); - - vacc = _mm512_mask_add_ps(vacc, vmask, vacc, vf); - } - *sum = _mm512_reduce_add_ps(vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc3.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc3.c deleted file mode 100644 index 4032fa698eb..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc3.c +++ /dev/null @@ -1,268 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc3( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f); - const __m512 vminus_ln2 = _mm512_set1_ps(-0x1.62E430p-1f); - const __m512 vc5 = _mm512_set1_ps(0x1.0F9F9Cp-7f); - const __m512 vc4 = _mm512_set1_ps(0x1.573A1Ap-5f); - const __m512 vc3 = _mm512_set1_ps(0x1.555A80p-3f); - const __m512 vc2 = _mm512_set1_ps(0x1.FFFDC6p-2f); - const __m512 vc1 = _mm512_set1_ps(0x1.FFFFF6p-1f); - const __m512 vc0 = _mm512_set1_ps(1.0f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vc0); - - const __m512 vi_max = _mm512_set1_ps(*max); - - __m512 vacc0 = _mm512_setzero_ps(); - __m512 vacc1 = _mm512_setzero_ps(); - __m512 vacc2 = _mm512_setzero_ps(); - for (; batch >= 192 * sizeof(float); batch -= 192 * sizeof(float)) { - const __m512 vi0 = _mm512_loadu_ps(input); - const __m512 vi1 = _mm512_loadu_ps(input + 16); - const __m512 vi2 = _mm512_loadu_ps(input + 32); - const __m512 vi3 = _mm512_loadu_ps(input + 48); - const __m512 vi4 = _mm512_loadu_ps(input + 64); - const __m512 vi5 = _mm512_loadu_ps(input + 80); - const __m512 vi6 = _mm512_loadu_ps(input + 96); - const __m512 vi7 = _mm512_loadu_ps(input + 112); - const __m512 vi8 = _mm512_loadu_ps(input + 128); - const __m512 vi9 = _mm512_loadu_ps(input + 144); - const __m512 vi10 = _mm512_loadu_ps(input + 160); - const __m512 
vi11 = _mm512_loadu_ps(input + 176); - input += 192; - - const __m512 vx0 = _mm512_sub_ps(vi0, vi_max); - const __m512 vx1 = _mm512_sub_ps(vi1, vi_max); - const __m512 vx2 = _mm512_sub_ps(vi2, vi_max); - const __m512 vx3 = _mm512_sub_ps(vi3, vi_max); - const __m512 vx4 = _mm512_sub_ps(vi4, vi_max); - const __m512 vx5 = _mm512_sub_ps(vi5, vi_max); - const __m512 vx6 = _mm512_sub_ps(vi6, vi_max); - const __m512 vx7 = _mm512_sub_ps(vi7, vi_max); - const __m512 vx8 = _mm512_sub_ps(vi8, vi_max); - const __m512 vx9 = _mm512_sub_ps(vi9, vi_max); - const __m512 vx10 = _mm512_sub_ps(vi10, vi_max); - const __m512 vx11 = _mm512_sub_ps(vi11, vi_max); - - const __m512 vn0 = _mm512_roundscale_ps(_mm512_mul_ps(vx0, vlog2e), 0); - const __m512 vn1 = _mm512_roundscale_ps(_mm512_mul_ps(vx1, vlog2e), 0); - const __m512 vn2 = _mm512_roundscale_ps(_mm512_mul_ps(vx2, vlog2e), 0); - const __m512 vn3 = _mm512_roundscale_ps(_mm512_mul_ps(vx3, vlog2e), 0); - const __m512 vn4 = _mm512_roundscale_ps(_mm512_mul_ps(vx4, vlog2e), 0); - const __m512 vn5 = _mm512_roundscale_ps(_mm512_mul_ps(vx5, vlog2e), 0); - const __m512 vn6 = _mm512_roundscale_ps(_mm512_mul_ps(vx6, vlog2e), 0); - const __m512 vn7 = _mm512_roundscale_ps(_mm512_mul_ps(vx7, vlog2e), 0); - const __m512 vn8 = _mm512_roundscale_ps(_mm512_mul_ps(vx8, vlog2e), 0); - const __m512 vn9 = _mm512_roundscale_ps(_mm512_mul_ps(vx9, vlog2e), 0); - const __m512 vn10 = _mm512_roundscale_ps(_mm512_mul_ps(vx10, vlog2e), 0); - const __m512 vn11 = _mm512_roundscale_ps(_mm512_mul_ps(vx11, vlog2e), 0); - - const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0); - const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1); - const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2); - const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3); - const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4); - const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5); - const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6); - const __m512 vt7 = 
_mm512_fmadd_ps(vn7, vminus_ln2, vx7); - const __m512 vt8 = _mm512_fmadd_ps(vn8, vminus_ln2, vx8); - const __m512 vt9 = _mm512_fmadd_ps(vn9, vminus_ln2, vx9); - const __m512 vt10 = _mm512_fmadd_ps(vn10, vminus_ln2, vx10); - const __m512 vt11 = _mm512_fmadd_ps(vn11, vminus_ln2, vx11); - - __m512 vp0 = _mm512_fmadd_ps(vc5, vt0, vc4); - __m512 vp1 = _mm512_fmadd_ps(vc5, vt1, vc4); - __m512 vp2 = _mm512_fmadd_ps(vc5, vt2, vc4); - __m512 vp3 = _mm512_fmadd_ps(vc5, vt3, vc4); - __m512 vp4 = _mm512_fmadd_ps(vc5, vt4, vc4); - __m512 vp5 = _mm512_fmadd_ps(vc5, vt5, vc4); - __m512 vp6 = _mm512_fmadd_ps(vc5, vt6, vc4); - __m512 vp7 = _mm512_fmadd_ps(vc5, vt7, vc4); - __m512 vp8 = _mm512_fmadd_ps(vc5, vt8, vc4); - __m512 vp9 = _mm512_fmadd_ps(vc5, vt9, vc4); - __m512 vp10 = _mm512_fmadd_ps(vc5, vt10, vc4); - __m512 vp11 = _mm512_fmadd_ps(vc5, vt11, vc4); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc3); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc3); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc3); - vp10 = _mm512_fmadd_ps(vp10, vt10, vc3); - vp11 = _mm512_fmadd_ps(vp11, vt11, vc3); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc2); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc2); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc2); - vp10 = _mm512_fmadd_ps(vp10, vt10, vc2); - vp11 = _mm512_fmadd_ps(vp11, vt11, vc2); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc1); - vp4 
= _mm512_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc1); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc1); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc1); - vp10 = _mm512_fmadd_ps(vp10, vt10, vc1); - vp11 = _mm512_fmadd_ps(vp11, vt11, vc1); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc0); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc0); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc0); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc0); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc0); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc0); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc0); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc0); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc0); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc0); - vp10 = _mm512_fmadd_ps(vp10, vt10, vc0); - vp11 = _mm512_fmadd_ps(vp11, vt11, vc0); - - const __m512 vf0 = _mm512_scalef_ps(vp0, vn0); - const __m512 vf1 = _mm512_scalef_ps(vp1, vn1); - const __m512 vf2 = _mm512_scalef_ps(vp2, vn2); - const __m512 vf3 = _mm512_scalef_ps(vp3, vn3); - const __m512 vf4 = _mm512_scalef_ps(vp4, vn4); - const __m512 vf5 = _mm512_scalef_ps(vp5, vn5); - const __m512 vf6 = _mm512_scalef_ps(vp6, vn6); - const __m512 vf7 = _mm512_scalef_ps(vp7, vn7); - const __m512 vf8 = _mm512_scalef_ps(vp8, vn8); - const __m512 vf9 = _mm512_scalef_ps(vp9, vn9); - const __m512 vf10 = _mm512_scalef_ps(vp10, vn10); - const __m512 vf11 = _mm512_scalef_ps(vp11, vn11); - - _mm512_storeu_ps(output, vf0); - _mm512_storeu_ps(output + 16, vf1); - _mm512_storeu_ps(output + 32, vf2); - _mm512_storeu_ps(output + 48, vf3); - _mm512_storeu_ps(output + 64, vf4); - _mm512_storeu_ps(output + 80, vf5); - _mm512_storeu_ps(output + 96, vf6); - _mm512_storeu_ps(output + 112, vf7); - _mm512_storeu_ps(output + 128, vf8); - _mm512_storeu_ps(output + 144, vf9); - _mm512_storeu_ps(output + 160, vf10); - _mm512_storeu_ps(output + 176, vf11); - output += 192; - - vacc0 = _mm512_add_ps(vacc0, vf0); - vacc1 = _mm512_add_ps(vacc1, vf1); - vacc2 = _mm512_add_ps(vacc2, vf2); - 
vacc0 = _mm512_add_ps(vacc0, vf3); - vacc1 = _mm512_add_ps(vacc1, vf4); - vacc2 = _mm512_add_ps(vacc2, vf5); - vacc0 = _mm512_add_ps(vacc0, vf6); - vacc1 = _mm512_add_ps(vacc1, vf7); - vacc2 = _mm512_add_ps(vacc2, vf8); - vacc0 = _mm512_add_ps(vacc0, vf9); - vacc1 = _mm512_add_ps(vacc1, vf10); - vacc2 = _mm512_add_ps(vacc2, vf11); - } - vacc0 = _mm512_add_ps(vacc0, vacc1); - vacc0 = _mm512_add_ps(vacc0, vacc2); - - __m512 vacc = vacc0; - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const __m512 vi = _mm512_loadu_ps(input); - input += 16; - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_storeu_ps(output, vf); - output += 16; - - vacc = _mm512_add_ps(vacc, vf); - } - if (batch != 0) { - // Prepare mask for valid 32-bit batch (depends on batch). 
- batch >>= XNN_LOG2_SIZEOF_FLOAT; - const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - const __m512 vi = _mm512_maskz_loadu_ps(vmask, input); - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_mask_storeu_ps(output, vmask, vf); - - vacc = _mm512_mask_add_ps(vacc, vmask, vacc, vf); - } - *sum = _mm512_reduce_add_ps(vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc6.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc6.c deleted file mode 100644 index e68a4dee96e..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192-acc6.c +++ /dev/null @@ -1,274 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc6( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f); - const __m512 vminus_ln2 = _mm512_set1_ps(-0x1.62E430p-1f); - const __m512 vc5 = _mm512_set1_ps(0x1.0F9F9Cp-7f); - const __m512 vc4 = _mm512_set1_ps(0x1.573A1Ap-5f); - const __m512 vc3 = _mm512_set1_ps(0x1.555A80p-3f); - const __m512 vc2 = _mm512_set1_ps(0x1.FFFDC6p-2f); - const __m512 vc1 = _mm512_set1_ps(0x1.FFFFF6p-1f); - const __m512 vc0 = _mm512_set1_ps(1.0f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vc0); - - const __m512 vi_max = _mm512_set1_ps(*max); - - __m512 vacc0 = _mm512_setzero_ps(); - __m512 vacc1 = _mm512_setzero_ps(); - __m512 vacc2 = _mm512_setzero_ps(); - __m512 vacc3 = _mm512_setzero_ps(); - __m512 vacc4 = _mm512_setzero_ps(); - __m512 vacc5 = _mm512_setzero_ps(); - for (; batch >= 192 * sizeof(float); batch -= 192 * sizeof(float)) { - const __m512 vi0 = _mm512_loadu_ps(input); - const __m512 vi1 = _mm512_loadu_ps(input + 16); - const __m512 vi2 = _mm512_loadu_ps(input + 32); - const __m512 vi3 = _mm512_loadu_ps(input + 48); - const __m512 vi4 = _mm512_loadu_ps(input + 64); - const __m512 vi5 = _mm512_loadu_ps(input + 80); - const __m512 vi6 = _mm512_loadu_ps(input + 96); - const __m512 vi7 = _mm512_loadu_ps(input + 112); - const __m512 vi8 = _mm512_loadu_ps(input + 128); - 
const __m512 vi9 = _mm512_loadu_ps(input + 144); - const __m512 vi10 = _mm512_loadu_ps(input + 160); - const __m512 vi11 = _mm512_loadu_ps(input + 176); - input += 192; - - const __m512 vx0 = _mm512_sub_ps(vi0, vi_max); - const __m512 vx1 = _mm512_sub_ps(vi1, vi_max); - const __m512 vx2 = _mm512_sub_ps(vi2, vi_max); - const __m512 vx3 = _mm512_sub_ps(vi3, vi_max); - const __m512 vx4 = _mm512_sub_ps(vi4, vi_max); - const __m512 vx5 = _mm512_sub_ps(vi5, vi_max); - const __m512 vx6 = _mm512_sub_ps(vi6, vi_max); - const __m512 vx7 = _mm512_sub_ps(vi7, vi_max); - const __m512 vx8 = _mm512_sub_ps(vi8, vi_max); - const __m512 vx9 = _mm512_sub_ps(vi9, vi_max); - const __m512 vx10 = _mm512_sub_ps(vi10, vi_max); - const __m512 vx11 = _mm512_sub_ps(vi11, vi_max); - - const __m512 vn0 = _mm512_roundscale_ps(_mm512_mul_ps(vx0, vlog2e), 0); - const __m512 vn1 = _mm512_roundscale_ps(_mm512_mul_ps(vx1, vlog2e), 0); - const __m512 vn2 = _mm512_roundscale_ps(_mm512_mul_ps(vx2, vlog2e), 0); - const __m512 vn3 = _mm512_roundscale_ps(_mm512_mul_ps(vx3, vlog2e), 0); - const __m512 vn4 = _mm512_roundscale_ps(_mm512_mul_ps(vx4, vlog2e), 0); - const __m512 vn5 = _mm512_roundscale_ps(_mm512_mul_ps(vx5, vlog2e), 0); - const __m512 vn6 = _mm512_roundscale_ps(_mm512_mul_ps(vx6, vlog2e), 0); - const __m512 vn7 = _mm512_roundscale_ps(_mm512_mul_ps(vx7, vlog2e), 0); - const __m512 vn8 = _mm512_roundscale_ps(_mm512_mul_ps(vx8, vlog2e), 0); - const __m512 vn9 = _mm512_roundscale_ps(_mm512_mul_ps(vx9, vlog2e), 0); - const __m512 vn10 = _mm512_roundscale_ps(_mm512_mul_ps(vx10, vlog2e), 0); - const __m512 vn11 = _mm512_roundscale_ps(_mm512_mul_ps(vx11, vlog2e), 0); - - const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0); - const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1); - const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2); - const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3); - const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4); - const __m512 vt5 = 
_mm512_fmadd_ps(vn5, vminus_ln2, vx5); - const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6); - const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7); - const __m512 vt8 = _mm512_fmadd_ps(vn8, vminus_ln2, vx8); - const __m512 vt9 = _mm512_fmadd_ps(vn9, vminus_ln2, vx9); - const __m512 vt10 = _mm512_fmadd_ps(vn10, vminus_ln2, vx10); - const __m512 vt11 = _mm512_fmadd_ps(vn11, vminus_ln2, vx11); - - __m512 vp0 = _mm512_fmadd_ps(vc5, vt0, vc4); - __m512 vp1 = _mm512_fmadd_ps(vc5, vt1, vc4); - __m512 vp2 = _mm512_fmadd_ps(vc5, vt2, vc4); - __m512 vp3 = _mm512_fmadd_ps(vc5, vt3, vc4); - __m512 vp4 = _mm512_fmadd_ps(vc5, vt4, vc4); - __m512 vp5 = _mm512_fmadd_ps(vc5, vt5, vc4); - __m512 vp6 = _mm512_fmadd_ps(vc5, vt6, vc4); - __m512 vp7 = _mm512_fmadd_ps(vc5, vt7, vc4); - __m512 vp8 = _mm512_fmadd_ps(vc5, vt8, vc4); - __m512 vp9 = _mm512_fmadd_ps(vc5, vt9, vc4); - __m512 vp10 = _mm512_fmadd_ps(vc5, vt10, vc4); - __m512 vp11 = _mm512_fmadd_ps(vc5, vt11, vc4); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc3); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc3); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc3); - vp10 = _mm512_fmadd_ps(vp10, vt10, vc3); - vp11 = _mm512_fmadd_ps(vp11, vt11, vc3); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc2); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc2); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc2); - vp10 = _mm512_fmadd_ps(vp10, vt10, vc2); - vp11 = _mm512_fmadd_ps(vp11, vt11, vc2); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc1); - vp1 
= _mm512_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc1); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc1); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc1); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc1); - vp10 = _mm512_fmadd_ps(vp10, vt10, vc1); - vp11 = _mm512_fmadd_ps(vp11, vt11, vc1); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc0); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc0); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc0); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc0); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc0); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc0); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc0); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc0); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc0); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc0); - vp10 = _mm512_fmadd_ps(vp10, vt10, vc0); - vp11 = _mm512_fmadd_ps(vp11, vt11, vc0); - - const __m512 vf0 = _mm512_scalef_ps(vp0, vn0); - const __m512 vf1 = _mm512_scalef_ps(vp1, vn1); - const __m512 vf2 = _mm512_scalef_ps(vp2, vn2); - const __m512 vf3 = _mm512_scalef_ps(vp3, vn3); - const __m512 vf4 = _mm512_scalef_ps(vp4, vn4); - const __m512 vf5 = _mm512_scalef_ps(vp5, vn5); - const __m512 vf6 = _mm512_scalef_ps(vp6, vn6); - const __m512 vf7 = _mm512_scalef_ps(vp7, vn7); - const __m512 vf8 = _mm512_scalef_ps(vp8, vn8); - const __m512 vf9 = _mm512_scalef_ps(vp9, vn9); - const __m512 vf10 = _mm512_scalef_ps(vp10, vn10); - const __m512 vf11 = _mm512_scalef_ps(vp11, vn11); - - _mm512_storeu_ps(output, vf0); - _mm512_storeu_ps(output + 16, vf1); - _mm512_storeu_ps(output + 32, vf2); - _mm512_storeu_ps(output + 48, vf3); - _mm512_storeu_ps(output + 64, vf4); - _mm512_storeu_ps(output + 80, vf5); - _mm512_storeu_ps(output + 96, vf6); - _mm512_storeu_ps(output + 112, vf7); - _mm512_storeu_ps(output + 128, vf8); - _mm512_storeu_ps(output + 144, vf9); - _mm512_storeu_ps(output + 160, vf10); - _mm512_storeu_ps(output + 176, vf11); - output += 
192; - - vacc0 = _mm512_add_ps(vacc0, vf0); - vacc1 = _mm512_add_ps(vacc1, vf1); - vacc2 = _mm512_add_ps(vacc2, vf2); - vacc3 = _mm512_add_ps(vacc3, vf3); - vacc4 = _mm512_add_ps(vacc4, vf4); - vacc5 = _mm512_add_ps(vacc5, vf5); - vacc0 = _mm512_add_ps(vacc0, vf6); - vacc1 = _mm512_add_ps(vacc1, vf7); - vacc2 = _mm512_add_ps(vacc2, vf8); - vacc3 = _mm512_add_ps(vacc3, vf9); - vacc4 = _mm512_add_ps(vacc4, vf10); - vacc5 = _mm512_add_ps(vacc5, vf11); - } - vacc0 = _mm512_add_ps(vacc0, vacc1); - vacc2 = _mm512_add_ps(vacc2, vacc3); - vacc4 = _mm512_add_ps(vacc4, vacc5); - vacc0 = _mm512_add_ps(vacc0, vacc2); - vacc0 = _mm512_add_ps(vacc0, vacc4); - - __m512 vacc = vacc0; - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const __m512 vi = _mm512_loadu_ps(input); - input += 16; - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_storeu_ps(output, vf); - output += 16; - - vacc = _mm512_add_ps(vacc, vf); - } - if (batch != 0) { - // Prepare mask for valid 32-bit batch (depends on batch). 
- batch >>= XNN_LOG2_SIZEOF_FLOAT; - const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - const __m512 vi = _mm512_maskz_loadu_ps(vmask, input); - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_mask_storeu_ps(output, vmask, vf); - - vacc = _mm512_mask_add_ps(vacc, vmask, vacc, vf); - } - *sum = _mm512_reduce_add_ps(vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192.c deleted file mode 100644 index 62f3a1be691..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u192.c +++ /dev/null @@ -1,264 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f); - const __m512 vminus_ln2 = _mm512_set1_ps(-0x1.62E430p-1f); - const __m512 vc5 = _mm512_set1_ps(0x1.0F9F9Cp-7f); - const __m512 vc4 = _mm512_set1_ps(0x1.573A1Ap-5f); - const __m512 vc3 = _mm512_set1_ps(0x1.555A80p-3f); - const __m512 vc2 = _mm512_set1_ps(0x1.FFFDC6p-2f); - const __m512 vc1 = _mm512_set1_ps(0x1.FFFFF6p-1f); - const __m512 vc0 = _mm512_set1_ps(1.0f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vminus_ln2); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vc0); - - const __m512 vi_max = _mm512_set1_ps(*max); - - __m512 vacc0 = _mm512_setzero_ps(); - for (; batch >= 192 * sizeof(float); batch -= 192 * sizeof(float)) { - const __m512 vi0 = _mm512_loadu_ps(input); - const __m512 vi1 = _mm512_loadu_ps(input + 16); - const __m512 vi2 = _mm512_loadu_ps(input + 32); - const __m512 vi3 = _mm512_loadu_ps(input + 48); - const __m512 vi4 = _mm512_loadu_ps(input + 64); - const __m512 vi5 = _mm512_loadu_ps(input + 80); - const __m512 vi6 = _mm512_loadu_ps(input + 96); - const __m512 vi7 = _mm512_loadu_ps(input + 112); - const __m512 vi8 = _mm512_loadu_ps(input + 128); - const __m512 vi9 = _mm512_loadu_ps(input + 144); - const __m512 vi10 = _mm512_loadu_ps(input + 160); - const __m512 vi11 = _mm512_loadu_ps(input + 176); - input += 192; - - const __m512 vx0 = 
_mm512_sub_ps(vi0, vi_max); - const __m512 vx1 = _mm512_sub_ps(vi1, vi_max); - const __m512 vx2 = _mm512_sub_ps(vi2, vi_max); - const __m512 vx3 = _mm512_sub_ps(vi3, vi_max); - const __m512 vx4 = _mm512_sub_ps(vi4, vi_max); - const __m512 vx5 = _mm512_sub_ps(vi5, vi_max); - const __m512 vx6 = _mm512_sub_ps(vi6, vi_max); - const __m512 vx7 = _mm512_sub_ps(vi7, vi_max); - const __m512 vx8 = _mm512_sub_ps(vi8, vi_max); - const __m512 vx9 = _mm512_sub_ps(vi9, vi_max); - const __m512 vx10 = _mm512_sub_ps(vi10, vi_max); - const __m512 vx11 = _mm512_sub_ps(vi11, vi_max); - - const __m512 vn0 = _mm512_roundscale_ps(_mm512_mul_ps(vx0, vlog2e), 0); - const __m512 vn1 = _mm512_roundscale_ps(_mm512_mul_ps(vx1, vlog2e), 0); - const __m512 vn2 = _mm512_roundscale_ps(_mm512_mul_ps(vx2, vlog2e), 0); - const __m512 vn3 = _mm512_roundscale_ps(_mm512_mul_ps(vx3, vlog2e), 0); - const __m512 vn4 = _mm512_roundscale_ps(_mm512_mul_ps(vx4, vlog2e), 0); - const __m512 vn5 = _mm512_roundscale_ps(_mm512_mul_ps(vx5, vlog2e), 0); - const __m512 vn6 = _mm512_roundscale_ps(_mm512_mul_ps(vx6, vlog2e), 0); - const __m512 vn7 = _mm512_roundscale_ps(_mm512_mul_ps(vx7, vlog2e), 0); - const __m512 vn8 = _mm512_roundscale_ps(_mm512_mul_ps(vx8, vlog2e), 0); - const __m512 vn9 = _mm512_roundscale_ps(_mm512_mul_ps(vx9, vlog2e), 0); - const __m512 vn10 = _mm512_roundscale_ps(_mm512_mul_ps(vx10, vlog2e), 0); - const __m512 vn11 = _mm512_roundscale_ps(_mm512_mul_ps(vx11, vlog2e), 0); - - const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0); - const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1); - const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2); - const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3); - const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4); - const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5); - const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6); - const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7); - const __m512 vt8 = _mm512_fmadd_ps(vn8, 
vminus_ln2, vx8); - const __m512 vt9 = _mm512_fmadd_ps(vn9, vminus_ln2, vx9); - const __m512 vt10 = _mm512_fmadd_ps(vn10, vminus_ln2, vx10); - const __m512 vt11 = _mm512_fmadd_ps(vn11, vminus_ln2, vx11); - - __m512 vp0 = _mm512_fmadd_ps(vc5, vt0, vc4); - __m512 vp1 = _mm512_fmadd_ps(vc5, vt1, vc4); - __m512 vp2 = _mm512_fmadd_ps(vc5, vt2, vc4); - __m512 vp3 = _mm512_fmadd_ps(vc5, vt3, vc4); - __m512 vp4 = _mm512_fmadd_ps(vc5, vt4, vc4); - __m512 vp5 = _mm512_fmadd_ps(vc5, vt5, vc4); - __m512 vp6 = _mm512_fmadd_ps(vc5, vt6, vc4); - __m512 vp7 = _mm512_fmadd_ps(vc5, vt7, vc4); - __m512 vp8 = _mm512_fmadd_ps(vc5, vt8, vc4); - __m512 vp9 = _mm512_fmadd_ps(vc5, vt9, vc4); - __m512 vp10 = _mm512_fmadd_ps(vc5, vt10, vc4); - __m512 vp11 = _mm512_fmadd_ps(vc5, vt11, vc4); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc3); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc3); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc3); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc3); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc3); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc3); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc3); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc3); - vp10 = _mm512_fmadd_ps(vp10, vt10, vc3); - vp11 = _mm512_fmadd_ps(vp11, vt11, vc3); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc2); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc2); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc2); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc2); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc2); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc2); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc2); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc2); - vp10 = _mm512_fmadd_ps(vp10, vt10, vc2); - vp11 = _mm512_fmadd_ps(vp11, vt11, vc2); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc1); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc1); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc1); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc1); - vp6 = 
_mm512_fmadd_ps(vp6, vt6, vc1); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc1); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc1); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc1); - vp10 = _mm512_fmadd_ps(vp10, vt10, vc1); - vp11 = _mm512_fmadd_ps(vp11, vt11, vc1); - - vp0 = _mm512_fmadd_ps(vp0, vt0, vc0); - vp1 = _mm512_fmadd_ps(vp1, vt1, vc0); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc0); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc0); - vp4 = _mm512_fmadd_ps(vp4, vt4, vc0); - vp5 = _mm512_fmadd_ps(vp5, vt5, vc0); - vp6 = _mm512_fmadd_ps(vp6, vt6, vc0); - vp7 = _mm512_fmadd_ps(vp7, vt7, vc0); - vp8 = _mm512_fmadd_ps(vp8, vt8, vc0); - vp9 = _mm512_fmadd_ps(vp9, vt9, vc0); - vp10 = _mm512_fmadd_ps(vp10, vt10, vc0); - vp11 = _mm512_fmadd_ps(vp11, vt11, vc0); - - const __m512 vf0 = _mm512_scalef_ps(vp0, vn0); - const __m512 vf1 = _mm512_scalef_ps(vp1, vn1); - const __m512 vf2 = _mm512_scalef_ps(vp2, vn2); - const __m512 vf3 = _mm512_scalef_ps(vp3, vn3); - const __m512 vf4 = _mm512_scalef_ps(vp4, vn4); - const __m512 vf5 = _mm512_scalef_ps(vp5, vn5); - const __m512 vf6 = _mm512_scalef_ps(vp6, vn6); - const __m512 vf7 = _mm512_scalef_ps(vp7, vn7); - const __m512 vf8 = _mm512_scalef_ps(vp8, vn8); - const __m512 vf9 = _mm512_scalef_ps(vp9, vn9); - const __m512 vf10 = _mm512_scalef_ps(vp10, vn10); - const __m512 vf11 = _mm512_scalef_ps(vp11, vn11); - - _mm512_storeu_ps(output, vf0); - _mm512_storeu_ps(output + 16, vf1); - _mm512_storeu_ps(output + 32, vf2); - _mm512_storeu_ps(output + 48, vf3); - _mm512_storeu_ps(output + 64, vf4); - _mm512_storeu_ps(output + 80, vf5); - _mm512_storeu_ps(output + 96, vf6); - _mm512_storeu_ps(output + 112, vf7); - _mm512_storeu_ps(output + 128, vf8); - _mm512_storeu_ps(output + 144, vf9); - _mm512_storeu_ps(output + 160, vf10); - _mm512_storeu_ps(output + 176, vf11); - output += 192; - - vacc0 = _mm512_add_ps(vacc0, vf0); - vacc0 = _mm512_add_ps(vacc0, vf1); - vacc0 = _mm512_add_ps(vacc0, vf2); - vacc0 = _mm512_add_ps(vacc0, vf3); - vacc0 = _mm512_add_ps(vacc0, vf4); - vacc0 = 
_mm512_add_ps(vacc0, vf5); - vacc0 = _mm512_add_ps(vacc0, vf6); - vacc0 = _mm512_add_ps(vacc0, vf7); - vacc0 = _mm512_add_ps(vacc0, vf8); - vacc0 = _mm512_add_ps(vacc0, vf9); - vacc0 = _mm512_add_ps(vacc0, vf10); - vacc0 = _mm512_add_ps(vacc0, vf11); - } - - __m512 vacc = vacc0; - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const __m512 vi = _mm512_loadu_ps(input); - input += 16; - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_storeu_ps(output, vf); - output += 16; - - vacc = _mm512_add_ps(vacc, vf); - } - if (batch != 0) { - // Prepare mask for valid 32-bit batch (depends on batch). 
- batch >>= XNN_LOG2_SIZEOF_FLOAT; - const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); - - const __m512 vi = _mm512_maskz_loadu_ps(vmask, input); - - const __m512 vx = _mm512_sub_ps(vi, vi_max); - - const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0); - - const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx); - - __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4); - vp = _mm512_fmadd_ps(vp, vt, vc3); - vp = _mm512_fmadd_ps(vp, vt, vc2); - vp = _mm512_fmadd_ps(vp, vt, vc1); - vp = _mm512_fmadd_ps(vp, vt, vc0); - - const __m512 vf = _mm512_scalef_ps(vp, vn); - - _mm512_mask_storeu_ps(output, vmask, vf); - - vacc = _mm512_mask_add_ps(vacc, vmask, vacc, vf); - } - *sum = _mm512_reduce_add_ps(vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u32-acc2.c similarity index 76% rename from src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64.c rename to src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u32-acc2.c index e6ade63d2b1..d32f2237bb5 100644 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64.c +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u32-acc2.c @@ -15,7 +15,7 @@ #include "xnnpack/raddstoreexpminusmax.h" -void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64( +void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u32_acc2( size_t batch, const float* input, const float* max, @@ -51,69 +51,47 @@ void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64( const __m512 vi_max = _mm512_set1_ps(*max); __m512 vacc0 = _mm512_setzero_ps(); - for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { + __m512 vacc1 = _mm512_setzero_ps(); + for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { 
const __m512 vi0 = _mm512_loadu_ps(input); const __m512 vi1 = _mm512_loadu_ps(input + 16); - const __m512 vi2 = _mm512_loadu_ps(input + 32); - const __m512 vi3 = _mm512_loadu_ps(input + 48); - input += 64; + input += 32; const __m512 vx0 = _mm512_sub_ps(vi0, vi_max); const __m512 vx1 = _mm512_sub_ps(vi1, vi_max); - const __m512 vx2 = _mm512_sub_ps(vi2, vi_max); - const __m512 vx3 = _mm512_sub_ps(vi3, vi_max); const __m512 vn0 = _mm512_roundscale_ps(_mm512_mul_ps(vx0, vlog2e), 0); const __m512 vn1 = _mm512_roundscale_ps(_mm512_mul_ps(vx1, vlog2e), 0); - const __m512 vn2 = _mm512_roundscale_ps(_mm512_mul_ps(vx2, vlog2e), 0); - const __m512 vn3 = _mm512_roundscale_ps(_mm512_mul_ps(vx3, vlog2e), 0); const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0); const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1); - const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2); - const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3); __m512 vp0 = _mm512_fmadd_ps(vc5, vt0, vc4); __m512 vp1 = _mm512_fmadd_ps(vc5, vt1, vc4); - __m512 vp2 = _mm512_fmadd_ps(vc5, vt2, vc4); - __m512 vp3 = _mm512_fmadd_ps(vc5, vt3, vc4); vp0 = _mm512_fmadd_ps(vp0, vt0, vc3); vp1 = _mm512_fmadd_ps(vp1, vt1, vc3); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc3); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc3); vp0 = _mm512_fmadd_ps(vp0, vt0, vc2); vp1 = _mm512_fmadd_ps(vp1, vt1, vc2); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc2); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc2); vp0 = _mm512_fmadd_ps(vp0, vt0, vc1); vp1 = _mm512_fmadd_ps(vp1, vt1, vc1); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc1); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc1); vp0 = _mm512_fmadd_ps(vp0, vt0, vc0); vp1 = _mm512_fmadd_ps(vp1, vt1, vc0); - vp2 = _mm512_fmadd_ps(vp2, vt2, vc0); - vp3 = _mm512_fmadd_ps(vp3, vt3, vc0); const __m512 vf0 = _mm512_scalef_ps(vp0, vn0); const __m512 vf1 = _mm512_scalef_ps(vp1, vn1); - const __m512 vf2 = _mm512_scalef_ps(vp2, vn2); - const __m512 vf3 = _mm512_scalef_ps(vp3, vn3); _mm512_storeu_ps(output, vf0); _mm512_storeu_ps(output + 
16, vf1); - _mm512_storeu_ps(output + 32, vf2); - _mm512_storeu_ps(output + 48, vf3); - output += 64; + output += 32; vacc0 = _mm512_add_ps(vacc0, vf0); - vacc0 = _mm512_add_ps(vacc0, vf1); - vacc0 = _mm512_add_ps(vacc0, vf2); - vacc0 = _mm512_add_ps(vacc0, vf3); + vacc1 = _mm512_add_ps(vacc1, vf1); } + vacc0 = _mm512_add_ps(vacc0, vacc1); __m512 vacc = vacc0; for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128-acc3.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128-acc3.c deleted file mode 100644 index 652387f65d7..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128-acc3.c +++ /dev/null @@ -1,226 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/hvx-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- - -#include - -#include "xnnpack/simd/f32-hvx.h" -#include "xnnpack/raddstoreexpminusmax.h" - -void xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128_acc3( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const HVX_Vector vi_max = xnn_set1_f32(*max); - const HVX_Vector vlog2e = xnn_set1_f32(0x1.715476p+0f); - const HVX_Vector vmagic_bias = xnn_set1_f32(0x1.8000FEp23f); - const HVX_Vector vminus_ln2_hi = xnn_set1_f32(-0x1.62E400p-1f); - const HVX_Vector vminus_ln2_lo = xnn_set1_f32(-0x1.7F7D1Cp-20f); - const HVX_Vector vc5 = xnn_set1_f32(0x1.0F9F9Cp-7f); - const HVX_Vector vc4 = xnn_set1_f32(0x1.573A1Ap-5f); - const HVX_Vector vc3 = xnn_set1_f32(0x1.555A80p-3f); - const HVX_Vector vc2 = xnn_set1_f32(0x1.FFFDC6p-2f); - const HVX_Vector vc1 = xnn_set1_f32(0x1.FFFFF6p-1f); - const HVX_Vector vdenorm_cutoff = xnn_set1_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - HVX_Vector vacc0 = Q6_V_vzero(); - HVX_Vector vacc1 = vacc0; - HVX_Vector vacc2 = vacc0; - for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) { - const HVX_Vector vi0 = xnn_loadu_f32(input); - const HVX_Vector vi1 = xnn_loadu_f32(input + 32); - const HVX_Vector vi2 = xnn_loadu_f32(input + 64); - const HVX_Vector vi3 = xnn_loadu_f32(input + 96); - input += 128; - - // Subtract maximum input x := i - i_max - const HVX_Vector vx0 = xnn_sub_f32(vi0, vi_max); - const HVX_Vector vx1 = 
xnn_sub_f32(vi1, vi_max); - const HVX_Vector vx2 = xnn_sub_f32(vi2, vi_max); - const HVX_Vector vx3 = xnn_sub_f32(vi3, vi_max); - - // n := round(x / log(2)) - HVX_Vector vn0 = xnn_fmadd_f32(vx0, vlog2e, vmagic_bias); - HVX_Vector vn1 = xnn_fmadd_f32(vx1, vlog2e, vmagic_bias); - HVX_Vector vn2 = xnn_fmadd_f32(vx2, vlog2e, vmagic_bias); - HVX_Vector vn3 = xnn_fmadd_f32(vx3, vlog2e, vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow. - const HVX_Vector vs0 = Q6_Vw_vasl_VwR(vn0, 23); - const HVX_Vector vs1 = Q6_Vw_vasl_VwR(vn1, 23); - const HVX_Vector vs2 = Q6_Vw_vasl_VwR(vn2, 23); - const HVX_Vector vs3 = Q6_Vw_vasl_VwR(vn3, 23); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = xnn_sub_f32(vn0, vmagic_bias); - vn1 = xnn_sub_f32(vn1, vmagic_bias); - vn2 = xnn_sub_f32(vn2, vmagic_bias); - vn3 = xnn_sub_f32(vn3, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - HVX_Vector vt0 = xnn_fmadd_f32(vn0, vminus_ln2_hi, vx0); - HVX_Vector vt1 = xnn_fmadd_f32(vn1, vminus_ln2_hi, vx1); - HVX_Vector vt2 = xnn_fmadd_f32(vn2, vminus_ln2_hi, vx2); - HVX_Vector vt3 = xnn_fmadd_f32(vn3, vminus_ln2_hi, vx3); - - vt0 = xnn_fmadd_f32(vn0, vminus_ln2_lo, vt0); - vt1 = xnn_fmadd_f32(vn1, vminus_ln2_lo, vt1); - vt2 = xnn_fmadd_f32(vn2, vminus_ln2_lo, vt2); - vt3 = xnn_fmadd_f32(vn3, vminus_ln2_lo, vt3); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- // p := c5 * t + c4; - // p = p * t + c3; - // p = p * t + c2; - // p = p * t + c1; - HVX_Vector vp0 = xnn_fmadd_f32(vc5, vt0, vc4); - HVX_Vector vp1 = xnn_fmadd_f32(vc5, vt1, vc4); - HVX_Vector vp2 = xnn_fmadd_f32(vc5, vt2, vc4); - HVX_Vector vp3 = xnn_fmadd_f32(vc5, vt3, vc4); - - vp0 = xnn_fmadd_f32(vp0, vt0, vc3); - vp1 = xnn_fmadd_f32(vp1, vt1, vc3); - vp2 = xnn_fmadd_f32(vp2, vt2, vc3); - vp3 = xnn_fmadd_f32(vp3, vt3, vc3); - - vp0 = xnn_fmadd_f32(vp0, vt0, vc2); - vp1 = xnn_fmadd_f32(vp1, vt1, vc2); - vp2 = xnn_fmadd_f32(vp2, vt2, vc2); - vp3 = xnn_fmadd_f32(vp3, vt3, vc2); - - vp0 = xnn_fmadd_f32(vp0, vt0, vc1); - vp1 = xnn_fmadd_f32(vp1, vt1, vc1); - vp2 = xnn_fmadd_f32(vp2, vt2, vc1); - vp3 = xnn_fmadd_f32(vp3, vt3, vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = xnn_mul_f32(vt0, vs0); - vt1 = xnn_mul_f32(vt1, vs1); - vt2 = xnn_mul_f32(vt2, vs2); - vt3 = xnn_mul_f32(vt3, vs3); - - HVX_Vector vf0 = xnn_fmadd_f32(vt0, vp0, vs0); - HVX_Vector vf1 = xnn_fmadd_f32(vt1, vp1, vs1); - HVX_Vector vf2 = xnn_fmadd_f32(vt2, vp2, vs2); - HVX_Vector vf3 = xnn_fmadd_f32(vt3, vp3, vs3); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
- vf0 = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx0), vf0); - vf1 = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx1), vf1); - vf2 = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx2), vf2); - vf3 = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx3), vf3); - - xnn_storeu_f32(output, vf0); - xnn_storeu_f32(output + 32, vf1); - xnn_storeu_f32(output + 64, vf2); - xnn_storeu_f32(output + 96, vf3); - output += 128; - - vacc0 = xnn_add_f32(vacc0, vf0); - vacc2 = xnn_add_f32(vacc2, vf1); - vacc1 = xnn_add_f32(vacc1, vf2); - vacc0 = xnn_add_f32(vacc0, vf3); - } - vacc0 = xnn_add_f32(vacc0, vacc1); - vacc0 = xnn_add_f32(vacc0, vacc2); - - HVX_Vector vacc = vacc0; - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { - const HVX_Vector vi = xnn_loadu_f32(input); - input += 32; - - const HVX_Vector vx = xnn_sub_f32(vi, vi_max); - - HVX_Vector vn = xnn_fmadd_f32(vx, vlog2e, vmagic_bias); - - const HVX_Vector vs = Q6_Vw_vasl_VwR(vn, 23); - - vn = xnn_sub_f32(vn, vmagic_bias); - - HVX_Vector vt = xnn_fmadd_f32(vn, vminus_ln2_hi, vx); - vt = xnn_fmadd_f32(vn, vminus_ln2_lo, vt); - - HVX_Vector vp = xnn_fmadd_f32(vc5, vt, vc4); - vp = xnn_fmadd_f32(vp, vt, vc3); - vp = xnn_fmadd_f32(vp, vt, vc2); - vp = xnn_fmadd_f32(vp, vt, vc1); - - vt = xnn_mul_f32(vt, vs); - HVX_Vector vf = xnn_fmadd_f32(vt, vp, vs); - - vf = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx), vf); - - xnn_storeu_f32(output, vf); - output += 32; - - vacc = xnn_add_f32(vacc, vf); - } - - float vacc_lo = Q6_f32_vrsum_Vsf(vacc); - if XNN_UNLIKELY(batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch < 32 * sizeof(float)); - - const HVX_Vector vi = xnn_loadu_f32(input); - - const HVX_Vector vx = xnn_sub_f32(vi, vi_max); - - HVX_Vector vn = xnn_fmadd_f32(vx, vlog2e, vmagic_bias); - - const HVX_Vector vs = Q6_Vw_vasl_VwR(vn, 23); - - vn = xnn_sub_f32(vn, vmagic_bias); - - HVX_Vector vt = xnn_fmadd_f32(vn, vminus_ln2_hi, vx); - vt = xnn_fmadd_f32(vn, 
vminus_ln2_lo, vt); - - HVX_Vector vp = xnn_fmadd_f32(vc5, vt, vc4); - vp = xnn_fmadd_f32(vp, vt, vc3); - vp = xnn_fmadd_f32(vp, vt, vc2); - vp = xnn_fmadd_f32(vp, vt, vc1); - - vt = xnn_mul_f32(vt, vs); - HVX_Vector vf = xnn_fmadd_f32(vt, vp, vs); - - vf = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx), vf); - - Q6_V_vstu_variable(output, batch, vf); - - vf = Q6_V_vand_QV(Q6_Q_vsetq_R(batch), vf); - vacc_lo += Q6_f32_vrsum_Vsf(vf); - } - *sum = vacc_lo; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128.c deleted file mode 100644 index 54e98f115d1..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128.c +++ /dev/null @@ -1,222 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/hvx-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- - -#include - -#include "xnnpack/simd/f32-hvx.h" -#include "xnnpack/raddstoreexpminusmax.h" - -void xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const HVX_Vector vi_max = xnn_set1_f32(*max); - const HVX_Vector vlog2e = xnn_set1_f32(0x1.715476p+0f); - const HVX_Vector vmagic_bias = xnn_set1_f32(0x1.8000FEp23f); - const HVX_Vector vminus_ln2_hi = xnn_set1_f32(-0x1.62E400p-1f); - const HVX_Vector vminus_ln2_lo = xnn_set1_f32(-0x1.7F7D1Cp-20f); - const HVX_Vector vc5 = xnn_set1_f32(0x1.0F9F9Cp-7f); - const HVX_Vector vc4 = xnn_set1_f32(0x1.573A1Ap-5f); - const HVX_Vector vc3 = xnn_set1_f32(0x1.555A80p-3f); - const HVX_Vector vc2 = xnn_set1_f32(0x1.FFFDC6p-2f); - const HVX_Vector vc1 = xnn_set1_f32(0x1.FFFFF6p-1f); - const HVX_Vector vdenorm_cutoff = xnn_set1_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - HVX_Vector vacc0 = Q6_V_vzero(); - for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) { - const HVX_Vector vi0 = xnn_loadu_f32(input); - const HVX_Vector vi1 = xnn_loadu_f32(input + 32); - const HVX_Vector vi2 = xnn_loadu_f32(input + 64); - const HVX_Vector vi3 = xnn_loadu_f32(input + 96); - input += 128; - - // Subtract maximum input x := i - i_max - const HVX_Vector vx0 = xnn_sub_f32(vi0, vi_max); - const HVX_Vector vx1 = xnn_sub_f32(vi1, vi_max); - const HVX_Vector vx2 = 
xnn_sub_f32(vi2, vi_max); - const HVX_Vector vx3 = xnn_sub_f32(vi3, vi_max); - - // n := round(x / log(2)) - HVX_Vector vn0 = xnn_fmadd_f32(vx0, vlog2e, vmagic_bias); - HVX_Vector vn1 = xnn_fmadd_f32(vx1, vlog2e, vmagic_bias); - HVX_Vector vn2 = xnn_fmadd_f32(vx2, vlog2e, vmagic_bias); - HVX_Vector vn3 = xnn_fmadd_f32(vx3, vlog2e, vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow. - const HVX_Vector vs0 = Q6_Vw_vasl_VwR(vn0, 23); - const HVX_Vector vs1 = Q6_Vw_vasl_VwR(vn1, 23); - const HVX_Vector vs2 = Q6_Vw_vasl_VwR(vn2, 23); - const HVX_Vector vs3 = Q6_Vw_vasl_VwR(vn3, 23); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = xnn_sub_f32(vn0, vmagic_bias); - vn1 = xnn_sub_f32(vn1, vmagic_bias); - vn2 = xnn_sub_f32(vn2, vmagic_bias); - vn3 = xnn_sub_f32(vn3, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - HVX_Vector vt0 = xnn_fmadd_f32(vn0, vminus_ln2_hi, vx0); - HVX_Vector vt1 = xnn_fmadd_f32(vn1, vminus_ln2_hi, vx1); - HVX_Vector vt2 = xnn_fmadd_f32(vn2, vminus_ln2_hi, vx2); - HVX_Vector vt3 = xnn_fmadd_f32(vn3, vminus_ln2_hi, vx3); - - vt0 = xnn_fmadd_f32(vn0, vminus_ln2_lo, vt0); - vt1 = xnn_fmadd_f32(vn1, vminus_ln2_lo, vt1); - vt2 = xnn_fmadd_f32(vn2, vminus_ln2_lo, vt2); - vt3 = xnn_fmadd_f32(vn3, vminus_ln2_lo, vt3); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- // p := c5 * t + c4; - // p = p * t + c3; - // p = p * t + c2; - // p = p * t + c1; - HVX_Vector vp0 = xnn_fmadd_f32(vc5, vt0, vc4); - HVX_Vector vp1 = xnn_fmadd_f32(vc5, vt1, vc4); - HVX_Vector vp2 = xnn_fmadd_f32(vc5, vt2, vc4); - HVX_Vector vp3 = xnn_fmadd_f32(vc5, vt3, vc4); - - vp0 = xnn_fmadd_f32(vp0, vt0, vc3); - vp1 = xnn_fmadd_f32(vp1, vt1, vc3); - vp2 = xnn_fmadd_f32(vp2, vt2, vc3); - vp3 = xnn_fmadd_f32(vp3, vt3, vc3); - - vp0 = xnn_fmadd_f32(vp0, vt0, vc2); - vp1 = xnn_fmadd_f32(vp1, vt1, vc2); - vp2 = xnn_fmadd_f32(vp2, vt2, vc2); - vp3 = xnn_fmadd_f32(vp3, vt3, vc2); - - vp0 = xnn_fmadd_f32(vp0, vt0, vc1); - vp1 = xnn_fmadd_f32(vp1, vt1, vc1); - vp2 = xnn_fmadd_f32(vp2, vt2, vc1); - vp3 = xnn_fmadd_f32(vp3, vt3, vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = xnn_mul_f32(vt0, vs0); - vt1 = xnn_mul_f32(vt1, vs1); - vt2 = xnn_mul_f32(vt2, vs2); - vt3 = xnn_mul_f32(vt3, vs3); - - HVX_Vector vf0 = xnn_fmadd_f32(vt0, vp0, vs0); - HVX_Vector vf1 = xnn_fmadd_f32(vt1, vp1, vs1); - HVX_Vector vf2 = xnn_fmadd_f32(vt2, vp2, vs2); - HVX_Vector vf3 = xnn_fmadd_f32(vt3, vp3, vs3); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
- vf0 = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx0), vf0); - vf1 = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx1), vf1); - vf2 = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx2), vf2); - vf3 = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx3), vf3); - - xnn_storeu_f32(output, vf0); - xnn_storeu_f32(output + 32, vf1); - xnn_storeu_f32(output + 64, vf2); - xnn_storeu_f32(output + 96, vf3); - output += 128; - - vacc0 = xnn_add_f32(vacc0, vf0); - vacc0 = xnn_add_f32(vacc0, vf1); - vacc0 = xnn_add_f32(vacc0, vf2); - vacc0 = xnn_add_f32(vacc0, vf3); - } - - HVX_Vector vacc = vacc0; - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { - const HVX_Vector vi = xnn_loadu_f32(input); - input += 32; - - const HVX_Vector vx = xnn_sub_f32(vi, vi_max); - - HVX_Vector vn = xnn_fmadd_f32(vx, vlog2e, vmagic_bias); - - const HVX_Vector vs = Q6_Vw_vasl_VwR(vn, 23); - - vn = xnn_sub_f32(vn, vmagic_bias); - - HVX_Vector vt = xnn_fmadd_f32(vn, vminus_ln2_hi, vx); - vt = xnn_fmadd_f32(vn, vminus_ln2_lo, vt); - - HVX_Vector vp = xnn_fmadd_f32(vc5, vt, vc4); - vp = xnn_fmadd_f32(vp, vt, vc3); - vp = xnn_fmadd_f32(vp, vt, vc2); - vp = xnn_fmadd_f32(vp, vt, vc1); - - vt = xnn_mul_f32(vt, vs); - HVX_Vector vf = xnn_fmadd_f32(vt, vp, vs); - - vf = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx), vf); - - xnn_storeu_f32(output, vf); - output += 32; - - vacc = xnn_add_f32(vacc, vf); - } - - float vacc_lo = Q6_f32_vrsum_Vsf(vacc); - if XNN_UNLIKELY(batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch < 32 * sizeof(float)); - - const HVX_Vector vi = xnn_loadu_f32(input); - - const HVX_Vector vx = xnn_sub_f32(vi, vi_max); - - HVX_Vector vn = xnn_fmadd_f32(vx, vlog2e, vmagic_bias); - - const HVX_Vector vs = Q6_Vw_vasl_VwR(vn, 23); - - vn = xnn_sub_f32(vn, vmagic_bias); - - HVX_Vector vt = xnn_fmadd_f32(vn, vminus_ln2_hi, vx); - vt = xnn_fmadd_f32(vn, vminus_ln2_lo, vt); - - HVX_Vector vp = xnn_fmadd_f32(vc5, vt, vc4); - vp = 
xnn_fmadd_f32(vp, vt, vc3); - vp = xnn_fmadd_f32(vp, vt, vc2); - vp = xnn_fmadd_f32(vp, vt, vc1); - - vt = xnn_mul_f32(vt, vs); - HVX_Vector vf = xnn_fmadd_f32(vt, vp, vs); - - vf = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx), vf); - - Q6_V_vstu_variable(output, batch, vf); - - vf = Q6_V_vand_QV(Q6_Q_vsetq_R(batch), vf); - vacc_lo += Q6_f32_vrsum_Vsf(vf); - } - *sum = vacc_lo; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u64.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u64.c deleted file mode 100644 index 848fcc4195f..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u64.c +++ /dev/null @@ -1,190 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/hvx-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- - -#include - -#include "xnnpack/simd/f32-hvx.h" -#include "xnnpack/raddstoreexpminusmax.h" - -void xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u64( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const HVX_Vector vi_max = xnn_set1_f32(*max); - const HVX_Vector vlog2e = xnn_set1_f32(0x1.715476p+0f); - const HVX_Vector vmagic_bias = xnn_set1_f32(0x1.8000FEp23f); - const HVX_Vector vminus_ln2_hi = xnn_set1_f32(-0x1.62E400p-1f); - const HVX_Vector vminus_ln2_lo = xnn_set1_f32(-0x1.7F7D1Cp-20f); - const HVX_Vector vc5 = xnn_set1_f32(0x1.0F9F9Cp-7f); - const HVX_Vector vc4 = xnn_set1_f32(0x1.573A1Ap-5f); - const HVX_Vector vc3 = xnn_set1_f32(0x1.555A80p-3f); - const HVX_Vector vc2 = xnn_set1_f32(0x1.FFFDC6p-2f); - const HVX_Vector vc1 = xnn_set1_f32(0x1.FFFFF6p-1f); - const HVX_Vector vdenorm_cutoff = xnn_set1_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - HVX_Vector vacc0 = Q6_V_vzero(); - for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { - const HVX_Vector vi0 = xnn_loadu_f32(input); - const HVX_Vector vi1 = xnn_loadu_f32(input + 32); - input += 64; - - // Subtract maximum input x := i - i_max - const HVX_Vector vx0 = xnn_sub_f32(vi0, vi_max); - const HVX_Vector vx1 = xnn_sub_f32(vi1, vi_max); - - // n := round(x / log(2)) - HVX_Vector vn0 = xnn_fmadd_f32(vx0, vlog2e, vmagic_bias); - HVX_Vector vn1 = xnn_fmadd_f32(vx1, vlog2e, 
vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow. - const HVX_Vector vs0 = Q6_Vw_vasl_VwR(vn0, 23); - const HVX_Vector vs1 = Q6_Vw_vasl_VwR(vn1, 23); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = xnn_sub_f32(vn0, vmagic_bias); - vn1 = xnn_sub_f32(vn1, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - HVX_Vector vt0 = xnn_fmadd_f32(vn0, vminus_ln2_hi, vx0); - HVX_Vector vt1 = xnn_fmadd_f32(vn1, vminus_ln2_hi, vx1); - - vt0 = xnn_fmadd_f32(vn0, vminus_ln2_lo, vt0); - vt1 = xnn_fmadd_f32(vn1, vminus_ln2_lo, vt1); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - // p := c5 * t + c4; - // p = p * t + c3; - // p = p * t + c2; - // p = p * t + c1; - HVX_Vector vp0 = xnn_fmadd_f32(vc5, vt0, vc4); - HVX_Vector vp1 = xnn_fmadd_f32(vc5, vt1, vc4); - - vp0 = xnn_fmadd_f32(vp0, vt0, vc3); - vp1 = xnn_fmadd_f32(vp1, vt1, vc3); - - vp0 = xnn_fmadd_f32(vp0, vt0, vc2); - vp1 = xnn_fmadd_f32(vp1, vt1, vc2); - - vp0 = xnn_fmadd_f32(vp0, vt0, vc1); - vp1 = xnn_fmadd_f32(vp1, vt1, vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = xnn_mul_f32(vt0, vs0); - vt1 = xnn_mul_f32(vt1, vs1); - - HVX_Vector vf0 = xnn_fmadd_f32(vt0, vp0, vs0); - HVX_Vector vf1 = xnn_fmadd_f32(vt1, vp1, vs1); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
- vf0 = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx0), vf0); - vf1 = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx1), vf1); - - xnn_storeu_f32(output, vf0); - xnn_storeu_f32(output + 32, vf1); - output += 64; - - vacc0 = xnn_add_f32(vacc0, vf0); - vacc0 = xnn_add_f32(vacc0, vf1); - } - - HVX_Vector vacc = vacc0; - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { - const HVX_Vector vi = xnn_loadu_f32(input); - input += 32; - - const HVX_Vector vx = xnn_sub_f32(vi, vi_max); - - HVX_Vector vn = xnn_fmadd_f32(vx, vlog2e, vmagic_bias); - - const HVX_Vector vs = Q6_Vw_vasl_VwR(vn, 23); - - vn = xnn_sub_f32(vn, vmagic_bias); - - HVX_Vector vt = xnn_fmadd_f32(vn, vminus_ln2_hi, vx); - vt = xnn_fmadd_f32(vn, vminus_ln2_lo, vt); - - HVX_Vector vp = xnn_fmadd_f32(vc5, vt, vc4); - vp = xnn_fmadd_f32(vp, vt, vc3); - vp = xnn_fmadd_f32(vp, vt, vc2); - vp = xnn_fmadd_f32(vp, vt, vc1); - - vt = xnn_mul_f32(vt, vs); - HVX_Vector vf = xnn_fmadd_f32(vt, vp, vs); - - vf = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx), vf); - - xnn_storeu_f32(output, vf); - output += 32; - - vacc = xnn_add_f32(vacc, vf); - } - - float vacc_lo = Q6_f32_vrsum_Vsf(vacc); - if XNN_UNLIKELY(batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch < 32 * sizeof(float)); - - const HVX_Vector vi = xnn_loadu_f32(input); - - const HVX_Vector vx = xnn_sub_f32(vi, vi_max); - - HVX_Vector vn = xnn_fmadd_f32(vx, vlog2e, vmagic_bias); - - const HVX_Vector vs = Q6_Vw_vasl_VwR(vn, 23); - - vn = xnn_sub_f32(vn, vmagic_bias); - - HVX_Vector vt = xnn_fmadd_f32(vn, vminus_ln2_hi, vx); - vt = xnn_fmadd_f32(vn, vminus_ln2_lo, vt); - - HVX_Vector vp = xnn_fmadd_f32(vc5, vt, vc4); - vp = xnn_fmadd_f32(vp, vt, vc3); - vp = xnn_fmadd_f32(vp, vt, vc2); - vp = xnn_fmadd_f32(vp, vt, vc1); - - vt = xnn_mul_f32(vt, vs); - HVX_Vector vf = xnn_fmadd_f32(vt, vp, vs); - - vf = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx), vf); - - Q6_V_vstu_variable(output, batch, vf); - - vf = 
Q6_V_vand_QV(Q6_Q_vsetq_R(batch), vf); - vacc_lo += Q6_f32_vrsum_Vsf(vf); - } - *sum = vacc_lo; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96-acc2.c deleted file mode 100644 index 5d2a5e8ba56..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96-acc2.c +++ /dev/null @@ -1,208 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/hvx-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - - -#include - -#include "xnnpack/simd/f32-hvx.h" -#include "xnnpack/raddstoreexpminusmax.h" - -void xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const HVX_Vector vi_max = xnn_set1_f32(*max); - const HVX_Vector vlog2e = xnn_set1_f32(0x1.715476p+0f); - const HVX_Vector vmagic_bias = xnn_set1_f32(0x1.8000FEp23f); - const HVX_Vector vminus_ln2_hi = xnn_set1_f32(-0x1.62E400p-1f); - const HVX_Vector vminus_ln2_lo = xnn_set1_f32(-0x1.7F7D1Cp-20f); - const HVX_Vector vc5 = xnn_set1_f32(0x1.0F9F9Cp-7f); - const HVX_Vector vc4 = xnn_set1_f32(0x1.573A1Ap-5f); - const HVX_Vector vc3 = xnn_set1_f32(0x1.555A80p-3f); - const HVX_Vector vc2 = xnn_set1_f32(0x1.FFFDC6p-2f); - const HVX_Vector vc1 = xnn_set1_f32(0x1.FFFFF6p-1f); - const HVX_Vector vdenorm_cutoff = xnn_set1_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - 
XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - HVX_Vector vacc0 = Q6_V_vzero(); - HVX_Vector vacc1 = vacc0; - for (; batch >= 96 * sizeof(float); batch -= 96 * sizeof(float)) { - const HVX_Vector vi0 = xnn_loadu_f32(input); - const HVX_Vector vi1 = xnn_loadu_f32(input + 32); - const HVX_Vector vi2 = xnn_loadu_f32(input + 64); - input += 96; - - // Subtract maximum input x := i - i_max - const HVX_Vector vx0 = xnn_sub_f32(vi0, vi_max); - const HVX_Vector vx1 = xnn_sub_f32(vi1, vi_max); - const HVX_Vector vx2 = xnn_sub_f32(vi2, vi_max); - - // n := round(x / log(2)) - HVX_Vector vn0 = xnn_fmadd_f32(vx0, vlog2e, vmagic_bias); - HVX_Vector vn1 = xnn_fmadd_f32(vx1, vlog2e, vmagic_bias); - HVX_Vector vn2 = xnn_fmadd_f32(vx2, vlog2e, vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow. - const HVX_Vector vs0 = Q6_Vw_vasl_VwR(vn0, 23); - const HVX_Vector vs1 = Q6_Vw_vasl_VwR(vn1, 23); - const HVX_Vector vs2 = Q6_Vw_vasl_VwR(vn2, 23); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = xnn_sub_f32(vn0, vmagic_bias); - vn1 = xnn_sub_f32(vn1, vmagic_bias); - vn2 = xnn_sub_f32(vn2, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- HVX_Vector vt0 = xnn_fmadd_f32(vn0, vminus_ln2_hi, vx0); - HVX_Vector vt1 = xnn_fmadd_f32(vn1, vminus_ln2_hi, vx1); - HVX_Vector vt2 = xnn_fmadd_f32(vn2, vminus_ln2_hi, vx2); - - vt0 = xnn_fmadd_f32(vn0, vminus_ln2_lo, vt0); - vt1 = xnn_fmadd_f32(vn1, vminus_ln2_lo, vt1); - vt2 = xnn_fmadd_f32(vn2, vminus_ln2_lo, vt2); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - // p := c5 * t + c4; - // p = p * t + c3; - // p = p * t + c2; - // p = p * t + c1; - HVX_Vector vp0 = xnn_fmadd_f32(vc5, vt0, vc4); - HVX_Vector vp1 = xnn_fmadd_f32(vc5, vt1, vc4); - HVX_Vector vp2 = xnn_fmadd_f32(vc5, vt2, vc4); - - vp0 = xnn_fmadd_f32(vp0, vt0, vc3); - vp1 = xnn_fmadd_f32(vp1, vt1, vc3); - vp2 = xnn_fmadd_f32(vp2, vt2, vc3); - - vp0 = xnn_fmadd_f32(vp0, vt0, vc2); - vp1 = xnn_fmadd_f32(vp1, vt1, vc2); - vp2 = xnn_fmadd_f32(vp2, vt2, vc2); - - vp0 = xnn_fmadd_f32(vp0, vt0, vc1); - vp1 = xnn_fmadd_f32(vp1, vt1, vc1); - vp2 = xnn_fmadd_f32(vp2, vt2, vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = xnn_mul_f32(vt0, vs0); - vt1 = xnn_mul_f32(vt1, vs1); - vt2 = xnn_mul_f32(vt2, vs2); - - HVX_Vector vf0 = xnn_fmadd_f32(vt0, vp0, vs0); - HVX_Vector vf1 = xnn_fmadd_f32(vt1, vp1, vs1); - HVX_Vector vf2 = xnn_fmadd_f32(vt2, vp2, vs2); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
- vf0 = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx0), vf0); - vf1 = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx1), vf1); - vf2 = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx2), vf2); - - xnn_storeu_f32(output, vf0); - xnn_storeu_f32(output + 32, vf1); - xnn_storeu_f32(output + 64, vf2); - output += 96; - - vacc0 = xnn_add_f32(vacc0, vf0); - vacc0 = xnn_add_f32(vacc0, vf1); - vacc0 = xnn_add_f32(vacc0, vf2); - } - vacc0 = xnn_add_f32(vacc0, vacc1); - - HVX_Vector vacc = vacc0; - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { - const HVX_Vector vi = xnn_loadu_f32(input); - input += 32; - - const HVX_Vector vx = xnn_sub_f32(vi, vi_max); - - HVX_Vector vn = xnn_fmadd_f32(vx, vlog2e, vmagic_bias); - - const HVX_Vector vs = Q6_Vw_vasl_VwR(vn, 23); - - vn = xnn_sub_f32(vn, vmagic_bias); - - HVX_Vector vt = xnn_fmadd_f32(vn, vminus_ln2_hi, vx); - vt = xnn_fmadd_f32(vn, vminus_ln2_lo, vt); - - HVX_Vector vp = xnn_fmadd_f32(vc5, vt, vc4); - vp = xnn_fmadd_f32(vp, vt, vc3); - vp = xnn_fmadd_f32(vp, vt, vc2); - vp = xnn_fmadd_f32(vp, vt, vc1); - - vt = xnn_mul_f32(vt, vs); - HVX_Vector vf = xnn_fmadd_f32(vt, vp, vs); - - vf = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx), vf); - - xnn_storeu_f32(output, vf); - output += 32; - - vacc = xnn_add_f32(vacc, vf); - } - - float vacc_lo = Q6_f32_vrsum_Vsf(vacc); - if XNN_UNLIKELY(batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch < 32 * sizeof(float)); - - const HVX_Vector vi = xnn_loadu_f32(input); - - const HVX_Vector vx = xnn_sub_f32(vi, vi_max); - - HVX_Vector vn = xnn_fmadd_f32(vx, vlog2e, vmagic_bias); - - const HVX_Vector vs = Q6_Vw_vasl_VwR(vn, 23); - - vn = xnn_sub_f32(vn, vmagic_bias); - - HVX_Vector vt = xnn_fmadd_f32(vn, vminus_ln2_hi, vx); - vt = xnn_fmadd_f32(vn, vminus_ln2_lo, vt); - - HVX_Vector vp = xnn_fmadd_f32(vc5, vt, vc4); - vp = xnn_fmadd_f32(vp, vt, vc3); - vp = xnn_fmadd_f32(vp, vt, vc2); - vp = xnn_fmadd_f32(vp, vt, vc1); - - vt = 
xnn_mul_f32(vt, vs); - HVX_Vector vf = xnn_fmadd_f32(vt, vp, vs); - - vf = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx), vf); - - Q6_V_vstu_variable(output, batch, vf); - - vf = Q6_V_vand_QV(Q6_Q_vsetq_R(batch), vf); - vacc_lo += Q6_f32_vrsum_Vsf(vf); - } - *sum = vacc_lo; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96-acc3.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96-acc3.c deleted file mode 100644 index 0199c0fe366..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96-acc3.c +++ /dev/null @@ -1,210 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/hvx-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - - -#include - -#include "xnnpack/simd/f32-hvx.h" -#include "xnnpack/raddstoreexpminusmax.h" - -void xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96_acc3( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const HVX_Vector vi_max = xnn_set1_f32(*max); - const HVX_Vector vlog2e = xnn_set1_f32(0x1.715476p+0f); - const HVX_Vector vmagic_bias = xnn_set1_f32(0x1.8000FEp23f); - const HVX_Vector vminus_ln2_hi = xnn_set1_f32(-0x1.62E400p-1f); - const HVX_Vector vminus_ln2_lo = xnn_set1_f32(-0x1.7F7D1Cp-20f); - const HVX_Vector vc5 = xnn_set1_f32(0x1.0F9F9Cp-7f); - const HVX_Vector vc4 = xnn_set1_f32(0x1.573A1Ap-5f); - const HVX_Vector vc3 = xnn_set1_f32(0x1.555A80p-3f); - const HVX_Vector vc2 = xnn_set1_f32(0x1.FFFDC6p-2f); - const HVX_Vector vc1 = 
xnn_set1_f32(0x1.FFFFF6p-1f); - const HVX_Vector vdenorm_cutoff = xnn_set1_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - HVX_Vector vacc0 = Q6_V_vzero(); - HVX_Vector vacc1 = vacc0; - HVX_Vector vacc2 = vacc0; - for (; batch >= 96 * sizeof(float); batch -= 96 * sizeof(float)) { - const HVX_Vector vi0 = xnn_loadu_f32(input); - const HVX_Vector vi1 = xnn_loadu_f32(input + 32); - const HVX_Vector vi2 = xnn_loadu_f32(input + 64); - input += 96; - - // Subtract maximum input x := i - i_max - const HVX_Vector vx0 = xnn_sub_f32(vi0, vi_max); - const HVX_Vector vx1 = xnn_sub_f32(vi1, vi_max); - const HVX_Vector vx2 = xnn_sub_f32(vi2, vi_max); - - // n := round(x / log(2)) - HVX_Vector vn0 = xnn_fmadd_f32(vx0, vlog2e, vmagic_bias); - HVX_Vector vn1 = xnn_fmadd_f32(vx1, vlog2e, vmagic_bias); - HVX_Vector vn2 = xnn_fmadd_f32(vx2, vlog2e, vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow. - const HVX_Vector vs0 = Q6_Vw_vasl_VwR(vn0, 23); - const HVX_Vector vs1 = Q6_Vw_vasl_VwR(vn1, 23); - const HVX_Vector vs2 = Q6_Vw_vasl_VwR(vn2, 23); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = xnn_sub_f32(vn0, vmagic_bias); - vn1 = xnn_sub_f32(vn1, vmagic_bias); - vn2 = xnn_sub_f32(vn2, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- HVX_Vector vt0 = xnn_fmadd_f32(vn0, vminus_ln2_hi, vx0); - HVX_Vector vt1 = xnn_fmadd_f32(vn1, vminus_ln2_hi, vx1); - HVX_Vector vt2 = xnn_fmadd_f32(vn2, vminus_ln2_hi, vx2); - - vt0 = xnn_fmadd_f32(vn0, vminus_ln2_lo, vt0); - vt1 = xnn_fmadd_f32(vn1, vminus_ln2_lo, vt1); - vt2 = xnn_fmadd_f32(vn2, vminus_ln2_lo, vt2); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - // p := c5 * t + c4; - // p = p * t + c3; - // p = p * t + c2; - // p = p * t + c1; - HVX_Vector vp0 = xnn_fmadd_f32(vc5, vt0, vc4); - HVX_Vector vp1 = xnn_fmadd_f32(vc5, vt1, vc4); - HVX_Vector vp2 = xnn_fmadd_f32(vc5, vt2, vc4); - - vp0 = xnn_fmadd_f32(vp0, vt0, vc3); - vp1 = xnn_fmadd_f32(vp1, vt1, vc3); - vp2 = xnn_fmadd_f32(vp2, vt2, vc3); - - vp0 = xnn_fmadd_f32(vp0, vt0, vc2); - vp1 = xnn_fmadd_f32(vp1, vt1, vc2); - vp2 = xnn_fmadd_f32(vp2, vt2, vc2); - - vp0 = xnn_fmadd_f32(vp0, vt0, vc1); - vp1 = xnn_fmadd_f32(vp1, vt1, vc1); - vp2 = xnn_fmadd_f32(vp2, vt2, vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = xnn_mul_f32(vt0, vs0); - vt1 = xnn_mul_f32(vt1, vs1); - vt2 = xnn_mul_f32(vt2, vs2); - - HVX_Vector vf0 = xnn_fmadd_f32(vt0, vp0, vs0); - HVX_Vector vf1 = xnn_fmadd_f32(vt1, vp1, vs1); - HVX_Vector vf2 = xnn_fmadd_f32(vt2, vp2, vs2); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
- vf0 = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx0), vf0); - vf1 = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx1), vf1); - vf2 = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx2), vf2); - - xnn_storeu_f32(output, vf0); - xnn_storeu_f32(output + 32, vf1); - xnn_storeu_f32(output + 64, vf2); - output += 96; - - vacc0 = xnn_add_f32(vacc0, vf0); - vacc2 = xnn_add_f32(vacc2, vf1); - vacc1 = xnn_add_f32(vacc1, vf2); - } - vacc0 = xnn_add_f32(vacc0, vacc1); - vacc0 = xnn_add_f32(vacc0, vacc2); - - HVX_Vector vacc = vacc0; - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { - const HVX_Vector vi = xnn_loadu_f32(input); - input += 32; - - const HVX_Vector vx = xnn_sub_f32(vi, vi_max); - - HVX_Vector vn = xnn_fmadd_f32(vx, vlog2e, vmagic_bias); - - const HVX_Vector vs = Q6_Vw_vasl_VwR(vn, 23); - - vn = xnn_sub_f32(vn, vmagic_bias); - - HVX_Vector vt = xnn_fmadd_f32(vn, vminus_ln2_hi, vx); - vt = xnn_fmadd_f32(vn, vminus_ln2_lo, vt); - - HVX_Vector vp = xnn_fmadd_f32(vc5, vt, vc4); - vp = xnn_fmadd_f32(vp, vt, vc3); - vp = xnn_fmadd_f32(vp, vt, vc2); - vp = xnn_fmadd_f32(vp, vt, vc1); - - vt = xnn_mul_f32(vt, vs); - HVX_Vector vf = xnn_fmadd_f32(vt, vp, vs); - - vf = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx), vf); - - xnn_storeu_f32(output, vf); - output += 32; - - vacc = xnn_add_f32(vacc, vf); - } - - float vacc_lo = Q6_f32_vrsum_Vsf(vacc); - if XNN_UNLIKELY(batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch < 32 * sizeof(float)); - - const HVX_Vector vi = xnn_loadu_f32(input); - - const HVX_Vector vx = xnn_sub_f32(vi, vi_max); - - HVX_Vector vn = xnn_fmadd_f32(vx, vlog2e, vmagic_bias); - - const HVX_Vector vs = Q6_Vw_vasl_VwR(vn, 23); - - vn = xnn_sub_f32(vn, vmagic_bias); - - HVX_Vector vt = xnn_fmadd_f32(vn, vminus_ln2_hi, vx); - vt = xnn_fmadd_f32(vn, vminus_ln2_lo, vt); - - HVX_Vector vp = xnn_fmadd_f32(vc5, vt, vc4); - vp = xnn_fmadd_f32(vp, vt, vc3); - vp = xnn_fmadd_f32(vp, vt, vc2); - vp = 
xnn_fmadd_f32(vp, vt, vc1); - - vt = xnn_mul_f32(vt, vs); - HVX_Vector vf = xnn_fmadd_f32(vt, vp, vs); - - vf = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx), vf); - - Q6_V_vstu_variable(output, batch, vf); - - vf = Q6_V_vand_QV(Q6_Q_vsetq_R(batch), vf); - vacc_lo += Q6_f32_vrsum_Vsf(vf); - } - *sum = vacc_lo; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96.c deleted file mode 100644 index 2367c2c2fc6..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u96.c +++ /dev/null @@ -1,206 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/hvx-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - - -#include - -#include "xnnpack/simd/f32-hvx.h" -#include "xnnpack/raddstoreexpminusmax.h" - -void xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const HVX_Vector vi_max = xnn_set1_f32(*max); - const HVX_Vector vlog2e = xnn_set1_f32(0x1.715476p+0f); - const HVX_Vector vmagic_bias = xnn_set1_f32(0x1.8000FEp23f); - const HVX_Vector vminus_ln2_hi = xnn_set1_f32(-0x1.62E400p-1f); - const HVX_Vector vminus_ln2_lo = xnn_set1_f32(-0x1.7F7D1Cp-20f); - const HVX_Vector vc5 = xnn_set1_f32(0x1.0F9F9Cp-7f); - const HVX_Vector vc4 = xnn_set1_f32(0x1.573A1Ap-5f); - const HVX_Vector vc3 = xnn_set1_f32(0x1.555A80p-3f); - const HVX_Vector vc2 = xnn_set1_f32(0x1.FFFDC6p-2f); - const HVX_Vector vc1 = 
xnn_set1_f32(0x1.FFFFF6p-1f); - const HVX_Vector vdenorm_cutoff = xnn_set1_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - HVX_Vector vacc0 = Q6_V_vzero(); - for (; batch >= 96 * sizeof(float); batch -= 96 * sizeof(float)) { - const HVX_Vector vi0 = xnn_loadu_f32(input); - const HVX_Vector vi1 = xnn_loadu_f32(input + 32); - const HVX_Vector vi2 = xnn_loadu_f32(input + 64); - input += 96; - - // Subtract maximum input x := i - i_max - const HVX_Vector vx0 = xnn_sub_f32(vi0, vi_max); - const HVX_Vector vx1 = xnn_sub_f32(vi1, vi_max); - const HVX_Vector vx2 = xnn_sub_f32(vi2, vi_max); - - // n := round(x / log(2)) - HVX_Vector vn0 = xnn_fmadd_f32(vx0, vlog2e, vmagic_bias); - HVX_Vector vn1 = xnn_fmadd_f32(vx1, vlog2e, vmagic_bias); - HVX_Vector vn2 = xnn_fmadd_f32(vx2, vlog2e, vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow. - const HVX_Vector vs0 = Q6_Vw_vasl_VwR(vn0, 23); - const HVX_Vector vs1 = Q6_Vw_vasl_VwR(vn1, 23); - const HVX_Vector vs2 = Q6_Vw_vasl_VwR(vn2, 23); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = xnn_sub_f32(vn0, vmagic_bias); - vn1 = xnn_sub_f32(vn1, vmagic_bias); - vn2 = xnn_sub_f32(vn2, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- HVX_Vector vt0 = xnn_fmadd_f32(vn0, vminus_ln2_hi, vx0); - HVX_Vector vt1 = xnn_fmadd_f32(vn1, vminus_ln2_hi, vx1); - HVX_Vector vt2 = xnn_fmadd_f32(vn2, vminus_ln2_hi, vx2); - - vt0 = xnn_fmadd_f32(vn0, vminus_ln2_lo, vt0); - vt1 = xnn_fmadd_f32(vn1, vminus_ln2_lo, vt1); - vt2 = xnn_fmadd_f32(vn2, vminus_ln2_lo, vt2); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - // p := c5 * t + c4; - // p = p * t + c3; - // p = p * t + c2; - // p = p * t + c1; - HVX_Vector vp0 = xnn_fmadd_f32(vc5, vt0, vc4); - HVX_Vector vp1 = xnn_fmadd_f32(vc5, vt1, vc4); - HVX_Vector vp2 = xnn_fmadd_f32(vc5, vt2, vc4); - - vp0 = xnn_fmadd_f32(vp0, vt0, vc3); - vp1 = xnn_fmadd_f32(vp1, vt1, vc3); - vp2 = xnn_fmadd_f32(vp2, vt2, vc3); - - vp0 = xnn_fmadd_f32(vp0, vt0, vc2); - vp1 = xnn_fmadd_f32(vp1, vt1, vc2); - vp2 = xnn_fmadd_f32(vp2, vt2, vc2); - - vp0 = xnn_fmadd_f32(vp0, vt0, vc1); - vp1 = xnn_fmadd_f32(vp1, vt1, vc1); - vp2 = xnn_fmadd_f32(vp2, vt2, vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = xnn_mul_f32(vt0, vs0); - vt1 = xnn_mul_f32(vt1, vs1); - vt2 = xnn_mul_f32(vt2, vs2); - - HVX_Vector vf0 = xnn_fmadd_f32(vt0, vp0, vs0); - HVX_Vector vf1 = xnn_fmadd_f32(vt1, vp1, vs1); - HVX_Vector vf2 = xnn_fmadd_f32(vt2, vp2, vs2); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
- vf0 = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx0), vf0); - vf1 = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx1), vf1); - vf2 = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx2), vf2); - - xnn_storeu_f32(output, vf0); - xnn_storeu_f32(output + 32, vf1); - xnn_storeu_f32(output + 64, vf2); - output += 96; - - vacc0 = xnn_add_f32(vacc0, vf0); - vacc0 = xnn_add_f32(vacc0, vf1); - vacc0 = xnn_add_f32(vacc0, vf2); - } - - HVX_Vector vacc = vacc0; - for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { - const HVX_Vector vi = xnn_loadu_f32(input); - input += 32; - - const HVX_Vector vx = xnn_sub_f32(vi, vi_max); - - HVX_Vector vn = xnn_fmadd_f32(vx, vlog2e, vmagic_bias); - - const HVX_Vector vs = Q6_Vw_vasl_VwR(vn, 23); - - vn = xnn_sub_f32(vn, vmagic_bias); - - HVX_Vector vt = xnn_fmadd_f32(vn, vminus_ln2_hi, vx); - vt = xnn_fmadd_f32(vn, vminus_ln2_lo, vt); - - HVX_Vector vp = xnn_fmadd_f32(vc5, vt, vc4); - vp = xnn_fmadd_f32(vp, vt, vc3); - vp = xnn_fmadd_f32(vp, vt, vc2); - vp = xnn_fmadd_f32(vp, vt, vc1); - - vt = xnn_mul_f32(vt, vs); - HVX_Vector vf = xnn_fmadd_f32(vt, vp, vs); - - vf = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx), vf); - - xnn_storeu_f32(output, vf); - output += 32; - - vacc = xnn_add_f32(vacc, vf); - } - - float vacc_lo = Q6_f32_vrsum_Vsf(vacc); - if XNN_UNLIKELY(batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch < 32 * sizeof(float)); - - const HVX_Vector vi = xnn_loadu_f32(input); - - const HVX_Vector vx = xnn_sub_f32(vi, vi_max); - - HVX_Vector vn = xnn_fmadd_f32(vx, vlog2e, vmagic_bias); - - const HVX_Vector vs = Q6_Vw_vasl_VwR(vn, 23); - - vn = xnn_sub_f32(vn, vmagic_bias); - - HVX_Vector vt = xnn_fmadd_f32(vn, vminus_ln2_hi, vx); - vt = xnn_fmadd_f32(vn, vminus_ln2_lo, vt); - - HVX_Vector vp = xnn_fmadd_f32(vc5, vt, vc4); - vp = xnn_fmadd_f32(vp, vt, vc3); - vp = xnn_fmadd_f32(vp, vt, vc2); - vp = xnn_fmadd_f32(vp, vt, vc1); - - vt = xnn_mul_f32(vt, vs); - HVX_Vector vf = 
xnn_fmadd_f32(vt, vp, vs); - - vf = Q6_V_vand_QnV(Q6_Q_vcmp_gt_VsfVsf(vdenorm_cutoff, vx), vf); - - Q6_V_vstu_variable(output, batch, vf); - - vf = Q6_V_vand_QV(Q6_Q_vsetq_R(batch), vf); - vacc_lo += Q6_f32_vrsum_Vsf(vf); - } - *sum = vacc_lo; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12-acc2.c deleted file mode 100644 index de07a792488..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12-acc2.c +++ /dev/null @@ -1,243 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64]; - -void xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p17f); - const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFF0Ap-2f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vindex_mask); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const 
float32x4_t vminus_ln2_hi = vmovq_n_f32(-0x1.62E400p-1f); - const float32x4_t vminus_ln2_lo = vmovq_n_f32(-0x1.7F7D1Cp-20f); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - float32x4_t vacc1 = vmovq_n_f32(0.0f); - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - - float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vx89AB, vlog2e); - - const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); - const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); - const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); - const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); - const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); - const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); - const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask)); - const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0); - const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1); - - float32x2_t vl01 = 
vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]); - float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]); - float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx45]); - float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx67]); - float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx89]); - float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxAB]); - - vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); - vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); - const float32x4_t vl0123 = vcombine_f32(vl01, vl23); - vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); - vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); - const float32x4_t vl4567 = vcombine_f32(vl45, vl67); - vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx89 >> 32)], vl89, 1); - vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxAB >> 32)], vlAB, 1); - const float32x4_t vl89AB = vcombine_f32(vl89, vlAB); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - - float32x4_t vt0123 = vmlaq_f32(vx0123, vn0123, vminus_ln2_hi); - float32x4_t vt4567 = vmlaq_f32(vx4567, vn4567, vminus_ln2_hi); - float32x4_t vt89AB = vmlaq_f32(vx89AB, vn89AB, vminus_ln2_hi); - - vt0123 = vmlaq_f32(vt0123, vn0123, vminus_ln2_lo); - vt4567 = vmlaq_f32(vt4567, vn4567, vminus_ln2_lo); - vt89AB = vmlaq_f32(vt89AB, vn89AB, vminus_ln2_lo); - - float32x4_t vp0123 = vmulq_f32(vt0123, vc2); 
- float32x4_t vp4567 = vmulq_f32(vt4567, vc2); - float32x4_t vp89AB = vmulq_f32(vt89AB, vc2); - - vp0123 = vmlaq_f32(vt0123, vt0123, vp0123); - vp4567 = vmlaq_f32(vt4567, vt4567, vp4567); - vp89AB = vmlaq_f32(vt89AB, vt89AB, vp89AB); - - float32x4_t vf0123 = vmlaq_f32(vs0123, vs0123, vp0123); - float32x4_t vf4567 = vmlaq_f32(vs4567, vs4567, vp4567); - float32x4_t vf89AB = vmlaq_f32(vs89AB, vs89AB, vp89AB); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - vacc0 = vaddq_f32(vacc0, vf89AB); - } - vacc0 = vaddq_f32(vacc0, vacc1); - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - 
const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vmlaq_f32(vt, vt, vp); - - float32x4_t vf = vmlaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vmlaq_f32(vt, vt, vp); - - float32x4_t vf = vmlaq_f32(vs, vs, vp); - - vf = 
vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12-acc3.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12-acc3.c deleted file mode 100644 index 7dff5519a8c..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12-acc3.c +++ /dev/null @@ -1,245 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64]; - -void xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12_acc3( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p17f); - const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFF0Ap-2f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vindex_mask); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2_hi = vmovq_n_f32(-0x1.62E400p-1f); - const float32x4_t vminus_ln2_lo = vmovq_n_f32(-0x1.7F7D1Cp-20f); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - float32x4_t vacc1 = vmovq_n_f32(0.0f); - float32x4_t vacc2 = vmovq_n_f32(0.0f); - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - - float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vx4567, vlog2e); 
- float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vx89AB, vlog2e); - - const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); - const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); - const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); - const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); - const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); - const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); - const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask)); - const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0); - const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1); - - float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]); - float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]); - float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx45]); - float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx67]); - float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx89]); - float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxAB]); - - vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); - vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); - const float32x4_t vl0123 = vcombine_f32(vl01, vl23); - vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); - vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); - const float32x4_t vl4567 = 
vcombine_f32(vl45, vl67); - vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx89 >> 32)], vl89, 1); - vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxAB >> 32)], vlAB, 1); - const float32x4_t vl89AB = vcombine_f32(vl89, vlAB); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - - float32x4_t vt0123 = vmlaq_f32(vx0123, vn0123, vminus_ln2_hi); - float32x4_t vt4567 = vmlaq_f32(vx4567, vn4567, vminus_ln2_hi); - float32x4_t vt89AB = vmlaq_f32(vx89AB, vn89AB, vminus_ln2_hi); - - vt0123 = vmlaq_f32(vt0123, vn0123, vminus_ln2_lo); - vt4567 = vmlaq_f32(vt4567, vn4567, vminus_ln2_lo); - vt89AB = vmlaq_f32(vt89AB, vn89AB, vminus_ln2_lo); - - float32x4_t vp0123 = vmulq_f32(vt0123, vc2); - float32x4_t vp4567 = vmulq_f32(vt4567, vc2); - float32x4_t vp89AB = vmulq_f32(vt89AB, vc2); - - vp0123 = vmlaq_f32(vt0123, vt0123, vp0123); - vp4567 = vmlaq_f32(vt4567, vt4567, vp4567); - vp89AB = vmlaq_f32(vt89AB, vt89AB, vp89AB); - - float32x4_t vf0123 = vmlaq_f32(vs0123, vs0123, vp0123); - float32x4_t vf4567 = vmlaq_f32(vs4567, vs4567, vp4567); - float32x4_t vf89AB = vmlaq_f32(vs89AB, vs89AB, vp89AB); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - - vacc0 = 
vaddq_f32(vacc0, vf0123); - vacc1 = vaddq_f32(vacc1, vf4567); - vacc2 = vaddq_f32(vacc2, vf89AB); - } - vacc0 = vaddq_f32(vacc0, vacc1); - vacc0 = vaddq_f32(vacc0, vacc2); - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vmlaq_f32(vt, vt, vp); - - float32x4_t vf = vmlaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, 
vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vmlaq_f32(vt, vt, vp); - - float32x4_t vf = vmlaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12.c 
b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12.c deleted file mode 100644 index 4da0edd8726..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u12.c +++ /dev/null @@ -1,241 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64]; - -void xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p17f); - const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFF0Ap-2f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vindex_mask); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2_hi = vmovq_n_f32(-0x1.62E400p-1f); - const float32x4_t vminus_ln2_lo = vmovq_n_f32(-0x1.7F7D1Cp-20f); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - 
const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - - float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vx89AB, vlog2e); - - const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); - const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); - const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); - const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); - const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); - const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); - const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask)); - const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0); - const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1); - - float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]); - float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]); - float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx45]); - float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx67]); - float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx89]); - float32x2_t vlAB = 
vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxAB]); - - vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); - vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); - const float32x4_t vl0123 = vcombine_f32(vl01, vl23); - vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); - vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); - const float32x4_t vl4567 = vcombine_f32(vl45, vl67); - vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx89 >> 32)], vl89, 1); - vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxAB >> 32)], vlAB, 1); - const float32x4_t vl89AB = vcombine_f32(vl89, vlAB); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - - float32x4_t vt0123 = vmlaq_f32(vx0123, vn0123, vminus_ln2_hi); - float32x4_t vt4567 = vmlaq_f32(vx4567, vn4567, vminus_ln2_hi); - float32x4_t vt89AB = vmlaq_f32(vx89AB, vn89AB, vminus_ln2_hi); - - vt0123 = vmlaq_f32(vt0123, vn0123, vminus_ln2_lo); - vt4567 = vmlaq_f32(vt4567, vn4567, vminus_ln2_lo); - vt89AB = vmlaq_f32(vt89AB, vn89AB, vminus_ln2_lo); - - float32x4_t vp0123 = vmulq_f32(vt0123, vc2); - float32x4_t vp4567 = vmulq_f32(vt4567, vc2); - float32x4_t vp89AB = vmulq_f32(vt89AB, vc2); - - vp0123 = vmlaq_f32(vt0123, vt0123, vp0123); - vp4567 = vmlaq_f32(vt4567, vt4567, vp4567); - vp89AB = vmlaq_f32(vt89AB, vt89AB, vp89AB); - - float32x4_t vf0123 = vmlaq_f32(vs0123, vs0123, vp0123); - float32x4_t vf4567 = vmlaq_f32(vs4567, vs4567, vp4567); - float32x4_t vf89AB = vmlaq_f32(vs89AB, vs89AB, 
vp89AB); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - vacc0 = vaddq_f32(vacc0, vf89AB); - } - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vmlaq_f32(vt, vt, vp); - - float32x4_t vf = vmlaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - 
- vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vmlaq_f32(vt, vt, vp); - - float32x4_t vf = vmlaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo 
+= vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u16.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u16.c deleted file mode 100644 index 471d4c777e4..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u16.c +++ /dev/null @@ -1,263 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64]; - -void xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p17f); - const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFF0Ap-2f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vindex_mask); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t 
vminus_ln2_hi = vmovq_n_f32(-0x1.62E400p-1f); - const float32x4_t vminus_ln2_lo = vmovq_n_f32(-0x1.7F7D1Cp-20f); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - const float32x4_t viCDEF = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - const float32x4_t vxCDEF = vsubq_f32(viCDEF, vi_max); - - float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vx89AB, vlog2e); - float32x4_t vnCDEF = vmlaq_f32(vmagic_bias, vxCDEF, vlog2e); - - const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); - const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); - const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); - const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); - const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); - const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); - const uint64x2_t vidx89AB = 
vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask)); - const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0); - const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1); - const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask)); - const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0); - const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1); - - float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]); - float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]); - float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx45]); - float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx67]); - float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx89]); - float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxAB]); - float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxCD]); - float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxEF]); - - vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); - vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); - const float32x4_t vl0123 = vcombine_f32(vl01, vl23); - vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); - vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); - const float32x4_t vl4567 = vcombine_f32(vl45, vl67); - vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx89 >> 32)], vl89, 1); - vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxAB >> 32)], vlAB, 1); - const float32x4_t vl89AB = vcombine_f32(vl89, vlAB); - vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxCD >> 32)], vlCD, 1); - vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxEF >> 32)], vlEF, 1); - const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF); - - const float32x4_t 
vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB)); - const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); - - float32x4_t vt0123 = vmlaq_f32(vx0123, vn0123, vminus_ln2_hi); - float32x4_t vt4567 = vmlaq_f32(vx4567, vn4567, vminus_ln2_hi); - float32x4_t vt89AB = vmlaq_f32(vx89AB, vn89AB, vminus_ln2_hi); - float32x4_t vtCDEF = vmlaq_f32(vxCDEF, vnCDEF, vminus_ln2_hi); - - vt0123 = vmlaq_f32(vt0123, vn0123, vminus_ln2_lo); - vt4567 = vmlaq_f32(vt4567, vn4567, vminus_ln2_lo); - vt89AB = vmlaq_f32(vt89AB, vn89AB, vminus_ln2_lo); - vtCDEF = vmlaq_f32(vtCDEF, vnCDEF, vminus_ln2_lo); - - float32x4_t vp0123 = vmulq_f32(vt0123, vc2); - float32x4_t vp4567 = vmulq_f32(vt4567, vc2); - float32x4_t vp89AB = vmulq_f32(vt89AB, vc2); - float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc2); - - vp0123 = vmlaq_f32(vt0123, vt0123, vp0123); - vp4567 = vmlaq_f32(vt4567, vt4567, vp4567); - vp89AB = vmlaq_f32(vt89AB, vt89AB, vp89AB); - vpCDEF = vmlaq_f32(vtCDEF, vtCDEF, vpCDEF); - - float32x4_t vf0123 = vmlaq_f32(vs0123, vs0123, vp0123); - float32x4_t vf4567 = vmlaq_f32(vs4567, vs4567, vp4567); - float32x4_t vf89AB = vmlaq_f32(vs89AB, vs89AB, vp89AB); - float32x4_t vfCDEF = vmlaq_f32(vsCDEF, vsCDEF, vpCDEF); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - vfCDEF = 
vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcltq_f32(vxCDEF, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - vst1q_f32(output, vfCDEF); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - vacc0 = vaddq_f32(vacc0, vf89AB); - vacc0 = vaddq_f32(vacc0, vfCDEF); - } - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vmlaq_f32(vt, vt, vp); - - float32x4_t vf = vmlaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = 
vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vmlaq_f32(vt, vt, vp); - - float32x4_t vf = vmlaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if 
XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20-acc2.c deleted file mode 100644 index 5c140a7a513..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20-acc2.c +++ /dev/null @@ -1,287 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64]; - -void xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p17f); - const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFF0Ap-2f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vindex_mask); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2_hi = vmovq_n_f32(-0x1.62E400p-1f); - const float32x4_t vminus_ln2_lo = vmovq_n_f32(-0x1.7F7D1Cp-20f); - 
XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - float32x4_t vacc1 = vmovq_n_f32(0.0f); - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - const float32x4_t viCDEF = vld1q_f32(input); input += 4; - const float32x4_t viGHIJ = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - const float32x4_t vxCDEF = vsubq_f32(viCDEF, vi_max); - const float32x4_t vxGHIJ = vsubq_f32(viGHIJ, vi_max); - - float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vx89AB, vlog2e); - float32x4_t vnCDEF = vmlaq_f32(vmagic_bias, vxCDEF, vlog2e); - float32x4_t vnGHIJ = vmlaq_f32(vmagic_bias, vxGHIJ, vlog2e); - - const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t veGHIJ = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnGHIJ), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); - const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); - const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); - const uint64x2_t vidx4567 = 
vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); - const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); - const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); - const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask)); - const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0); - const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1); - const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask)); - const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0); - const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1); - const uint64x2_t vidxGHIJ = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnGHIJ), vindex_mask)); - const uint64_t vidxGH = vgetq_lane_u64(vidxGHIJ, 0); - const uint64_t vidxIJ = vgetq_lane_u64(vidxGHIJ, 1); - - float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]); - float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]); - float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx45]); - float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx67]); - float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx89]); - float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxAB]); - float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxCD]); - float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxEF]); - float32x2_t vlGH = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxGH]); - float32x2_t vlIJ = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxIJ]); - - vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); - vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); - const float32x4_t vl0123 = vcombine_f32(vl01, vl23); - vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); - vl67 = 
vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); - const float32x4_t vl4567 = vcombine_f32(vl45, vl67); - vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx89 >> 32)], vl89, 1); - vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxAB >> 32)], vlAB, 1); - const float32x4_t vl89AB = vcombine_f32(vl89, vlAB); - vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxCD >> 32)], vlCD, 1); - vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxEF >> 32)], vlEF, 1); - const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF); - vlGH = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxGH >> 32)], vlGH, 1); - vlIJ = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxIJ >> 32)], vlIJ, 1); - const float32x4_t vlGHIJ = vcombine_f32(vlGH, vlIJ); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB)); - const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF)); - const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlGHIJ), veGHIJ)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); - vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias); - - float32x4_t vt0123 = vmlaq_f32(vx0123, vn0123, vminus_ln2_hi); - float32x4_t vt4567 = vmlaq_f32(vx4567, vn4567, vminus_ln2_hi); - float32x4_t vt89AB = vmlaq_f32(vx89AB, vn89AB, vminus_ln2_hi); - float32x4_t vtCDEF = vmlaq_f32(vxCDEF, vnCDEF, vminus_ln2_hi); - float32x4_t vtGHIJ = vmlaq_f32(vxGHIJ, vnGHIJ, vminus_ln2_hi); - - vt0123 = vmlaq_f32(vt0123, vn0123, vminus_ln2_lo); - vt4567 = vmlaq_f32(vt4567, vn4567, vminus_ln2_lo); - vt89AB = 
vmlaq_f32(vt89AB, vn89AB, vminus_ln2_lo); - vtCDEF = vmlaq_f32(vtCDEF, vnCDEF, vminus_ln2_lo); - vtGHIJ = vmlaq_f32(vtGHIJ, vnGHIJ, vminus_ln2_lo); - - float32x4_t vp0123 = vmulq_f32(vt0123, vc2); - float32x4_t vp4567 = vmulq_f32(vt4567, vc2); - float32x4_t vp89AB = vmulq_f32(vt89AB, vc2); - float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc2); - float32x4_t vpGHIJ = vmulq_f32(vtGHIJ, vc2); - - vp0123 = vmlaq_f32(vt0123, vt0123, vp0123); - vp4567 = vmlaq_f32(vt4567, vt4567, vp4567); - vp89AB = vmlaq_f32(vt89AB, vt89AB, vp89AB); - vpCDEF = vmlaq_f32(vtCDEF, vtCDEF, vpCDEF); - vpGHIJ = vmlaq_f32(vtGHIJ, vtGHIJ, vpGHIJ); - - float32x4_t vf0123 = vmlaq_f32(vs0123, vs0123, vp0123); - float32x4_t vf4567 = vmlaq_f32(vs4567, vs4567, vp4567); - float32x4_t vf89AB = vmlaq_f32(vs89AB, vs89AB, vp89AB); - float32x4_t vfCDEF = vmlaq_f32(vsCDEF, vsCDEF, vpCDEF); - float32x4_t vfGHIJ = vmlaq_f32(vsGHIJ, vsGHIJ, vpGHIJ); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcltq_f32(vxCDEF, vdenorm_cutoff))); - vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcltq_f32(vxGHIJ, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - vst1q_f32(output, vfCDEF); output += 4; - vst1q_f32(output, vfGHIJ); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - vacc0 = vaddq_f32(vacc0, vf89AB); - vacc0 = vaddq_f32(vacc0, vfCDEF); - vacc0 = vaddq_f32(vacc0, vfGHIJ); - } - vacc0 = vaddq_f32(vacc0, vacc1); - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - 
const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vmlaq_f32(vt, vt, vp); - - float32x4_t vf = vmlaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), 
vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vmlaq_f32(vt, vt, vp); - - float32x4_t vf = vmlaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20-acc5.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20-acc5.c deleted file mode 100644 index 79dca487fc6..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20-acc5.c +++ /dev/null @@ -1,293 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64]; - -void xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20_acc5( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p17f); - const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFF0Ap-2f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vindex_mask); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2_hi = vmovq_n_f32(-0x1.62E400p-1f); - const float32x4_t vminus_ln2_lo = vmovq_n_f32(-0x1.7F7D1Cp-20f); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - float32x4_t vacc1 = vmovq_n_f32(0.0f); - float32x4_t vacc2 = vmovq_n_f32(0.0f); - float32x4_t vacc3 = vmovq_n_f32(0.0f); - float32x4_t vacc4 = vmovq_n_f32(0.0f); - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const 
float32x4_t vi89AB = vld1q_f32(input); input += 4; - const float32x4_t viCDEF = vld1q_f32(input); input += 4; - const float32x4_t viGHIJ = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - const float32x4_t vxCDEF = vsubq_f32(viCDEF, vi_max); - const float32x4_t vxGHIJ = vsubq_f32(viGHIJ, vi_max); - - float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vx89AB, vlog2e); - float32x4_t vnCDEF = vmlaq_f32(vmagic_bias, vxCDEF, vlog2e); - float32x4_t vnGHIJ = vmlaq_f32(vmagic_bias, vxGHIJ, vlog2e); - - const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t veGHIJ = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnGHIJ), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); - const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); - const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); - const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); - const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); - const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); - const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask)); - const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0); - const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1); - 
const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask)); - const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0); - const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1); - const uint64x2_t vidxGHIJ = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnGHIJ), vindex_mask)); - const uint64_t vidxGH = vgetq_lane_u64(vidxGHIJ, 0); - const uint64_t vidxIJ = vgetq_lane_u64(vidxGHIJ, 1); - - float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]); - float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]); - float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx45]); - float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx67]); - float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx89]); - float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxAB]); - float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxCD]); - float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxEF]); - float32x2_t vlGH = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxGH]); - float32x2_t vlIJ = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxIJ]); - - vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); - vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); - const float32x4_t vl0123 = vcombine_f32(vl01, vl23); - vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); - vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); - const float32x4_t vl4567 = vcombine_f32(vl45, vl67); - vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx89 >> 32)], vl89, 1); - vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxAB >> 32)], vlAB, 1); - const float32x4_t vl89AB = vcombine_f32(vl89, vlAB); - vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) 
(vidxCD >> 32)], vlCD, 1); - vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxEF >> 32)], vlEF, 1); - const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF); - vlGH = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxGH >> 32)], vlGH, 1); - vlIJ = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxIJ >> 32)], vlIJ, 1); - const float32x4_t vlGHIJ = vcombine_f32(vlGH, vlIJ); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB)); - const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF)); - const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlGHIJ), veGHIJ)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); - vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias); - - float32x4_t vt0123 = vmlaq_f32(vx0123, vn0123, vminus_ln2_hi); - float32x4_t vt4567 = vmlaq_f32(vx4567, vn4567, vminus_ln2_hi); - float32x4_t vt89AB = vmlaq_f32(vx89AB, vn89AB, vminus_ln2_hi); - float32x4_t vtCDEF = vmlaq_f32(vxCDEF, vnCDEF, vminus_ln2_hi); - float32x4_t vtGHIJ = vmlaq_f32(vxGHIJ, vnGHIJ, vminus_ln2_hi); - - vt0123 = vmlaq_f32(vt0123, vn0123, vminus_ln2_lo); - vt4567 = vmlaq_f32(vt4567, vn4567, vminus_ln2_lo); - vt89AB = vmlaq_f32(vt89AB, vn89AB, vminus_ln2_lo); - vtCDEF = vmlaq_f32(vtCDEF, vnCDEF, vminus_ln2_lo); - vtGHIJ = vmlaq_f32(vtGHIJ, vnGHIJ, vminus_ln2_lo); - - float32x4_t vp0123 = vmulq_f32(vt0123, vc2); - float32x4_t vp4567 = vmulq_f32(vt4567, vc2); - float32x4_t vp89AB = vmulq_f32(vt89AB, vc2); - float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc2); - float32x4_t vpGHIJ = vmulq_f32(vtGHIJ, vc2); - - vp0123 = vmlaq_f32(vt0123, vt0123, 
vp0123); - vp4567 = vmlaq_f32(vt4567, vt4567, vp4567); - vp89AB = vmlaq_f32(vt89AB, vt89AB, vp89AB); - vpCDEF = vmlaq_f32(vtCDEF, vtCDEF, vpCDEF); - vpGHIJ = vmlaq_f32(vtGHIJ, vtGHIJ, vpGHIJ); - - float32x4_t vf0123 = vmlaq_f32(vs0123, vs0123, vp0123); - float32x4_t vf4567 = vmlaq_f32(vs4567, vs4567, vp4567); - float32x4_t vf89AB = vmlaq_f32(vs89AB, vs89AB, vp89AB); - float32x4_t vfCDEF = vmlaq_f32(vsCDEF, vsCDEF, vpCDEF); - float32x4_t vfGHIJ = vmlaq_f32(vsGHIJ, vsGHIJ, vpGHIJ); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcltq_f32(vxCDEF, vdenorm_cutoff))); - vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcltq_f32(vxGHIJ, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - vst1q_f32(output, vfCDEF); output += 4; - vst1q_f32(output, vfGHIJ); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc4 = vaddq_f32(vacc4, vf4567); - vacc3 = vaddq_f32(vacc3, vf89AB); - vacc2 = vaddq_f32(vacc2, vfCDEF); - vacc1 = vaddq_f32(vacc1, vfGHIJ); - } - vacc0 = vaddq_f32(vacc0, vacc1); - vacc2 = vaddq_f32(vacc2, vacc3); - vacc0 = vaddq_f32(vacc0, vacc2); - vacc0 = vaddq_f32(vacc0, vacc4); - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = 
vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vmlaq_f32(vt, vt, vp); - - float32x4_t vf = vmlaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = 
vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vmlaq_f32(vt, vt, vp); - - float32x4_t vf = vmlaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20.c deleted file mode 100644 index 8eac0905fba..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u20.c +++ /dev/null @@ -1,285 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64]; - -void xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p17f); - const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFF0Ap-2f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vindex_mask); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2_hi = vmovq_n_f32(-0x1.62E400p-1f); - const float32x4_t vminus_ln2_lo = vmovq_n_f32(-0x1.7F7D1Cp-20f); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - const float32x4_t viCDEF = vld1q_f32(input); input += 4; - const float32x4_t viGHIJ = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - const float32x4_t vxCDEF = vsubq_f32(viCDEF, vi_max); - const float32x4_t vxGHIJ = 
vsubq_f32(viGHIJ, vi_max); - - float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vx89AB, vlog2e); - float32x4_t vnCDEF = vmlaq_f32(vmagic_bias, vxCDEF, vlog2e); - float32x4_t vnGHIJ = vmlaq_f32(vmagic_bias, vxGHIJ, vlog2e); - - const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t veGHIJ = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnGHIJ), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); - const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); - const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); - const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); - const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); - const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); - const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask)); - const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0); - const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1); - const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask)); - const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0); - const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1); - const uint64x2_t vidxGHIJ = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnGHIJ), vindex_mask)); - const uint64_t vidxGH = vgetq_lane_u64(vidxGHIJ, 0); - const uint64_t vidxIJ = 
vgetq_lane_u64(vidxGHIJ, 1); - - float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]); - float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]); - float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx45]); - float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx67]); - float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx89]); - float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxAB]); - float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxCD]); - float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxEF]); - float32x2_t vlGH = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxGH]); - float32x2_t vlIJ = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxIJ]); - - vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); - vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); - const float32x4_t vl0123 = vcombine_f32(vl01, vl23); - vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); - vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); - const float32x4_t vl4567 = vcombine_f32(vl45, vl67); - vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx89 >> 32)], vl89, 1); - vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxAB >> 32)], vlAB, 1); - const float32x4_t vl89AB = vcombine_f32(vl89, vlAB); - vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxCD >> 32)], vlCD, 1); - vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxEF >> 32)], vlEF, 1); - const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF); - vlGH = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxGH >> 32)], vlGH, 1); - vlIJ = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxIJ >> 32)], vlIJ, 1); - const float32x4_t vlGHIJ = vcombine_f32(vlGH, vlIJ); - - const 
float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB)); - const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF)); - const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlGHIJ), veGHIJ)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); - vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias); - - float32x4_t vt0123 = vmlaq_f32(vx0123, vn0123, vminus_ln2_hi); - float32x4_t vt4567 = vmlaq_f32(vx4567, vn4567, vminus_ln2_hi); - float32x4_t vt89AB = vmlaq_f32(vx89AB, vn89AB, vminus_ln2_hi); - float32x4_t vtCDEF = vmlaq_f32(vxCDEF, vnCDEF, vminus_ln2_hi); - float32x4_t vtGHIJ = vmlaq_f32(vxGHIJ, vnGHIJ, vminus_ln2_hi); - - vt0123 = vmlaq_f32(vt0123, vn0123, vminus_ln2_lo); - vt4567 = vmlaq_f32(vt4567, vn4567, vminus_ln2_lo); - vt89AB = vmlaq_f32(vt89AB, vn89AB, vminus_ln2_lo); - vtCDEF = vmlaq_f32(vtCDEF, vnCDEF, vminus_ln2_lo); - vtGHIJ = vmlaq_f32(vtGHIJ, vnGHIJ, vminus_ln2_lo); - - float32x4_t vp0123 = vmulq_f32(vt0123, vc2); - float32x4_t vp4567 = vmulq_f32(vt4567, vc2); - float32x4_t vp89AB = vmulq_f32(vt89AB, vc2); - float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc2); - float32x4_t vpGHIJ = vmulq_f32(vtGHIJ, vc2); - - vp0123 = vmlaq_f32(vt0123, vt0123, vp0123); - vp4567 = vmlaq_f32(vt4567, vt4567, vp4567); - vp89AB = vmlaq_f32(vt89AB, vt89AB, vp89AB); - vpCDEF = vmlaq_f32(vtCDEF, vtCDEF, vpCDEF); - vpGHIJ = vmlaq_f32(vtGHIJ, vtGHIJ, vpGHIJ); - - float32x4_t vf0123 = vmlaq_f32(vs0123, vs0123, vp0123); - float32x4_t vf4567 = vmlaq_f32(vs4567, vs4567, vp4567); - float32x4_t vf89AB = vmlaq_f32(vs89AB, vs89AB, vp89AB); - float32x4_t vfCDEF = vmlaq_f32(vsCDEF, 
vsCDEF, vpCDEF); - float32x4_t vfGHIJ = vmlaq_f32(vsGHIJ, vsGHIJ, vpGHIJ); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcltq_f32(vxCDEF, vdenorm_cutoff))); - vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcltq_f32(vxGHIJ, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - vst1q_f32(output, vfCDEF); output += 4; - vst1q_f32(output, vfGHIJ); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - vacc0 = vaddq_f32(vacc0, vf89AB); - vacc0 = vaddq_f32(vacc0, vfCDEF); - vacc0 = vaddq_f32(vacc0, vfGHIJ); - } - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, 
vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vmlaq_f32(vt, vt, vp); - - float32x4_t vf = vmlaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vmlaq_f32(vt, vt, vp); - - float32x4_t vf = vmlaq_f32(vs, vs, vp); - - vf = 
vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u8.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u8.c deleted file mode 100644 index aa8c06a1f9e..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-u8.c +++ /dev/null @@ -1,219 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64]; - -void xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u8( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p17f); - const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFF0Ap-2f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vindex_mask); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2_hi = vmovq_n_f32(-0x1.62E400p-1f); - const float32x4_t vminus_ln2_lo = vmovq_n_f32(-0x1.7F7D1Cp-20f); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - - float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vx4567, vlog2e); - - const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), 
vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); - const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); - const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); - const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); - const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); - const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); - - float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]); - float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]); - float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx45]); - float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx67]); - - vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); - vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); - const float32x4_t vl0123 = vcombine_f32(vl01, vl23); - vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); - vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); - const float32x4_t vl4567 = vcombine_f32(vl45, vl67); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - - float32x4_t vt0123 = vmlaq_f32(vx0123, vn0123, vminus_ln2_hi); - float32x4_t vt4567 = vmlaq_f32(vx4567, vn4567, vminus_ln2_hi); - - vt0123 = vmlaq_f32(vt0123, vn0123, vminus_ln2_lo); - vt4567 = vmlaq_f32(vt4567, vn4567, vminus_ln2_lo); - - float32x4_t vp0123 = vmulq_f32(vt0123, vc2); - float32x4_t vp4567 = vmulq_f32(vt4567, vc2); - - vp0123 = vmlaq_f32(vt0123, vt0123, vp0123); - vp4567 = vmlaq_f32(vt4567, vt4567, vp4567); - - float32x4_t 
vf0123 = vmlaq_f32(vs0123, vs0123, vp0123); - float32x4_t vf4567 = vmlaq_f32(vs4567, vs4567, vp4567); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - } - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vmlaq_f32(vt, vt, vp); - - float32x4_t vf = vmlaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - 
float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vmlaq_f32(vt, vt, vp); - - float32x4_t vf = vmlaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, 
vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12-acc2.c deleted file mode 100644 index c0d702f7cd4..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12-acc2.c +++ /dev/null @@ -1,214 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f); - const float32x4_t vc5 = vmovq_n_f32(0x1.0F9F9Cp-7f); - const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f); - const float32x4_t vc3 = vmovq_n_f32(0x1.555A80p-3f); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f); - const float32x4_t vc1 = vmovq_n_f32(0x1.FFFFF6p-1f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - 
XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2_hi = vmovq_n_f32(-0x1.62E400p-1f); - const float32x4_t vminus_ln2_lo = vmovq_n_f32(-0x1.7F7D1Cp-20f); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - float32x4_t vacc1 = vmovq_n_f32(0.0f); - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - - float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vx89AB, vlog2e); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - - float32x4_t vt0123 = vmlaq_f32(vx0123, vn0123, vminus_ln2_hi); - float32x4_t vt4567 = vmlaq_f32(vx4567, vn4567, vminus_ln2_hi); - float32x4_t vt89AB = vmlaq_f32(vx89AB, vn89AB, vminus_ln2_hi); - - vt0123 = vmlaq_f32(vt0123, vn0123, vminus_ln2_lo); - vt4567 = vmlaq_f32(vt4567, vn4567, vminus_ln2_lo); - vt89AB = vmlaq_f32(vt89AB, vn89AB, vminus_ln2_lo); - - float32x4_t vp0123 = vmlaq_f32(vc4, vc5, vt0123); - float32x4_t vp4567 = vmlaq_f32(vc4, vc5, vt4567); - float32x4_t vp89AB = vmlaq_f32(vc4, vc5, 
vt89AB); - - vp0123 = vmlaq_f32(vc3, vp0123, vt0123); - vp4567 = vmlaq_f32(vc3, vp4567, vt4567); - vp89AB = vmlaq_f32(vc3, vp89AB, vt89AB); - - vp0123 = vmlaq_f32(vc2, vp0123, vt0123); - vp4567 = vmlaq_f32(vc2, vp4567, vt4567); - vp89AB = vmlaq_f32(vc2, vp89AB, vt89AB); - - vp0123 = vmlaq_f32(vc1, vp0123, vt0123); - vp4567 = vmlaq_f32(vc1, vp4567, vt4567); - vp89AB = vmlaq_f32(vc1, vp89AB, vt89AB); - - vt0123 = vmulq_f32(vt0123, vs0123); - vt4567 = vmulq_f32(vt4567, vs4567); - vt89AB = vmulq_f32(vt89AB, vs89AB); - - float32x4_t vf0123 = vmlaq_f32(vs0123, vp0123, vt0123); - float32x4_t vf4567 = vmlaq_f32(vs4567, vp4567, vt4567); - float32x4_t vf89AB = vmlaq_f32(vs89AB, vp89AB, vt89AB); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - vacc0 = vaddq_f32(vacc0, vf89AB); - } - vacc0 = vaddq_f32(vacc0, vacc1); - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmlaq_f32(vc4, vc5, vt); - vp = vmlaq_f32(vc3, vp, vt); - vp = vmlaq_f32(vc2, vp, vt); - vp = vmlaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vmlaq_f32(vs, vp, 
vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmlaq_f32(vc4, vc5, vt); - vp = vmlaq_f32(vc3, vp, vt); - vp = vmlaq_f32(vc2, vp, vt); - vp = vmlaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vmlaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12-acc3.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12-acc3.c deleted file mode 100644 index 9182da327c9..00000000000 --- 
a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12-acc3.c +++ /dev/null @@ -1,216 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12_acc3( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f); - const float32x4_t vc5 = vmovq_n_f32(0x1.0F9F9Cp-7f); - const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f); - const float32x4_t vc3 = vmovq_n_f32(0x1.555A80p-3f); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f); - const float32x4_t vc1 = vmovq_n_f32(0x1.FFFFF6p-1f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2_hi = vmovq_n_f32(-0x1.62E400p-1f); - const float32x4_t vminus_ln2_lo = vmovq_n_f32(-0x1.7F7D1Cp-20f); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - float32x4_t vacc1 = vmovq_n_f32(0.0f); - 
float32x4_t vacc2 = vmovq_n_f32(0.0f); - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - - float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vx89AB, vlog2e); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - - float32x4_t vt0123 = vmlaq_f32(vx0123, vn0123, vminus_ln2_hi); - float32x4_t vt4567 = vmlaq_f32(vx4567, vn4567, vminus_ln2_hi); - float32x4_t vt89AB = vmlaq_f32(vx89AB, vn89AB, vminus_ln2_hi); - - vt0123 = vmlaq_f32(vt0123, vn0123, vminus_ln2_lo); - vt4567 = vmlaq_f32(vt4567, vn4567, vminus_ln2_lo); - vt89AB = vmlaq_f32(vt89AB, vn89AB, vminus_ln2_lo); - - float32x4_t vp0123 = vmlaq_f32(vc4, vc5, vt0123); - float32x4_t vp4567 = vmlaq_f32(vc4, vc5, vt4567); - float32x4_t vp89AB = vmlaq_f32(vc4, vc5, vt89AB); - - vp0123 = vmlaq_f32(vc3, vp0123, vt0123); - vp4567 = vmlaq_f32(vc3, vp4567, vt4567); - vp89AB = vmlaq_f32(vc3, vp89AB, vt89AB); - - vp0123 = vmlaq_f32(vc2, vp0123, vt0123); - vp4567 = vmlaq_f32(vc2, vp4567, vt4567); - vp89AB = vmlaq_f32(vc2, vp89AB, vt89AB); - - vp0123 = vmlaq_f32(vc1, vp0123, vt0123); - vp4567 = vmlaq_f32(vc1, vp4567, vt4567); - vp89AB = vmlaq_f32(vc1, vp89AB, vt89AB); - - vt0123 
= vmulq_f32(vt0123, vs0123); - vt4567 = vmulq_f32(vt4567, vs4567); - vt89AB = vmulq_f32(vt89AB, vs89AB); - - float32x4_t vf0123 = vmlaq_f32(vs0123, vp0123, vt0123); - float32x4_t vf4567 = vmlaq_f32(vs4567, vp4567, vt4567); - float32x4_t vf89AB = vmlaq_f32(vs89AB, vp89AB, vt89AB); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc1 = vaddq_f32(vacc1, vf4567); - vacc2 = vaddq_f32(vacc2, vf89AB); - } - vacc0 = vaddq_f32(vacc0, vacc1); - vacc0 = vaddq_f32(vacc0, vacc2); - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmlaq_f32(vc4, vc5, vt); - vp = vmlaq_f32(vc3, vp, vt); - vp = vmlaq_f32(vc2, vp, vt); - vp = vmlaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vmlaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * 
sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmlaq_f32(vc4, vc5, vt); - vp = vmlaq_f32(vc3, vp, vt); - vp = vmlaq_f32(vc2, vp, vt); - vp = vmlaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vmlaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12.c deleted file mode 100644 index 9391b701a31..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u12.c +++ /dev/null @@ -1,212 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-raddstoreexpminusmax/neon-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f); - const float32x4_t vc5 = vmovq_n_f32(0x1.0F9F9Cp-7f); - const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f); - const float32x4_t vc3 = vmovq_n_f32(0x1.555A80p-3f); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f); - const float32x4_t vc1 = vmovq_n_f32(0x1.FFFFF6p-1f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2_hi = vmovq_n_f32(-0x1.62E400p-1f); - const float32x4_t vminus_ln2_lo = vmovq_n_f32(-0x1.7F7D1Cp-20f); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const 
float32x4_t vi89AB = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - - float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vx89AB, vlog2e); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - - float32x4_t vt0123 = vmlaq_f32(vx0123, vn0123, vminus_ln2_hi); - float32x4_t vt4567 = vmlaq_f32(vx4567, vn4567, vminus_ln2_hi); - float32x4_t vt89AB = vmlaq_f32(vx89AB, vn89AB, vminus_ln2_hi); - - vt0123 = vmlaq_f32(vt0123, vn0123, vminus_ln2_lo); - vt4567 = vmlaq_f32(vt4567, vn4567, vminus_ln2_lo); - vt89AB = vmlaq_f32(vt89AB, vn89AB, vminus_ln2_lo); - - float32x4_t vp0123 = vmlaq_f32(vc4, vc5, vt0123); - float32x4_t vp4567 = vmlaq_f32(vc4, vc5, vt4567); - float32x4_t vp89AB = vmlaq_f32(vc4, vc5, vt89AB); - - vp0123 = vmlaq_f32(vc3, vp0123, vt0123); - vp4567 = vmlaq_f32(vc3, vp4567, vt4567); - vp89AB = vmlaq_f32(vc3, vp89AB, vt89AB); - - vp0123 = vmlaq_f32(vc2, vp0123, vt0123); - vp4567 = vmlaq_f32(vc2, vp4567, vt4567); - vp89AB = vmlaq_f32(vc2, vp89AB, vt89AB); - - vp0123 = vmlaq_f32(vc1, vp0123, vt0123); - vp4567 = vmlaq_f32(vc1, vp4567, vt4567); - vp89AB = vmlaq_f32(vc1, vp89AB, vt89AB); - - vt0123 = vmulq_f32(vt0123, vs0123); - vt4567 = vmulq_f32(vt4567, vs4567); - vt89AB = vmulq_f32(vt89AB, vs89AB); - - float32x4_t vf0123 = vmlaq_f32(vs0123, vp0123, vt0123); - float32x4_t vf4567 = vmlaq_f32(vs4567, vp4567, vt4567); - 
float32x4_t vf89AB = vmlaq_f32(vs89AB, vp89AB, vt89AB); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - vacc0 = vaddq_f32(vacc0, vf89AB); - } - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmlaq_f32(vc4, vc5, vt); - vp = vmlaq_f32(vc3, vp, vt); - vp = vmlaq_f32(vc2, vp, vt); - vp = vmlaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vmlaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 
23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmlaq_f32(vc4, vc5, vt); - vp = vmlaq_f32(vc3, vp, vt); - vp = vmlaq_f32(vc2, vp, vt); - vp = vmlaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vmlaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u16.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u16.c deleted file mode 100644 index 4eecea937b1..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u16.c +++ /dev/null @@ -1,228 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f); - const float32x4_t vc5 = vmovq_n_f32(0x1.0F9F9Cp-7f); - const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f); - const float32x4_t vc3 = vmovq_n_f32(0x1.555A80p-3f); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f); - const float32x4_t vc1 = vmovq_n_f32(0x1.FFFFF6p-1f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2_hi = vmovq_n_f32(-0x1.62E400p-1f); - const float32x4_t vminus_ln2_lo = vmovq_n_f32(-0x1.7F7D1Cp-20f); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - const float32x4_t viCDEF = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = 
vsubq_f32(vi89AB, vi_max); - const float32x4_t vxCDEF = vsubq_f32(viCDEF, vi_max); - - float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vx89AB, vlog2e); - float32x4_t vnCDEF = vmlaq_f32(vmagic_bias, vxCDEF, vlog2e); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23)); - const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); - - float32x4_t vt0123 = vmlaq_f32(vx0123, vn0123, vminus_ln2_hi); - float32x4_t vt4567 = vmlaq_f32(vx4567, vn4567, vminus_ln2_hi); - float32x4_t vt89AB = vmlaq_f32(vx89AB, vn89AB, vminus_ln2_hi); - float32x4_t vtCDEF = vmlaq_f32(vxCDEF, vnCDEF, vminus_ln2_hi); - - vt0123 = vmlaq_f32(vt0123, vn0123, vminus_ln2_lo); - vt4567 = vmlaq_f32(vt4567, vn4567, vminus_ln2_lo); - vt89AB = vmlaq_f32(vt89AB, vn89AB, vminus_ln2_lo); - vtCDEF = vmlaq_f32(vtCDEF, vnCDEF, vminus_ln2_lo); - - float32x4_t vp0123 = vmlaq_f32(vc4, vc5, vt0123); - float32x4_t vp4567 = vmlaq_f32(vc4, vc5, vt4567); - float32x4_t vp89AB = vmlaq_f32(vc4, vc5, vt89AB); - float32x4_t vpCDEF = vmlaq_f32(vc4, vc5, vtCDEF); - - vp0123 = vmlaq_f32(vc3, vp0123, vt0123); - vp4567 = vmlaq_f32(vc3, vp4567, vt4567); - vp89AB = vmlaq_f32(vc3, vp89AB, vt89AB); - vpCDEF = vmlaq_f32(vc3, vpCDEF, vtCDEF); - - vp0123 = vmlaq_f32(vc2, vp0123, vt0123); - vp4567 = vmlaq_f32(vc2, vp4567, vt4567); - vp89AB = vmlaq_f32(vc2, vp89AB, vt89AB); - vpCDEF = vmlaq_f32(vc2, vpCDEF, vtCDEF); - - vp0123 = vmlaq_f32(vc1, vp0123, vt0123); - 
vp4567 = vmlaq_f32(vc1, vp4567, vt4567); - vp89AB = vmlaq_f32(vc1, vp89AB, vt89AB); - vpCDEF = vmlaq_f32(vc1, vpCDEF, vtCDEF); - - vt0123 = vmulq_f32(vt0123, vs0123); - vt4567 = vmulq_f32(vt4567, vs4567); - vt89AB = vmulq_f32(vt89AB, vs89AB); - vtCDEF = vmulq_f32(vtCDEF, vsCDEF); - - float32x4_t vf0123 = vmlaq_f32(vs0123, vp0123, vt0123); - float32x4_t vf4567 = vmlaq_f32(vs4567, vp4567, vt4567); - float32x4_t vf89AB = vmlaq_f32(vs89AB, vp89AB, vt89AB); - float32x4_t vfCDEF = vmlaq_f32(vsCDEF, vpCDEF, vtCDEF); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcltq_f32(vxCDEF, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - vst1q_f32(output, vfCDEF); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - vacc0 = vaddq_f32(vacc0, vf89AB); - vacc0 = vaddq_f32(vacc0, vfCDEF); - } - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmlaq_f32(vc4, vc5, vt); - vp = vmlaq_f32(vc3, vp, vt); - vp = vmlaq_f32(vc2, vp, vt); - vp = vmlaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vmlaq_f32(vs, vp, vt); - - vf = 
vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmlaq_f32(vc4, vc5, vt); - vp = vmlaq_f32(vc3, vp, vt); - vp = vmlaq_f32(vc2, vp, vt); - vp = vmlaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vmlaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20-acc2.c deleted file mode 100644 index 3f338928983..00000000000 --- 
a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20-acc2.c +++ /dev/null @@ -1,246 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f); - const float32x4_t vc5 = vmovq_n_f32(0x1.0F9F9Cp-7f); - const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f); - const float32x4_t vc3 = vmovq_n_f32(0x1.555A80p-3f); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f); - const float32x4_t vc1 = vmovq_n_f32(0x1.FFFFF6p-1f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2_hi = vmovq_n_f32(-0x1.62E400p-1f); - const float32x4_t vminus_ln2_lo = vmovq_n_f32(-0x1.7F7D1Cp-20f); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - float32x4_t vacc1 = vmovq_n_f32(0.0f); - for 
(; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - const float32x4_t viCDEF = vld1q_f32(input); input += 4; - const float32x4_t viGHIJ = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - const float32x4_t vxCDEF = vsubq_f32(viCDEF, vi_max); - const float32x4_t vxGHIJ = vsubq_f32(viGHIJ, vi_max); - - float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vx89AB, vlog2e); - float32x4_t vnCDEF = vmlaq_f32(vmagic_bias, vxCDEF, vlog2e); - float32x4_t vnGHIJ = vmlaq_f32(vmagic_bias, vxGHIJ, vlog2e); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23)); - const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23)); - const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnGHIJ), 23)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); - vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias); - - float32x4_t vt0123 = vmlaq_f32(vx0123, vn0123, vminus_ln2_hi); - float32x4_t vt4567 = vmlaq_f32(vx4567, vn4567, vminus_ln2_hi); - float32x4_t vt89AB = vmlaq_f32(vx89AB, vn89AB, vminus_ln2_hi); - float32x4_t vtCDEF = vmlaq_f32(vxCDEF, vnCDEF, vminus_ln2_hi); - float32x4_t vtGHIJ = vmlaq_f32(vxGHIJ, vnGHIJ, 
vminus_ln2_hi); - - vt0123 = vmlaq_f32(vt0123, vn0123, vminus_ln2_lo); - vt4567 = vmlaq_f32(vt4567, vn4567, vminus_ln2_lo); - vt89AB = vmlaq_f32(vt89AB, vn89AB, vminus_ln2_lo); - vtCDEF = vmlaq_f32(vtCDEF, vnCDEF, vminus_ln2_lo); - vtGHIJ = vmlaq_f32(vtGHIJ, vnGHIJ, vminus_ln2_lo); - - float32x4_t vp0123 = vmlaq_f32(vc4, vc5, vt0123); - float32x4_t vp4567 = vmlaq_f32(vc4, vc5, vt4567); - float32x4_t vp89AB = vmlaq_f32(vc4, vc5, vt89AB); - float32x4_t vpCDEF = vmlaq_f32(vc4, vc5, vtCDEF); - float32x4_t vpGHIJ = vmlaq_f32(vc4, vc5, vtGHIJ); - - vp0123 = vmlaq_f32(vc3, vp0123, vt0123); - vp4567 = vmlaq_f32(vc3, vp4567, vt4567); - vp89AB = vmlaq_f32(vc3, vp89AB, vt89AB); - vpCDEF = vmlaq_f32(vc3, vpCDEF, vtCDEF); - vpGHIJ = vmlaq_f32(vc3, vpGHIJ, vtGHIJ); - - vp0123 = vmlaq_f32(vc2, vp0123, vt0123); - vp4567 = vmlaq_f32(vc2, vp4567, vt4567); - vp89AB = vmlaq_f32(vc2, vp89AB, vt89AB); - vpCDEF = vmlaq_f32(vc2, vpCDEF, vtCDEF); - vpGHIJ = vmlaq_f32(vc2, vpGHIJ, vtGHIJ); - - vp0123 = vmlaq_f32(vc1, vp0123, vt0123); - vp4567 = vmlaq_f32(vc1, vp4567, vt4567); - vp89AB = vmlaq_f32(vc1, vp89AB, vt89AB); - vpCDEF = vmlaq_f32(vc1, vpCDEF, vtCDEF); - vpGHIJ = vmlaq_f32(vc1, vpGHIJ, vtGHIJ); - - vt0123 = vmulq_f32(vt0123, vs0123); - vt4567 = vmulq_f32(vt4567, vs4567); - vt89AB = vmulq_f32(vt89AB, vs89AB); - vtCDEF = vmulq_f32(vtCDEF, vsCDEF); - vtGHIJ = vmulq_f32(vtGHIJ, vsGHIJ); - - float32x4_t vf0123 = vmlaq_f32(vs0123, vp0123, vt0123); - float32x4_t vf4567 = vmlaq_f32(vs4567, vp4567, vt4567); - float32x4_t vf89AB = vmlaq_f32(vs89AB, vp89AB, vt89AB); - float32x4_t vfCDEF = vmlaq_f32(vsCDEF, vpCDEF, vtCDEF); - float32x4_t vfGHIJ = vmlaq_f32(vsGHIJ, vpGHIJ, vtGHIJ); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, 
vdenorm_cutoff))); - vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcltq_f32(vxCDEF, vdenorm_cutoff))); - vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcltq_f32(vxGHIJ, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - vst1q_f32(output, vfCDEF); output += 4; - vst1q_f32(output, vfGHIJ); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - vacc0 = vaddq_f32(vacc0, vf89AB); - vacc0 = vaddq_f32(vacc0, vfCDEF); - vacc0 = vaddq_f32(vacc0, vfGHIJ); - } - vacc0 = vaddq_f32(vacc0, vacc1); - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmlaq_f32(vc4, vc5, vt); - vp = vmlaq_f32(vc3, vp, vt); - vp = vmlaq_f32(vc2, vp, vt); - vp = vmlaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vmlaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = 
vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmlaq_f32(vc4, vc5, vt); - vp = vmlaq_f32(vc3, vp, vt); - vp = vmlaq_f32(vc2, vp, vt); - vp = vmlaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vmlaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20-acc5.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20-acc5.c deleted file mode 100644 index 4332a8f2bc3..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20-acc5.c +++ /dev/null @@ -1,252 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20_acc5( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f); - const float32x4_t vc5 = vmovq_n_f32(0x1.0F9F9Cp-7f); - const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f); - const float32x4_t vc3 = vmovq_n_f32(0x1.555A80p-3f); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f); - const float32x4_t vc1 = vmovq_n_f32(0x1.FFFFF6p-1f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2_hi = vmovq_n_f32(-0x1.62E400p-1f); - const float32x4_t vminus_ln2_lo = vmovq_n_f32(-0x1.7F7D1Cp-20f); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - float32x4_t vacc1 = vmovq_n_f32(0.0f); - float32x4_t vacc2 = vmovq_n_f32(0.0f); - float32x4_t vacc3 = vmovq_n_f32(0.0f); - float32x4_t vacc4 = vmovq_n_f32(0.0f); - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - const float32x4_t viCDEF = 
vld1q_f32(input); input += 4; - const float32x4_t viGHIJ = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - const float32x4_t vxCDEF = vsubq_f32(viCDEF, vi_max); - const float32x4_t vxGHIJ = vsubq_f32(viGHIJ, vi_max); - - float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vx89AB, vlog2e); - float32x4_t vnCDEF = vmlaq_f32(vmagic_bias, vxCDEF, vlog2e); - float32x4_t vnGHIJ = vmlaq_f32(vmagic_bias, vxGHIJ, vlog2e); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23)); - const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23)); - const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnGHIJ), 23)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); - vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias); - - float32x4_t vt0123 = vmlaq_f32(vx0123, vn0123, vminus_ln2_hi); - float32x4_t vt4567 = vmlaq_f32(vx4567, vn4567, vminus_ln2_hi); - float32x4_t vt89AB = vmlaq_f32(vx89AB, vn89AB, vminus_ln2_hi); - float32x4_t vtCDEF = vmlaq_f32(vxCDEF, vnCDEF, vminus_ln2_hi); - float32x4_t vtGHIJ = vmlaq_f32(vxGHIJ, vnGHIJ, vminus_ln2_hi); - - vt0123 = vmlaq_f32(vt0123, vn0123, vminus_ln2_lo); - vt4567 = vmlaq_f32(vt4567, vn4567, vminus_ln2_lo); - vt89AB = vmlaq_f32(vt89AB, vn89AB, vminus_ln2_lo); - vtCDEF = vmlaq_f32(vtCDEF, vnCDEF, vminus_ln2_lo); - vtGHIJ = vmlaq_f32(vtGHIJ, vnGHIJ, 
vminus_ln2_lo); - - float32x4_t vp0123 = vmlaq_f32(vc4, vc5, vt0123); - float32x4_t vp4567 = vmlaq_f32(vc4, vc5, vt4567); - float32x4_t vp89AB = vmlaq_f32(vc4, vc5, vt89AB); - float32x4_t vpCDEF = vmlaq_f32(vc4, vc5, vtCDEF); - float32x4_t vpGHIJ = vmlaq_f32(vc4, vc5, vtGHIJ); - - vp0123 = vmlaq_f32(vc3, vp0123, vt0123); - vp4567 = vmlaq_f32(vc3, vp4567, vt4567); - vp89AB = vmlaq_f32(vc3, vp89AB, vt89AB); - vpCDEF = vmlaq_f32(vc3, vpCDEF, vtCDEF); - vpGHIJ = vmlaq_f32(vc3, vpGHIJ, vtGHIJ); - - vp0123 = vmlaq_f32(vc2, vp0123, vt0123); - vp4567 = vmlaq_f32(vc2, vp4567, vt4567); - vp89AB = vmlaq_f32(vc2, vp89AB, vt89AB); - vpCDEF = vmlaq_f32(vc2, vpCDEF, vtCDEF); - vpGHIJ = vmlaq_f32(vc2, vpGHIJ, vtGHIJ); - - vp0123 = vmlaq_f32(vc1, vp0123, vt0123); - vp4567 = vmlaq_f32(vc1, vp4567, vt4567); - vp89AB = vmlaq_f32(vc1, vp89AB, vt89AB); - vpCDEF = vmlaq_f32(vc1, vpCDEF, vtCDEF); - vpGHIJ = vmlaq_f32(vc1, vpGHIJ, vtGHIJ); - - vt0123 = vmulq_f32(vt0123, vs0123); - vt4567 = vmulq_f32(vt4567, vs4567); - vt89AB = vmulq_f32(vt89AB, vs89AB); - vtCDEF = vmulq_f32(vtCDEF, vsCDEF); - vtGHIJ = vmulq_f32(vtGHIJ, vsGHIJ); - - float32x4_t vf0123 = vmlaq_f32(vs0123, vp0123, vt0123); - float32x4_t vf4567 = vmlaq_f32(vs4567, vp4567, vt4567); - float32x4_t vf89AB = vmlaq_f32(vs89AB, vp89AB, vt89AB); - float32x4_t vfCDEF = vmlaq_f32(vsCDEF, vpCDEF, vtCDEF); - float32x4_t vfGHIJ = vmlaq_f32(vsGHIJ, vpGHIJ, vtGHIJ); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcltq_f32(vxCDEF, vdenorm_cutoff))); - vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcltq_f32(vxGHIJ, vdenorm_cutoff))); - - vst1q_f32(output, 
vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - vst1q_f32(output, vfCDEF); output += 4; - vst1q_f32(output, vfGHIJ); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc4 = vaddq_f32(vacc4, vf4567); - vacc3 = vaddq_f32(vacc3, vf89AB); - vacc2 = vaddq_f32(vacc2, vfCDEF); - vacc1 = vaddq_f32(vacc1, vfGHIJ); - } - vacc0 = vaddq_f32(vacc0, vacc1); - vacc2 = vaddq_f32(vacc2, vacc3); - vacc0 = vaddq_f32(vacc0, vacc2); - vacc0 = vaddq_f32(vacc0, vacc4); - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmlaq_f32(vc4, vc5, vt); - vp = vmlaq_f32(vc3, vp, vt); - vp = vmlaq_f32(vc2, vp, vt); - vp = vmlaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vmlaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, 
vn, vminus_ln2_lo); - - float32x4_t vp = vmlaq_f32(vc4, vc5, vt); - vp = vmlaq_f32(vc3, vp, vt); - vp = vmlaq_f32(vc2, vp, vt); - vp = vmlaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vmlaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20.c deleted file mode 100644 index 7e35d919e1c..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u20.c +++ /dev/null @@ -1,244 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f); - const float32x4_t vc5 = vmovq_n_f32(0x1.0F9F9Cp-7f); - const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f); - const float32x4_t vc3 = vmovq_n_f32(0x1.555A80p-3f); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f); - const float32x4_t vc1 = vmovq_n_f32(0x1.FFFFF6p-1f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2_hi = vmovq_n_f32(-0x1.62E400p-1f); - const float32x4_t vminus_ln2_lo = vmovq_n_f32(-0x1.7F7D1Cp-20f); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - const float32x4_t viCDEF = vld1q_f32(input); input += 4; - const float32x4_t viGHIJ = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = 
vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - const float32x4_t vxCDEF = vsubq_f32(viCDEF, vi_max); - const float32x4_t vxGHIJ = vsubq_f32(viGHIJ, vi_max); - - float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vx89AB, vlog2e); - float32x4_t vnCDEF = vmlaq_f32(vmagic_bias, vxCDEF, vlog2e); - float32x4_t vnGHIJ = vmlaq_f32(vmagic_bias, vxGHIJ, vlog2e); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23)); - const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23)); - const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnGHIJ), 23)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); - vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias); - - float32x4_t vt0123 = vmlaq_f32(vx0123, vn0123, vminus_ln2_hi); - float32x4_t vt4567 = vmlaq_f32(vx4567, vn4567, vminus_ln2_hi); - float32x4_t vt89AB = vmlaq_f32(vx89AB, vn89AB, vminus_ln2_hi); - float32x4_t vtCDEF = vmlaq_f32(vxCDEF, vnCDEF, vminus_ln2_hi); - float32x4_t vtGHIJ = vmlaq_f32(vxGHIJ, vnGHIJ, vminus_ln2_hi); - - vt0123 = vmlaq_f32(vt0123, vn0123, vminus_ln2_lo); - vt4567 = vmlaq_f32(vt4567, vn4567, vminus_ln2_lo); - vt89AB = vmlaq_f32(vt89AB, vn89AB, vminus_ln2_lo); - vtCDEF = vmlaq_f32(vtCDEF, vnCDEF, vminus_ln2_lo); - vtGHIJ = vmlaq_f32(vtGHIJ, vnGHIJ, vminus_ln2_lo); - - float32x4_t vp0123 = vmlaq_f32(vc4, vc5, vt0123); - float32x4_t vp4567 = vmlaq_f32(vc4, vc5, vt4567); - float32x4_t vp89AB = vmlaq_f32(vc4, vc5, vt89AB); - float32x4_t 
vpCDEF = vmlaq_f32(vc4, vc5, vtCDEF); - float32x4_t vpGHIJ = vmlaq_f32(vc4, vc5, vtGHIJ); - - vp0123 = vmlaq_f32(vc3, vp0123, vt0123); - vp4567 = vmlaq_f32(vc3, vp4567, vt4567); - vp89AB = vmlaq_f32(vc3, vp89AB, vt89AB); - vpCDEF = vmlaq_f32(vc3, vpCDEF, vtCDEF); - vpGHIJ = vmlaq_f32(vc3, vpGHIJ, vtGHIJ); - - vp0123 = vmlaq_f32(vc2, vp0123, vt0123); - vp4567 = vmlaq_f32(vc2, vp4567, vt4567); - vp89AB = vmlaq_f32(vc2, vp89AB, vt89AB); - vpCDEF = vmlaq_f32(vc2, vpCDEF, vtCDEF); - vpGHIJ = vmlaq_f32(vc2, vpGHIJ, vtGHIJ); - - vp0123 = vmlaq_f32(vc1, vp0123, vt0123); - vp4567 = vmlaq_f32(vc1, vp4567, vt4567); - vp89AB = vmlaq_f32(vc1, vp89AB, vt89AB); - vpCDEF = vmlaq_f32(vc1, vpCDEF, vtCDEF); - vpGHIJ = vmlaq_f32(vc1, vpGHIJ, vtGHIJ); - - vt0123 = vmulq_f32(vt0123, vs0123); - vt4567 = vmulq_f32(vt4567, vs4567); - vt89AB = vmulq_f32(vt89AB, vs89AB); - vtCDEF = vmulq_f32(vtCDEF, vsCDEF); - vtGHIJ = vmulq_f32(vtGHIJ, vsGHIJ); - - float32x4_t vf0123 = vmlaq_f32(vs0123, vp0123, vt0123); - float32x4_t vf4567 = vmlaq_f32(vs4567, vp4567, vt4567); - float32x4_t vf89AB = vmlaq_f32(vs89AB, vp89AB, vt89AB); - float32x4_t vfCDEF = vmlaq_f32(vsCDEF, vpCDEF, vtCDEF); - float32x4_t vfGHIJ = vmlaq_f32(vsGHIJ, vpGHIJ, vtGHIJ); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcltq_f32(vxCDEF, vdenorm_cutoff))); - vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcltq_f32(vxGHIJ, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - vst1q_f32(output, vfCDEF); output += 4; - vst1q_f32(output, vfGHIJ); output += 4; - - 
vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - vacc0 = vaddq_f32(vacc0, vf89AB); - vacc0 = vaddq_f32(vacc0, vfCDEF); - vacc0 = vaddq_f32(vacc0, vfGHIJ); - } - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmlaq_f32(vc4, vc5, vt); - vp = vmlaq_f32(vc3, vp, vt); - vp = vmlaq_f32(vc2, vp, vt); - vp = vmlaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vmlaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmlaq_f32(vc4, vc5, vt); - vp = vmlaq_f32(vc3, vp, vt); - vp = vmlaq_f32(vc2, vp, vt); - vp = vmlaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vmlaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - 
float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u8.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u8.c deleted file mode 100644 index cf3499a24eb..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-u8.c +++ /dev/null @@ -1,196 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u8( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f); - const float32x4_t vc5 = vmovq_n_f32(0x1.0F9F9Cp-7f); - const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f); - const float32x4_t vc3 = vmovq_n_f32(0x1.555A80p-3f); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f); - const float32x4_t vc1 = vmovq_n_f32(0x1.FFFFF6p-1f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2_hi = vmovq_n_f32(-0x1.62E400p-1f); - const float32x4_t vminus_ln2_lo = vmovq_n_f32(-0x1.7F7D1Cp-20f); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - - float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vx4567, vlog2e); - - const float32x4_t vs0123 = 
vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - - float32x4_t vt0123 = vmlaq_f32(vx0123, vn0123, vminus_ln2_hi); - float32x4_t vt4567 = vmlaq_f32(vx4567, vn4567, vminus_ln2_hi); - - vt0123 = vmlaq_f32(vt0123, vn0123, vminus_ln2_lo); - vt4567 = vmlaq_f32(vt4567, vn4567, vminus_ln2_lo); - - float32x4_t vp0123 = vmlaq_f32(vc4, vc5, vt0123); - float32x4_t vp4567 = vmlaq_f32(vc4, vc5, vt4567); - - vp0123 = vmlaq_f32(vc3, vp0123, vt0123); - vp4567 = vmlaq_f32(vc3, vp4567, vt4567); - - vp0123 = vmlaq_f32(vc2, vp0123, vt0123); - vp4567 = vmlaq_f32(vc2, vp4567, vt4567); - - vp0123 = vmlaq_f32(vc1, vp0123, vt0123); - vp4567 = vmlaq_f32(vc1, vp4567, vt4567); - - vt0123 = vmulq_f32(vt0123, vs0123); - vt4567 = vmulq_f32(vt4567, vs4567); - - float32x4_t vf0123 = vmlaq_f32(vs0123, vp0123, vt0123); - float32x4_t vf4567 = vmlaq_f32(vs4567, vp4567, vt4567); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - } - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmlaq_f32(vc4, vc5, vt); - vp = 
vmlaq_f32(vc3, vp, vt); - vp = vmlaq_f32(vc2, vp, vt); - vp = vmlaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vmlaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); - vt = vmlaq_f32(vt, vn, vminus_ln2_lo); - - float32x4_t vp = vmlaq_f32(vc4, vc5, vt); - vp = vmlaq_f32(vc3, vp, vt); - vp = vmlaq_f32(vc2, vp, vt); - vp = vmlaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vmlaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git 
a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12-acc2.c deleted file mode 100644 index 0a13741d009..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12-acc2.c +++ /dev/null @@ -1,235 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64]; - -void xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p17f); - const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFF0Ap-2f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vindex_mask); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2 = vmovq_n_f32(-0x1.62E430p-1f); - XNN_FORCE_REALIZATION(vminus_ln2); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - float32x4_t vacc1 = vmovq_n_f32(0.0f); - for (; batch >= 12 * 
sizeof(float); batch -= 12 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - - float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vx89AB, vlog2e); - - const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); - const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); - const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); - const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); - const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); - const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); - const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask)); - const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0); - const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1); - - float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]); - float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]); - float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx45]); - float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx67]); - float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx89]); - 
float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxAB]); - - vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); - vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); - const float32x4_t vl0123 = vcombine_f32(vl01, vl23); - vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); - vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); - const float32x4_t vl4567 = vcombine_f32(vl45, vl67); - vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx89 >> 32)], vl89, 1); - vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxAB >> 32)], vlAB, 1); - const float32x4_t vl89AB = vcombine_f32(vl89, vlAB); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - - float32x4_t vt0123 = vfmaq_f32(vx0123, vn0123, vminus_ln2); - float32x4_t vt4567 = vfmaq_f32(vx4567, vn4567, vminus_ln2); - float32x4_t vt89AB = vfmaq_f32(vx89AB, vn89AB, vminus_ln2); - - float32x4_t vp0123 = vmulq_f32(vt0123, vc2); - float32x4_t vp4567 = vmulq_f32(vt4567, vc2); - float32x4_t vp89AB = vmulq_f32(vt89AB, vc2); - - vp0123 = vfmaq_f32(vt0123, vt0123, vp0123); - vp4567 = vfmaq_f32(vt4567, vt4567, vp4567); - vp89AB = vfmaq_f32(vt89AB, vt89AB, vp89AB); - - float32x4_t vf0123 = vfmaq_f32(vs0123, vs0123, vp0123); - float32x4_t vf4567 = vfmaq_f32(vs4567, vs4567, vp4567); - float32x4_t vf89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = 
vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - vacc0 = vaddq_f32(vacc0, vf89AB); - } - vacc0 = vaddq_f32(vacc0, vacc1); - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vfmaq_f32(vt, vt, vp); - - float32x4_t vf = vfmaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - 
float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vfmaq_f32(vt, vt, vp); - - float32x4_t vf = vfmaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = 
vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12-acc3.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12-acc3.c deleted file mode 100644 index c87bddeebb4..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12-acc3.c +++ /dev/null @@ -1,237 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64]; - -void xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12_acc3( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p17f); - const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFF0Ap-2f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vindex_mask); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2 = vmovq_n_f32(-0x1.62E430p-1f); - XNN_FORCE_REALIZATION(vminus_ln2); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t 
vacc0 = vmovq_n_f32(0.0f); - float32x4_t vacc1 = vmovq_n_f32(0.0f); - float32x4_t vacc2 = vmovq_n_f32(0.0f); - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - - float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vx89AB, vlog2e); - - const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); - const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); - const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); - const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); - const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); - const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); - const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask)); - const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0); - const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1); - - float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]); - float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]); - float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx45]); - float32x2_t vl67 = 
vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx67]); - float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx89]); - float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxAB]); - - vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); - vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); - const float32x4_t vl0123 = vcombine_f32(vl01, vl23); - vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); - vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); - const float32x4_t vl4567 = vcombine_f32(vl45, vl67); - vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx89 >> 32)], vl89, 1); - vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxAB >> 32)], vlAB, 1); - const float32x4_t vl89AB = vcombine_f32(vl89, vlAB); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - - float32x4_t vt0123 = vfmaq_f32(vx0123, vn0123, vminus_ln2); - float32x4_t vt4567 = vfmaq_f32(vx4567, vn4567, vminus_ln2); - float32x4_t vt89AB = vfmaq_f32(vx89AB, vn89AB, vminus_ln2); - - float32x4_t vp0123 = vmulq_f32(vt0123, vc2); - float32x4_t vp4567 = vmulq_f32(vt4567, vc2); - float32x4_t vp89AB = vmulq_f32(vt89AB, vc2); - - vp0123 = vfmaq_f32(vt0123, vt0123, vp0123); - vp4567 = vfmaq_f32(vt4567, vt4567, vp4567); - vp89AB = vfmaq_f32(vt89AB, vt89AB, vp89AB); - - float32x4_t vf0123 = vfmaq_f32(vs0123, vs0123, vp0123); - float32x4_t vf4567 = vfmaq_f32(vs4567, vs4567, vp4567); - float32x4_t vf89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB); - 
- vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc1 = vaddq_f32(vacc1, vf4567); - vacc2 = vaddq_f32(vacc2, vf89AB); - } - vacc0 = vaddq_f32(vacc0, vacc1); - vacc0 = vaddq_f32(vacc0, vacc2); - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vfmaq_f32(vt, vt, vp); - - float32x4_t vf = vfmaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, 
vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vfmaq_f32(vt, vt, vp); - - float32x4_t vf = vfmaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 
0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12.c deleted file mode 100644 index f52bbf4859a..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u12.c +++ /dev/null @@ -1,233 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64]; - -void xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p17f); - const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFF0Ap-2f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vindex_mask); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2 = 
vmovq_n_f32(-0x1.62E430p-1f); - XNN_FORCE_REALIZATION(vminus_ln2); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - - float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vx89AB, vlog2e); - - const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); - const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); - const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); - const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); - const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); - const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); - const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask)); - const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0); - const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1); - - float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]); - float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]); - float32x2_t vl45 = 
vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx45]); - float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx67]); - float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx89]); - float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxAB]); - - vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); - vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); - const float32x4_t vl0123 = vcombine_f32(vl01, vl23); - vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); - vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); - const float32x4_t vl4567 = vcombine_f32(vl45, vl67); - vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx89 >> 32)], vl89, 1); - vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxAB >> 32)], vlAB, 1); - const float32x4_t vl89AB = vcombine_f32(vl89, vlAB); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - - float32x4_t vt0123 = vfmaq_f32(vx0123, vn0123, vminus_ln2); - float32x4_t vt4567 = vfmaq_f32(vx4567, vn4567, vminus_ln2); - float32x4_t vt89AB = vfmaq_f32(vx89AB, vn89AB, vminus_ln2); - - float32x4_t vp0123 = vmulq_f32(vt0123, vc2); - float32x4_t vp4567 = vmulq_f32(vt4567, vc2); - float32x4_t vp89AB = vmulq_f32(vt89AB, vc2); - - vp0123 = vfmaq_f32(vt0123, vt0123, vp0123); - vp4567 = vfmaq_f32(vt4567, vt4567, vp4567); - vp89AB = vfmaq_f32(vt89AB, vt89AB, vp89AB); - - float32x4_t vf0123 = vfmaq_f32(vs0123, vs0123, vp0123); - float32x4_t vf4567 = 
vfmaq_f32(vs4567, vs4567, vp4567); - float32x4_t vf89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - vacc0 = vaddq_f32(vacc0, vf89AB); - } - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vfmaq_f32(vt, vt, vp); - - float32x4_t vf = vfmaq_f32(vs, vs, vp); - - vf = 
vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vfmaq_f32(vt, vt, vp); - - float32x4_t vf = vfmaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - 
vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u16.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u16.c deleted file mode 100644 index 8c486f32615..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u16.c +++ /dev/null @@ -1,254 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64]; - -void xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p17f); - const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFF0Ap-2f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vindex_mask); - 
XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2 = vmovq_n_f32(-0x1.62E430p-1f); - XNN_FORCE_REALIZATION(vminus_ln2); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - const float32x4_t viCDEF = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - const float32x4_t vxCDEF = vsubq_f32(viCDEF, vi_max); - - float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vx89AB, vlog2e); - float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vxCDEF, vlog2e); - - const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); - const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); - const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); - const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); - const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); - const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); - const uint64x2_t vidx89AB = 
vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask)); - const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0); - const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1); - const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask)); - const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0); - const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1); - - float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]); - float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]); - float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx45]); - float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx67]); - float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx89]); - float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxAB]); - float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxCD]); - float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxEF]); - - vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); - vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); - const float32x4_t vl0123 = vcombine_f32(vl01, vl23); - vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); - vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); - const float32x4_t vl4567 = vcombine_f32(vl45, vl67); - vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx89 >> 32)], vl89, 1); - vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxAB >> 32)], vlAB, 1); - const float32x4_t vl89AB = vcombine_f32(vl89, vlAB); - vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxCD >> 32)], vlCD, 1); - vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxEF >> 32)], vlEF, 1); - const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF); - - const float32x4_t 
vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB)); - const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); - - float32x4_t vt0123 = vfmaq_f32(vx0123, vn0123, vminus_ln2); - float32x4_t vt4567 = vfmaq_f32(vx4567, vn4567, vminus_ln2); - float32x4_t vt89AB = vfmaq_f32(vx89AB, vn89AB, vminus_ln2); - float32x4_t vtCDEF = vfmaq_f32(vxCDEF, vnCDEF, vminus_ln2); - - float32x4_t vp0123 = vmulq_f32(vt0123, vc2); - float32x4_t vp4567 = vmulq_f32(vt4567, vc2); - float32x4_t vp89AB = vmulq_f32(vt89AB, vc2); - float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc2); - - vp0123 = vfmaq_f32(vt0123, vt0123, vp0123); - vp4567 = vfmaq_f32(vt4567, vt4567, vp4567); - vp89AB = vfmaq_f32(vt89AB, vt89AB, vp89AB); - vpCDEF = vfmaq_f32(vtCDEF, vtCDEF, vpCDEF); - - float32x4_t vf0123 = vfmaq_f32(vs0123, vs0123, vp0123); - float32x4_t vf4567 = vfmaq_f32(vs4567, vs4567, vp4567); - float32x4_t vf89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB); - float32x4_t vfCDEF = vfmaq_f32(vsCDEF, vsCDEF, vpCDEF); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcltq_f32(vxCDEF, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - 
vst1q_f32(output, vfCDEF); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - vacc0 = vaddq_f32(vacc0, vf89AB); - vacc0 = vaddq_f32(vacc0, vfCDEF); - } - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vfmaq_f32(vt, vt, vp); - - float32x4_t vf = vfmaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn 
= vfmaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vfmaq_f32(vt, vt, vp); - - float32x4_t vf = vfmaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20-acc2.c deleted file mode 100644 
index d93fbf6072a..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20-acc2.c +++ /dev/null @@ -1,277 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64]; - -void xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p17f); - const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFF0Ap-2f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vindex_mask); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2 = vmovq_n_f32(-0x1.62E430p-1f); - XNN_FORCE_REALIZATION(vminus_ln2); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - float32x4_t vacc1 = vmovq_n_f32(0.0f); - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); 
input += 4; - const float32x4_t viCDEF = vld1q_f32(input); input += 4; - const float32x4_t viGHIJ = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - const float32x4_t vxCDEF = vsubq_f32(viCDEF, vi_max); - const float32x4_t vxGHIJ = vsubq_f32(viGHIJ, vi_max); - - float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vx89AB, vlog2e); - float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vxCDEF, vlog2e); - float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vxGHIJ, vlog2e); - - const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t veGHIJ = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnGHIJ), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); - const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); - const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); - const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); - const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); - const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); - const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask)); - const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0); - const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1); - const uint64x2_t vidxCDEF = 
vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask)); - const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0); - const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1); - const uint64x2_t vidxGHIJ = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnGHIJ), vindex_mask)); - const uint64_t vidxGH = vgetq_lane_u64(vidxGHIJ, 0); - const uint64_t vidxIJ = vgetq_lane_u64(vidxGHIJ, 1); - - float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]); - float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]); - float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx45]); - float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx67]); - float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx89]); - float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxAB]); - float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxCD]); - float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxEF]); - float32x2_t vlGH = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxGH]); - float32x2_t vlIJ = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxIJ]); - - vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); - vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); - const float32x4_t vl0123 = vcombine_f32(vl01, vl23); - vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); - vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); - const float32x4_t vl4567 = vcombine_f32(vl45, vl67); - vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx89 >> 32)], vl89, 1); - vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxAB >> 32)], vlAB, 1); - const float32x4_t vl89AB = vcombine_f32(vl89, vlAB); - vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxCD >> 32)], vlCD, 1); - 
vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxEF >> 32)], vlEF, 1); - const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF); - vlGH = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxGH >> 32)], vlGH, 1); - vlIJ = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxIJ >> 32)], vlIJ, 1); - const float32x4_t vlGHIJ = vcombine_f32(vlGH, vlIJ); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB)); - const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF)); - const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlGHIJ), veGHIJ)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); - vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias); - - float32x4_t vt0123 = vfmaq_f32(vx0123, vn0123, vminus_ln2); - float32x4_t vt4567 = vfmaq_f32(vx4567, vn4567, vminus_ln2); - float32x4_t vt89AB = vfmaq_f32(vx89AB, vn89AB, vminus_ln2); - float32x4_t vtCDEF = vfmaq_f32(vxCDEF, vnCDEF, vminus_ln2); - float32x4_t vtGHIJ = vfmaq_f32(vxGHIJ, vnGHIJ, vminus_ln2); - - float32x4_t vp0123 = vmulq_f32(vt0123, vc2); - float32x4_t vp4567 = vmulq_f32(vt4567, vc2); - float32x4_t vp89AB = vmulq_f32(vt89AB, vc2); - float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc2); - float32x4_t vpGHIJ = vmulq_f32(vtGHIJ, vc2); - - vp0123 = vfmaq_f32(vt0123, vt0123, vp0123); - vp4567 = vfmaq_f32(vt4567, vt4567, vp4567); - vp89AB = vfmaq_f32(vt89AB, vt89AB, vp89AB); - vpCDEF = vfmaq_f32(vtCDEF, vtCDEF, vpCDEF); - vpGHIJ = vfmaq_f32(vtGHIJ, vtGHIJ, vpGHIJ); - - float32x4_t vf0123 = vfmaq_f32(vs0123, vs0123, vp0123); - float32x4_t vf4567 = vfmaq_f32(vs4567, vs4567, vp4567); - 
float32x4_t vf89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB); - float32x4_t vfCDEF = vfmaq_f32(vsCDEF, vsCDEF, vpCDEF); - float32x4_t vfGHIJ = vfmaq_f32(vsGHIJ, vsGHIJ, vpGHIJ); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcltq_f32(vxCDEF, vdenorm_cutoff))); - vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcltq_f32(vxGHIJ, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - vst1q_f32(output, vfCDEF); output += 4; - vst1q_f32(output, vfGHIJ); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - vacc0 = vaddq_f32(vacc0, vf89AB); - vacc0 = vaddq_f32(vacc0, vfCDEF); - vacc0 = vaddq_f32(vacc0, vfGHIJ); - } - vacc0 = vaddq_f32(vacc0, vacc1); - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi 
= vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vfmaq_f32(vt, vt, vp); - - float32x4_t vf = vfmaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vfmaq_f32(vt, vt, vp); - - float32x4_t vf = 
vfmaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20-acc5.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20-acc5.c deleted file mode 100644 index e198eca585a..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20-acc5.c +++ /dev/null @@ -1,283 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64]; - -void xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20_acc5( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p17f); - const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFF0Ap-2f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vindex_mask); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2 = vmovq_n_f32(-0x1.62E430p-1f); - XNN_FORCE_REALIZATION(vminus_ln2); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - float32x4_t vacc1 = vmovq_n_f32(0.0f); - float32x4_t vacc2 = vmovq_n_f32(0.0f); - float32x4_t vacc3 = vmovq_n_f32(0.0f); - float32x4_t vacc4 = vmovq_n_f32(0.0f); - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - const float32x4_t viCDEF = vld1q_f32(input); input += 4; - const float32x4_t viGHIJ = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - const float32x4_t vxCDEF = 
vsubq_f32(viCDEF, vi_max); - const float32x4_t vxGHIJ = vsubq_f32(viGHIJ, vi_max); - - float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vx89AB, vlog2e); - float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vxCDEF, vlog2e); - float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vxGHIJ, vlog2e); - - const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t veGHIJ = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnGHIJ), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); - const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); - const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); - const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); - const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); - const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); - const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask)); - const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0); - const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1); - const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask)); - const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0); - const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1); - const uint64x2_t vidxGHIJ = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnGHIJ), vindex_mask)); - const uint64_t vidxGH = 
vgetq_lane_u64(vidxGHIJ, 0); - const uint64_t vidxIJ = vgetq_lane_u64(vidxGHIJ, 1); - - float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]); - float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]); - float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx45]); - float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx67]); - float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx89]); - float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxAB]); - float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxCD]); - float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxEF]); - float32x2_t vlGH = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxGH]); - float32x2_t vlIJ = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxIJ]); - - vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); - vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); - const float32x4_t vl0123 = vcombine_f32(vl01, vl23); - vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); - vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); - const float32x4_t vl4567 = vcombine_f32(vl45, vl67); - vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx89 >> 32)], vl89, 1); - vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxAB >> 32)], vlAB, 1); - const float32x4_t vl89AB = vcombine_f32(vl89, vlAB); - vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxCD >> 32)], vlCD, 1); - vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxEF >> 32)], vlEF, 1); - const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF); - vlGH = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxGH >> 32)], vlGH, 1); - vlIJ = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxIJ >> 32)], vlIJ, 1); - const 
float32x4_t vlGHIJ = vcombine_f32(vlGH, vlIJ); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB)); - const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF)); - const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlGHIJ), veGHIJ)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); - vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias); - - float32x4_t vt0123 = vfmaq_f32(vx0123, vn0123, vminus_ln2); - float32x4_t vt4567 = vfmaq_f32(vx4567, vn4567, vminus_ln2); - float32x4_t vt89AB = vfmaq_f32(vx89AB, vn89AB, vminus_ln2); - float32x4_t vtCDEF = vfmaq_f32(vxCDEF, vnCDEF, vminus_ln2); - float32x4_t vtGHIJ = vfmaq_f32(vxGHIJ, vnGHIJ, vminus_ln2); - - float32x4_t vp0123 = vmulq_f32(vt0123, vc2); - float32x4_t vp4567 = vmulq_f32(vt4567, vc2); - float32x4_t vp89AB = vmulq_f32(vt89AB, vc2); - float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc2); - float32x4_t vpGHIJ = vmulq_f32(vtGHIJ, vc2); - - vp0123 = vfmaq_f32(vt0123, vt0123, vp0123); - vp4567 = vfmaq_f32(vt4567, vt4567, vp4567); - vp89AB = vfmaq_f32(vt89AB, vt89AB, vp89AB); - vpCDEF = vfmaq_f32(vtCDEF, vtCDEF, vpCDEF); - vpGHIJ = vfmaq_f32(vtGHIJ, vtGHIJ, vpGHIJ); - - float32x4_t vf0123 = vfmaq_f32(vs0123, vs0123, vp0123); - float32x4_t vf4567 = vfmaq_f32(vs4567, vs4567, vp4567); - float32x4_t vf89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB); - float32x4_t vfCDEF = vfmaq_f32(vsCDEF, vsCDEF, vpCDEF); - float32x4_t vfGHIJ = vfmaq_f32(vsGHIJ, vsGHIJ, vpGHIJ); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = 
vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcltq_f32(vxCDEF, vdenorm_cutoff))); - vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcltq_f32(vxGHIJ, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - vst1q_f32(output, vfCDEF); output += 4; - vst1q_f32(output, vfGHIJ); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc4 = vaddq_f32(vacc4, vf4567); - vacc3 = vaddq_f32(vacc3, vf89AB); - vacc2 = vaddq_f32(vacc2, vfCDEF); - vacc1 = vaddq_f32(vacc1, vfGHIJ); - } - vacc0 = vaddq_f32(vacc0, vacc1); - vacc2 = vaddq_f32(vacc2, vacc3); - vacc0 = vaddq_f32(vacc0, vacc2); - vacc0 = vaddq_f32(vacc0, vacc4); - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = 
vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vfmaq_f32(vt, vt, vp); - - float32x4_t vf = vfmaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vfmaq_f32(vt, vt, vp); - - float32x4_t vf = vfmaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); 
- if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20.c deleted file mode 100644 index e3586d6e316..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u20.c +++ /dev/null @@ -1,275 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64]; - -void xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p17f); - const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFF0Ap-2f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vindex_mask); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2 = vmovq_n_f32(-0x1.62E430p-1f); - XNN_FORCE_REALIZATION(vminus_ln2); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - const float32x4_t viCDEF = vld1q_f32(input); input += 4; - const float32x4_t viGHIJ = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - const float32x4_t vxCDEF = vsubq_f32(viCDEF, vi_max); - const float32x4_t vxGHIJ = vsubq_f32(viGHIJ, vi_max); - - float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = 
vfmaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vx89AB, vlog2e); - float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vxCDEF, vlog2e); - float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vxGHIJ, vlog2e); - - const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t veGHIJ = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnGHIJ), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); - const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); - const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); - const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); - const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); - const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); - const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask)); - const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0); - const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1); - const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask)); - const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0); - const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1); - const uint64x2_t vidxGHIJ = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnGHIJ), vindex_mask)); - const uint64_t vidxGH = vgetq_lane_u64(vidxGHIJ, 0); - const uint64_t vidxIJ = vgetq_lane_u64(vidxGHIJ, 1); - - float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]); - float32x2_t vl23 = 
vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]); - float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx45]); - float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx67]); - float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx89]); - float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxAB]); - float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxCD]); - float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxEF]); - float32x2_t vlGH = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxGH]); - float32x2_t vlIJ = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxIJ]); - - vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); - vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); - const float32x4_t vl0123 = vcombine_f32(vl01, vl23); - vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); - vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); - const float32x4_t vl4567 = vcombine_f32(vl45, vl67); - vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx89 >> 32)], vl89, 1); - vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxAB >> 32)], vlAB, 1); - const float32x4_t vl89AB = vcombine_f32(vl89, vlAB); - vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxCD >> 32)], vlCD, 1); - vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxEF >> 32)], vlEF, 1); - const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF); - vlGH = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxGH >> 32)], vlGH, 1); - vlIJ = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxIJ >> 32)], vlIJ, 1); - const float32x4_t vlGHIJ = vcombine_f32(vlGH, vlIJ); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); - const float32x4_t vs4567 = 
vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB)); - const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF)); - const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlGHIJ), veGHIJ)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); - vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias); - - float32x4_t vt0123 = vfmaq_f32(vx0123, vn0123, vminus_ln2); - float32x4_t vt4567 = vfmaq_f32(vx4567, vn4567, vminus_ln2); - float32x4_t vt89AB = vfmaq_f32(vx89AB, vn89AB, vminus_ln2); - float32x4_t vtCDEF = vfmaq_f32(vxCDEF, vnCDEF, vminus_ln2); - float32x4_t vtGHIJ = vfmaq_f32(vxGHIJ, vnGHIJ, vminus_ln2); - - float32x4_t vp0123 = vmulq_f32(vt0123, vc2); - float32x4_t vp4567 = vmulq_f32(vt4567, vc2); - float32x4_t vp89AB = vmulq_f32(vt89AB, vc2); - float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc2); - float32x4_t vpGHIJ = vmulq_f32(vtGHIJ, vc2); - - vp0123 = vfmaq_f32(vt0123, vt0123, vp0123); - vp4567 = vfmaq_f32(vt4567, vt4567, vp4567); - vp89AB = vfmaq_f32(vt89AB, vt89AB, vp89AB); - vpCDEF = vfmaq_f32(vtCDEF, vtCDEF, vpCDEF); - vpGHIJ = vfmaq_f32(vtGHIJ, vtGHIJ, vpGHIJ); - - float32x4_t vf0123 = vfmaq_f32(vs0123, vs0123, vp0123); - float32x4_t vf4567 = vfmaq_f32(vs4567, vs4567, vp4567); - float32x4_t vf89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB); - float32x4_t vfCDEF = vfmaq_f32(vsCDEF, vsCDEF, vpCDEF); - float32x4_t vfGHIJ = vfmaq_f32(vsGHIJ, vsGHIJ, vpGHIJ); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, 
vdenorm_cutoff))); - vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcltq_f32(vxCDEF, vdenorm_cutoff))); - vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcltq_f32(vxGHIJ, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - vst1q_f32(output, vfCDEF); output += 4; - vst1q_f32(output, vfGHIJ); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - vacc0 = vaddq_f32(vacc0, vf89AB); - vacc0 = vaddq_f32(vacc0, vfCDEF); - vacc0 = vaddq_f32(vacc0, vfGHIJ); - } - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vfmaq_f32(vt, vt, vp); - - float32x4_t vf = vfmaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, 
vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vfmaq_f32(vt, vt, vp); - - float32x4_t vf = vfmaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 
0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u8.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u8.c deleted file mode 100644 index 431c9e8dc31..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-u8.c +++ /dev/null @@ -1,212 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64]; - -void xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u8( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p17f); - const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFF0Ap-2f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vindex_mask); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2 = 
vmovq_n_f32(-0x1.62E430p-1f); - XNN_FORCE_REALIZATION(vminus_ln2); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - - float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vx4567, vlog2e); - - const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x3F))), 17); - const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); - const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); - const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); - const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); - const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); - const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); - - float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]); - float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]); - float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx45]); - float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx67]); - - vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); - vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); - const float32x4_t vl0123 = vcombine_f32(vl01, vl23); - vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); - vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); - const 
float32x4_t vl4567 = vcombine_f32(vl45, vl67); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - - float32x4_t vt0123 = vfmaq_f32(vx0123, vn0123, vminus_ln2); - float32x4_t vt4567 = vfmaq_f32(vx4567, vn4567, vminus_ln2); - - float32x4_t vp0123 = vmulq_f32(vt0123, vc2); - float32x4_t vp4567 = vmulq_f32(vt4567, vc2); - - vp0123 = vfmaq_f32(vt0123, vt0123, vp0123); - vp4567 = vfmaq_f32(vt4567, vt4567, vp4567); - - float32x4_t vf0123 = vfmaq_f32(vs0123, vs0123, vp0123); - float32x4_t vf4567 = vfmaq_f32(vs4567, vs4567, vp4567); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - } - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi 
= vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vfmaq_f32(vt, vt, vp); - - float32x4_t vf = vfmaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); - - const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); - const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); - const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); - float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); - float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); - vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); - vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); - const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); - const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vmulq_f32(vt, vc2); - vp = vfmaq_f32(vt, vt, vp); - - float32x4_t vf = 
vfmaq_f32(vs, vs, vp); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12-acc2.c deleted file mode 100644 index f80fd7efc67..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12-acc2.c +++ /dev/null @@ -1,206 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f); - const float32x4_t vc5 = vmovq_n_f32(0x1.0F9F9Cp-7f); - const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f); - const float32x4_t vc3 = vmovq_n_f32(0x1.555A80p-3f); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f); - const float32x4_t vc1 = vmovq_n_f32(0x1.FFFFF6p-1f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2 = vmovq_n_f32(-0x1.62E430p-1f); - XNN_FORCE_REALIZATION(vminus_ln2); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - float32x4_t vacc1 = vmovq_n_f32(0.0f); - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - - float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = 
vfmaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vx89AB, vlog2e); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - - float32x4_t vt0123 = vfmaq_f32(vx0123, vn0123, vminus_ln2); - float32x4_t vt4567 = vfmaq_f32(vx4567, vn4567, vminus_ln2); - float32x4_t vt89AB = vfmaq_f32(vx89AB, vn89AB, vminus_ln2); - - float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123); - float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567); - float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB); - - vp0123 = vfmaq_f32(vc3, vp0123, vt0123); - vp4567 = vfmaq_f32(vc3, vp4567, vt4567); - vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB); - - vp0123 = vfmaq_f32(vc2, vp0123, vt0123); - vp4567 = vfmaq_f32(vc2, vp4567, vt4567); - vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB); - - vp0123 = vfmaq_f32(vc1, vp0123, vt0123); - vp4567 = vfmaq_f32(vc1, vp4567, vt4567); - vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB); - - vt0123 = vmulq_f32(vt0123, vs0123); - vt4567 = vmulq_f32(vt4567, vs4567); - vt89AB = vmulq_f32(vt89AB, vs89AB); - - float32x4_t vf0123 = vfmaq_f32(vs0123, vp0123, vt0123); - float32x4_t vf4567 = vfmaq_f32(vs4567, vp4567, vt4567); - float32x4_t vf89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - 
vst1q_f32(output, vf89AB); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - vacc0 = vaddq_f32(vacc0, vf89AB); - } - vacc0 = vaddq_f32(vacc0, vacc1); - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vfmaq_f32(vc4, vc5, vt); - vp = vfmaq_f32(vc3, vp, vt); - vp = vfmaq_f32(vc2, vp, vt); - vp = vfmaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vfmaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vfmaq_f32(vc4, vc5, vt); - vp = vfmaq_f32(vc3, vp, vt); - vp = vfmaq_f32(vc2, vp, vt); - vp = vfmaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vfmaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - 
vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12-acc3.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12-acc3.c deleted file mode 100644 index c270d0f1e8e..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12-acc3.c +++ /dev/null @@ -1,208 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12_acc3( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f); - const float32x4_t vc5 = vmovq_n_f32(0x1.0F9F9Cp-7f); - const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f); - const float32x4_t vc3 = vmovq_n_f32(0x1.555A80p-3f); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f); - const float32x4_t vc1 = vmovq_n_f32(0x1.FFFFF6p-1f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2 = vmovq_n_f32(-0x1.62E430p-1f); - XNN_FORCE_REALIZATION(vminus_ln2); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - float32x4_t vacc1 = vmovq_n_f32(0.0f); - float32x4_t vacc2 = vmovq_n_f32(0.0f); - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - - float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vx0123, vlog2e); - 
float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vx89AB, vlog2e); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - - float32x4_t vt0123 = vfmaq_f32(vx0123, vn0123, vminus_ln2); - float32x4_t vt4567 = vfmaq_f32(vx4567, vn4567, vminus_ln2); - float32x4_t vt89AB = vfmaq_f32(vx89AB, vn89AB, vminus_ln2); - - float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123); - float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567); - float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB); - - vp0123 = vfmaq_f32(vc3, vp0123, vt0123); - vp4567 = vfmaq_f32(vc3, vp4567, vt4567); - vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB); - - vp0123 = vfmaq_f32(vc2, vp0123, vt0123); - vp4567 = vfmaq_f32(vc2, vp4567, vt4567); - vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB); - - vp0123 = vfmaq_f32(vc1, vp0123, vt0123); - vp4567 = vfmaq_f32(vc1, vp4567, vt4567); - vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB); - - vt0123 = vmulq_f32(vt0123, vs0123); - vt4567 = vmulq_f32(vt4567, vs4567); - vt89AB = vmulq_f32(vt89AB, vs89AB); - - float32x4_t vf0123 = vfmaq_f32(vs0123, vp0123, vt0123); - float32x4_t vf4567 = vfmaq_f32(vs4567, vp4567, vt4567); - float32x4_t vf89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); 
output += 4; - vst1q_f32(output, vf89AB); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc1 = vaddq_f32(vacc1, vf4567); - vacc2 = vaddq_f32(vacc2, vf89AB); - } - vacc0 = vaddq_f32(vacc0, vacc1); - vacc0 = vaddq_f32(vacc0, vacc2); - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vfmaq_f32(vc4, vc5, vt); - vp = vfmaq_f32(vc3, vp, vt); - vp = vfmaq_f32(vc2, vp, vt); - vp = vfmaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vfmaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vfmaq_f32(vc4, vc5, vt); - vp = vfmaq_f32(vc3, vp, vt); - vp = vfmaq_f32(vc2, vp, vt); - vp = vfmaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vfmaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = 
vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12.c deleted file mode 100644 index 2cd755b5010..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u12.c +++ /dev/null @@ -1,204 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f); - const float32x4_t vc5 = vmovq_n_f32(0x1.0F9F9Cp-7f); - const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f); - const float32x4_t vc3 = vmovq_n_f32(0x1.555A80p-3f); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f); - const float32x4_t vc1 = vmovq_n_f32(0x1.FFFFF6p-1f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2 = vmovq_n_f32(-0x1.62E430p-1f); - XNN_FORCE_REALIZATION(vminus_ln2); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - - float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = 
vfmaq_f32(vmagic_bias, vx89AB, vlog2e); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - - float32x4_t vt0123 = vfmaq_f32(vx0123, vn0123, vminus_ln2); - float32x4_t vt4567 = vfmaq_f32(vx4567, vn4567, vminus_ln2); - float32x4_t vt89AB = vfmaq_f32(vx89AB, vn89AB, vminus_ln2); - - float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123); - float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567); - float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB); - - vp0123 = vfmaq_f32(vc3, vp0123, vt0123); - vp4567 = vfmaq_f32(vc3, vp4567, vt4567); - vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB); - - vp0123 = vfmaq_f32(vc2, vp0123, vt0123); - vp4567 = vfmaq_f32(vc2, vp4567, vt4567); - vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB); - - vp0123 = vfmaq_f32(vc1, vp0123, vt0123); - vp4567 = vfmaq_f32(vc1, vp4567, vt4567); - vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB); - - vt0123 = vmulq_f32(vt0123, vs0123); - vt4567 = vmulq_f32(vt4567, vs4567); - vt89AB = vmulq_f32(vt89AB, vs89AB); - - float32x4_t vf0123 = vfmaq_f32(vs0123, vp0123, vt0123); - float32x4_t vf4567 = vfmaq_f32(vs4567, vp4567, vt4567); - float32x4_t vf89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - - vacc0 = vaddq_f32(vacc0, 
vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - vacc0 = vaddq_f32(vacc0, vf89AB); - } - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vfmaq_f32(vc4, vc5, vt); - vp = vfmaq_f32(vc3, vp, vt); - vp = vfmaq_f32(vc2, vp, vt); - vp = vfmaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vfmaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vfmaq_f32(vc4, vc5, vt); - vp = vfmaq_f32(vc3, vp, vt); - vp = vfmaq_f32(vc2, vp, vt); - vp = vfmaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vfmaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - 
vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u16.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u16.c deleted file mode 100644 index 722ef83ea9e..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u16.c +++ /dev/null @@ -1,219 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f); - const float32x4_t vc5 = vmovq_n_f32(0x1.0F9F9Cp-7f); - const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f); - const float32x4_t vc3 = vmovq_n_f32(0x1.555A80p-3f); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f); - const float32x4_t vc1 = vmovq_n_f32(0x1.FFFFF6p-1f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2 = vmovq_n_f32(-0x1.62E430p-1f); - XNN_FORCE_REALIZATION(vminus_ln2); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - const float32x4_t viCDEF = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - const float32x4_t vxCDEF = vsubq_f32(viCDEF, vi_max); - - float32x4_t vn0123 = 
vfmaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vx89AB, vlog2e); - float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vxCDEF, vlog2e); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23)); - const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); - - float32x4_t vt0123 = vfmaq_f32(vx0123, vn0123, vminus_ln2); - float32x4_t vt4567 = vfmaq_f32(vx4567, vn4567, vminus_ln2); - float32x4_t vt89AB = vfmaq_f32(vx89AB, vn89AB, vminus_ln2); - float32x4_t vtCDEF = vfmaq_f32(vxCDEF, vnCDEF, vminus_ln2); - - float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123); - float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567); - float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB); - float32x4_t vpCDEF = vfmaq_f32(vc4, vc5, vtCDEF); - - vp0123 = vfmaq_f32(vc3, vp0123, vt0123); - vp4567 = vfmaq_f32(vc3, vp4567, vt4567); - vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB); - vpCDEF = vfmaq_f32(vc3, vpCDEF, vtCDEF); - - vp0123 = vfmaq_f32(vc2, vp0123, vt0123); - vp4567 = vfmaq_f32(vc2, vp4567, vt4567); - vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB); - vpCDEF = vfmaq_f32(vc2, vpCDEF, vtCDEF); - - vp0123 = vfmaq_f32(vc1, vp0123, vt0123); - vp4567 = vfmaq_f32(vc1, vp4567, vt4567); - vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB); - vpCDEF = vfmaq_f32(vc1, vpCDEF, vtCDEF); - - vt0123 = vmulq_f32(vt0123, vs0123); - vt4567 = vmulq_f32(vt4567, vs4567); - vt89AB = vmulq_f32(vt89AB, vs89AB); - vtCDEF = vmulq_f32(vtCDEF, vsCDEF); - - float32x4_t vf0123 = vfmaq_f32(vs0123, vp0123, 
vt0123); - float32x4_t vf4567 = vfmaq_f32(vs4567, vp4567, vt4567); - float32x4_t vf89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB); - float32x4_t vfCDEF = vfmaq_f32(vsCDEF, vpCDEF, vtCDEF); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcltq_f32(vxCDEF, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - vst1q_f32(output, vfCDEF); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - vacc0 = vaddq_f32(vacc0, vf89AB); - vacc0 = vaddq_f32(vacc0, vfCDEF); - } - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vfmaq_f32(vc4, vc5, vt); - vp = vfmaq_f32(vc3, vp, vt); - vp = vfmaq_f32(vc2, vp, vt); - vp = vfmaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vfmaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 
3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vfmaq_f32(vc4, vc5, vt); - vp = vfmaq_f32(vc3, vp, vt); - vp = vfmaq_f32(vc2, vp, vt); - vp = vfmaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vfmaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20-acc2.c deleted file mode 100644 index 54334a8134b..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20-acc2.c +++ /dev/null @@ -1,236 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f); - const float32x4_t vc5 = vmovq_n_f32(0x1.0F9F9Cp-7f); - const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f); - const float32x4_t vc3 = vmovq_n_f32(0x1.555A80p-3f); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f); - const float32x4_t vc1 = vmovq_n_f32(0x1.FFFFF6p-1f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2 = vmovq_n_f32(-0x1.62E430p-1f); - XNN_FORCE_REALIZATION(vminus_ln2); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - float32x4_t vacc1 = vmovq_n_f32(0.0f); - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - const float32x4_t viCDEF = vld1q_f32(input); input += 4; - const float32x4_t viGHIJ = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = 
vsubq_f32(vi89AB, vi_max); - const float32x4_t vxCDEF = vsubq_f32(viCDEF, vi_max); - const float32x4_t vxGHIJ = vsubq_f32(viGHIJ, vi_max); - - float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vx89AB, vlog2e); - float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vxCDEF, vlog2e); - float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vxGHIJ, vlog2e); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23)); - const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23)); - const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnGHIJ), 23)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); - vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias); - - float32x4_t vt0123 = vfmaq_f32(vx0123, vn0123, vminus_ln2); - float32x4_t vt4567 = vfmaq_f32(vx4567, vn4567, vminus_ln2); - float32x4_t vt89AB = vfmaq_f32(vx89AB, vn89AB, vminus_ln2); - float32x4_t vtCDEF = vfmaq_f32(vxCDEF, vnCDEF, vminus_ln2); - float32x4_t vtGHIJ = vfmaq_f32(vxGHIJ, vnGHIJ, vminus_ln2); - - float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123); - float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567); - float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB); - float32x4_t vpCDEF = vfmaq_f32(vc4, vc5, vtCDEF); - float32x4_t vpGHIJ = vfmaq_f32(vc4, vc5, vtGHIJ); - - vp0123 = vfmaq_f32(vc3, vp0123, vt0123); - vp4567 = vfmaq_f32(vc3, vp4567, vt4567); - vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB); - vpCDEF = vfmaq_f32(vc3, vpCDEF, vtCDEF); - vpGHIJ = vfmaq_f32(vc3, vpGHIJ, vtGHIJ); - - vp0123 = vfmaq_f32(vc2, 
vp0123, vt0123); - vp4567 = vfmaq_f32(vc2, vp4567, vt4567); - vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB); - vpCDEF = vfmaq_f32(vc2, vpCDEF, vtCDEF); - vpGHIJ = vfmaq_f32(vc2, vpGHIJ, vtGHIJ); - - vp0123 = vfmaq_f32(vc1, vp0123, vt0123); - vp4567 = vfmaq_f32(vc1, vp4567, vt4567); - vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB); - vpCDEF = vfmaq_f32(vc1, vpCDEF, vtCDEF); - vpGHIJ = vfmaq_f32(vc1, vpGHIJ, vtGHIJ); - - vt0123 = vmulq_f32(vt0123, vs0123); - vt4567 = vmulq_f32(vt4567, vs4567); - vt89AB = vmulq_f32(vt89AB, vs89AB); - vtCDEF = vmulq_f32(vtCDEF, vsCDEF); - vtGHIJ = vmulq_f32(vtGHIJ, vsGHIJ); - - float32x4_t vf0123 = vfmaq_f32(vs0123, vp0123, vt0123); - float32x4_t vf4567 = vfmaq_f32(vs4567, vp4567, vt4567); - float32x4_t vf89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB); - float32x4_t vfCDEF = vfmaq_f32(vsCDEF, vpCDEF, vtCDEF); - float32x4_t vfGHIJ = vfmaq_f32(vsGHIJ, vpGHIJ, vtGHIJ); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcltq_f32(vxCDEF, vdenorm_cutoff))); - vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcltq_f32(vxGHIJ, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - vst1q_f32(output, vfCDEF); output += 4; - vst1q_f32(output, vfGHIJ); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - vacc0 = vaddq_f32(vacc0, vf89AB); - vacc0 = vaddq_f32(vacc0, vfCDEF); - vacc0 = vaddq_f32(vacc0, vfGHIJ); - } - vacc0 = vaddq_f32(vacc0, vacc1); - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi 
= vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vfmaq_f32(vc4, vc5, vt); - vp = vfmaq_f32(vc3, vp, vt); - vp = vfmaq_f32(vc2, vp, vt); - vp = vfmaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vfmaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vfmaq_f32(vc4, vc5, vt); - vp = vfmaq_f32(vc3, vp, vt); - vp = vfmaq_f32(vc2, vp, vt); - vp = vfmaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vfmaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 
0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20-acc5.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20-acc5.c deleted file mode 100644 index 85b3185da49..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20-acc5.c +++ /dev/null @@ -1,242 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20_acc5( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f); - const float32x4_t vc5 = vmovq_n_f32(0x1.0F9F9Cp-7f); - const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f); - const float32x4_t vc3 = vmovq_n_f32(0x1.555A80p-3f); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f); - const float32x4_t vc1 = vmovq_n_f32(0x1.FFFFF6p-1f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); 
- XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2 = vmovq_n_f32(-0x1.62E430p-1f); - XNN_FORCE_REALIZATION(vminus_ln2); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - float32x4_t vacc1 = vmovq_n_f32(0.0f); - float32x4_t vacc2 = vmovq_n_f32(0.0f); - float32x4_t vacc3 = vmovq_n_f32(0.0f); - float32x4_t vacc4 = vmovq_n_f32(0.0f); - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - const float32x4_t viCDEF = vld1q_f32(input); input += 4; - const float32x4_t viGHIJ = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - const float32x4_t vxCDEF = vsubq_f32(viCDEF, vi_max); - const float32x4_t vxGHIJ = vsubq_f32(viGHIJ, vi_max); - - float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vx89AB, vlog2e); - float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vxCDEF, vlog2e); - float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vxGHIJ, vlog2e); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23)); - const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23)); - const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnGHIJ), 23)); - - vn0123 = vsubq_f32(vn0123, 
vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); - vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias); - - float32x4_t vt0123 = vfmaq_f32(vx0123, vn0123, vminus_ln2); - float32x4_t vt4567 = vfmaq_f32(vx4567, vn4567, vminus_ln2); - float32x4_t vt89AB = vfmaq_f32(vx89AB, vn89AB, vminus_ln2); - float32x4_t vtCDEF = vfmaq_f32(vxCDEF, vnCDEF, vminus_ln2); - float32x4_t vtGHIJ = vfmaq_f32(vxGHIJ, vnGHIJ, vminus_ln2); - - float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123); - float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567); - float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB); - float32x4_t vpCDEF = vfmaq_f32(vc4, vc5, vtCDEF); - float32x4_t vpGHIJ = vfmaq_f32(vc4, vc5, vtGHIJ); - - vp0123 = vfmaq_f32(vc3, vp0123, vt0123); - vp4567 = vfmaq_f32(vc3, vp4567, vt4567); - vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB); - vpCDEF = vfmaq_f32(vc3, vpCDEF, vtCDEF); - vpGHIJ = vfmaq_f32(vc3, vpGHIJ, vtGHIJ); - - vp0123 = vfmaq_f32(vc2, vp0123, vt0123); - vp4567 = vfmaq_f32(vc2, vp4567, vt4567); - vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB); - vpCDEF = vfmaq_f32(vc2, vpCDEF, vtCDEF); - vpGHIJ = vfmaq_f32(vc2, vpGHIJ, vtGHIJ); - - vp0123 = vfmaq_f32(vc1, vp0123, vt0123); - vp4567 = vfmaq_f32(vc1, vp4567, vt4567); - vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB); - vpCDEF = vfmaq_f32(vc1, vpCDEF, vtCDEF); - vpGHIJ = vfmaq_f32(vc1, vpGHIJ, vtGHIJ); - - vt0123 = vmulq_f32(vt0123, vs0123); - vt4567 = vmulq_f32(vt4567, vs4567); - vt89AB = vmulq_f32(vt89AB, vs89AB); - vtCDEF = vmulq_f32(vtCDEF, vsCDEF); - vtGHIJ = vmulq_f32(vtGHIJ, vsGHIJ); - - float32x4_t vf0123 = vfmaq_f32(vs0123, vp0123, vt0123); - float32x4_t vf4567 = vfmaq_f32(vs4567, vp4567, vt4567); - float32x4_t vf89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB); - float32x4_t vfCDEF = vfmaq_f32(vsCDEF, vpCDEF, vtCDEF); - float32x4_t vfGHIJ = vfmaq_f32(vsGHIJ, vpGHIJ, vtGHIJ); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, 
vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcltq_f32(vxCDEF, vdenorm_cutoff))); - vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcltq_f32(vxGHIJ, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - vst1q_f32(output, vfCDEF); output += 4; - vst1q_f32(output, vfGHIJ); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc4 = vaddq_f32(vacc4, vf4567); - vacc3 = vaddq_f32(vacc3, vf89AB); - vacc2 = vaddq_f32(vacc2, vfCDEF); - vacc1 = vaddq_f32(vacc1, vfGHIJ); - } - vacc0 = vaddq_f32(vacc0, vacc1); - vacc2 = vaddq_f32(vacc2, vacc3); - vacc0 = vaddq_f32(vacc0, vacc2); - vacc0 = vaddq_f32(vacc0, vacc4); - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vfmaq_f32(vc4, vc5, vt); - vp = vfmaq_f32(vc3, vp, vt); - vp = vfmaq_f32(vc2, vp, vt); - vp = vfmaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vfmaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - 
assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vfmaq_f32(vc4, vc5, vt); - vp = vfmaq_f32(vc3, vp, vt); - vp = vfmaq_f32(vc2, vp, vt); - vp = vfmaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vfmaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20.c deleted file mode 100644 index 20d1e6ddfd7..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u20.c +++ /dev/null @@ -1,234 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f); - const float32x4_t vc5 = vmovq_n_f32(0x1.0F9F9Cp-7f); - const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f); - const float32x4_t vc3 = vmovq_n_f32(0x1.555A80p-3f); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f); - const float32x4_t vc1 = vmovq_n_f32(0x1.FFFFF6p-1f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2 = vmovq_n_f32(-0x1.62E430p-1f); - XNN_FORCE_REALIZATION(vminus_ln2); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - const float32x4_t vi89AB = vld1q_f32(input); input += 4; - const float32x4_t viCDEF = vld1q_f32(input); input += 4; - const float32x4_t viGHIJ = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); - const float32x4_t vxCDEF = 
vsubq_f32(viCDEF, vi_max); - const float32x4_t vxGHIJ = vsubq_f32(viGHIJ, vi_max); - - float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vx4567, vlog2e); - float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vx89AB, vlog2e); - float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vxCDEF, vlog2e); - float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vxGHIJ, vlog2e); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); - const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23)); - const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23)); - const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnGHIJ), 23)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - vn89AB = vsubq_f32(vn89AB, vmagic_bias); - vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); - vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias); - - float32x4_t vt0123 = vfmaq_f32(vx0123, vn0123, vminus_ln2); - float32x4_t vt4567 = vfmaq_f32(vx4567, vn4567, vminus_ln2); - float32x4_t vt89AB = vfmaq_f32(vx89AB, vn89AB, vminus_ln2); - float32x4_t vtCDEF = vfmaq_f32(vxCDEF, vnCDEF, vminus_ln2); - float32x4_t vtGHIJ = vfmaq_f32(vxGHIJ, vnGHIJ, vminus_ln2); - - float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123); - float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567); - float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB); - float32x4_t vpCDEF = vfmaq_f32(vc4, vc5, vtCDEF); - float32x4_t vpGHIJ = vfmaq_f32(vc4, vc5, vtGHIJ); - - vp0123 = vfmaq_f32(vc3, vp0123, vt0123); - vp4567 = vfmaq_f32(vc3, vp4567, vt4567); - vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB); - vpCDEF = vfmaq_f32(vc3, vpCDEF, vtCDEF); - vpGHIJ = vfmaq_f32(vc3, vpGHIJ, vtGHIJ); - - vp0123 = vfmaq_f32(vc2, vp0123, vt0123); - vp4567 = vfmaq_f32(vc2, vp4567, vt4567); - 
vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB); - vpCDEF = vfmaq_f32(vc2, vpCDEF, vtCDEF); - vpGHIJ = vfmaq_f32(vc2, vpGHIJ, vtGHIJ); - - vp0123 = vfmaq_f32(vc1, vp0123, vt0123); - vp4567 = vfmaq_f32(vc1, vp4567, vt4567); - vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB); - vpCDEF = vfmaq_f32(vc1, vpCDEF, vtCDEF); - vpGHIJ = vfmaq_f32(vc1, vpGHIJ, vtGHIJ); - - vt0123 = vmulq_f32(vt0123, vs0123); - vt4567 = vmulq_f32(vt4567, vs4567); - vt89AB = vmulq_f32(vt89AB, vs89AB); - vtCDEF = vmulq_f32(vtCDEF, vsCDEF); - vtGHIJ = vmulq_f32(vtGHIJ, vsGHIJ); - - float32x4_t vf0123 = vfmaq_f32(vs0123, vp0123, vt0123); - float32x4_t vf4567 = vfmaq_f32(vs4567, vp4567, vt4567); - float32x4_t vf89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB); - float32x4_t vfCDEF = vfmaq_f32(vsCDEF, vpCDEF, vtCDEF); - float32x4_t vfGHIJ = vfmaq_f32(vsGHIJ, vpGHIJ, vtGHIJ); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); - vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcltq_f32(vxCDEF, vdenorm_cutoff))); - vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcltq_f32(vxGHIJ, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - vst1q_f32(output, vf89AB); output += 4; - vst1q_f32(output, vfCDEF); output += 4; - vst1q_f32(output, vfGHIJ); output += 4; - - vacc0 = vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - vacc0 = vaddq_f32(vacc0, vf89AB); - vacc0 = vaddq_f32(vacc0, vfCDEF); - vacc0 = vaddq_f32(vacc0, vfGHIJ); - } - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - 
float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vfmaq_f32(vc4, vc5, vt); - vp = vfmaq_f32(vc3, vp, vt); - vp = vfmaq_f32(vc2, vp, vt); - vp = vfmaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vfmaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vfmaq_f32(vc4, vc5, vt); - vp = vfmaq_f32(vc3, vp, vt); - vp = vfmaq_f32(vc2, vp, vt); - vp = vfmaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vfmaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, 
vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u8.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u8.c deleted file mode 100644 index 26f99ff265a..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-u8.c +++ /dev/null @@ -1,189 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/neon-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u8( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float32x4_t vlog2e = vmovq_n_f32(0x1.715476p+0f); - const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f); - const float32x4_t vc5 = vmovq_n_f32(0x1.0F9F9Cp-7f); - const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f); - const float32x4_t vc3 = vmovq_n_f32(0x1.555A80p-3f); - const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f); - const float32x4_t vc1 = vmovq_n_f32(0x1.FFFFF6p-1f); - const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - 
XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const float32x4_t vminus_ln2 = vmovq_n_f32(-0x1.62E430p-1f); - XNN_FORCE_REALIZATION(vminus_ln2); - - const float32x4_t vi_max = vld1q_dup_f32(max); - - float32x4_t vacc0 = vmovq_n_f32(0.0f); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - const float32x4_t vi0123 = vld1q_f32(input); input += 4; - const float32x4_t vi4567 = vld1q_f32(input); input += 4; - - const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); - const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); - - float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vx0123, vlog2e); - float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vx4567, vlog2e); - - const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); - const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); - - vn0123 = vsubq_f32(vn0123, vmagic_bias); - vn4567 = vsubq_f32(vn4567, vmagic_bias); - - float32x4_t vt0123 = vfmaq_f32(vx0123, vn0123, vminus_ln2); - float32x4_t vt4567 = vfmaq_f32(vx4567, vn4567, vminus_ln2); - - float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123); - float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567); - - vp0123 = vfmaq_f32(vc3, vp0123, vt0123); - vp4567 = vfmaq_f32(vc3, vp4567, vt4567); - - vp0123 = vfmaq_f32(vc2, vp0123, vt0123); - vp4567 = vfmaq_f32(vc2, vp4567, vt4567); - - vp0123 = vfmaq_f32(vc1, vp0123, vt0123); - vp4567 = vfmaq_f32(vc1, vp4567, vt4567); - - vt0123 = vmulq_f32(vt0123, vs0123); - vt4567 = vmulq_f32(vt4567, vs4567); - - float32x4_t vf0123 = vfmaq_f32(vs0123, vp0123, vt0123); - float32x4_t vf4567 = vfmaq_f32(vs4567, vp4567, vt4567); - - vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); - vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); - - vst1q_f32(output, vf0123); output += 4; - vst1q_f32(output, vf4567); output += 4; - - vacc0 = 
vaddq_f32(vacc0, vf0123); - vacc0 = vaddq_f32(vacc0, vf4567); - } - - float32x4_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vfmaq_f32(vc4, vc5, vt); - vp = vfmaq_f32(vc3, vp, vt); - vp = vfmaq_f32(vc2, vp, vt); - vp = vfmaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vfmaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - vst1q_f32(output, vf); output += 4; - - vacc = vaddq_f32(vacc, vf); - } -#if XNN_ARCH_ARM64 - float vacc_lo = vaddvq_f32(vacc); -#else - float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); -#endif - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - const float32x4_t vi = vld1q_f32(input); input += 4; - - const float32x4_t vx = vsubq_f32(vi, vi_max); - - float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); - - const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); - - vn = vsubq_f32(vn, vmagic_bias); - - float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); - - float32x4_t vp = vfmaq_f32(vc4, vc5, vt); - vp = vfmaq_f32(vc3, vp, vt); - vp = vfmaq_f32(vc2, vp, vt); - vp = vfmaq_f32(vc1, vp, vt); - - vt = vmulq_f32(vt, vs); - float32x4_t vf = vfmaq_f32(vs, vp, vt); - - vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); - - float32x2_t vf_lo = vget_low_f32(vf); - if (batch & (2 * sizeof(float))) { - vst1_f32(output, vf_lo); output += 2; - - #if XNN_ARCH_ARM64 - vacc_lo += vaddv_f32(vf_lo); - #else - vacc_lo = 
vadd_f32(vacc_lo, vf_lo); - #endif - - vf_lo = vget_high_f32(vf); - } - if (batch & (1 * sizeof(float))) { - vst1_lane_f32(output, vf_lo, 0); - - #if XNN_ARCH_ARM64 - vacc_lo += vget_lane_f32(vf_lo, 0); - #else - vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); - #endif - } - } -#if XNN_ARCH_ARM64 - *sum = vacc_lo; -#else - vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); -#endif -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u2.c deleted file mode 100644 index 0b7b5c35cdd..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u2.c +++ /dev/null @@ -1,198 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/scalar-rr2-lut64-p2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -// Note redefine as uint32[] to avoid redundant bitcasts. 
-extern XNN_INTERNAL const uint32_t xnn_table_exp2_k_over_64[64]; - -void xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float vlog2e = 0x1.715476p0f; - const float vmagic_bias = 0x1.800000p17f; - const uint32_t vindex_mask = UINT32_C(0x3F); - const float vminus_ln2_hi = -0x1.630000p-1f; - const float vminus_ln2_lo = 0x1.BD0106p-13f; - const float vc2 = 0x1.FFFF0Ap-2f; - const float vdenorm_cutoff = -0x1.5D589Ep6f; - - const float vi_max = *max; - - float vacc0 = 0.0f; - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { - // Load 2 inputs at a time. - const float vi0 = input[0]; - const float vi1 = input[1]; - input += 2; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const float vx0 = vi0 - vi_max; - const float vx1 = vi1 - vi_max; - - // Compute reduced argument n := round(x * 64 / log(2)). - // We do it by adding a large number (magic bias), which cause rounding of the result to an integer, then subtracing - // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction. - // The trick with adding large number is valid only within certain bounds (|x * 64 / log(2)| <= 2**22, i.e. - // |x| <= 0x1.62E43p+15 = 45426.09375), but that is acceptable, because inputs outside of [-87.336540, 0.0] - // result in denormalized or underflown expf(x). We fixup the result for such inputs at the very end of the - // algorithm. - float vn0 = vx0 * vlog2e + vmagic_bias; - float vn1 = vx1 * vlog2e + vmagic_bias; - - // Create a floating-point number s (scale) such that s := 2**(n / 64) for such inputs that expf(x) is normalized, - // i.e. 
-87.33642 <= x <= 0.0. As n has 6 fractional bits, we split s == 2**(n / 64) = 2**e * 2**(n / 64 - e), where - // e := int(n / 64). We create s in two steps: - // 1. Fetch 2**(n / 64 - e) = 2**(n % 64) from the table using the 6 low bits of n, as integer. Note that the - // fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0. - // 2. Adjust fecthed value by addition of e to its floating-point exponent. The result is always a normalized - // number, because for -87.33642 <= x <= 0.0 (inputs for which expf(x) is normalized) we have -126 <= e <= 0, - // and thus the adjusted exponent is not lower than -126. - // - // Extract e from bits 6:14 of n and shift it into bits 23:31 (position of floating-point exponent). - const uint32_t ve0 = (float_as_uint32(vn0) & UINT32_C(0xFFFFFFC0)) << 17; - const uint32_t ve1 = (float_as_uint32(vn1) & UINT32_C(0xFFFFFFC0)) << 17; - - // Use bits 0:6 bits of n, as integer, as an index for table lookup of l := 2**(n % 64). - const uint32_t vidx0 = float_as_uint32(vn0) & vindex_mask; - const uint32_t vidx1 = float_as_uint32(vn1) & vindex_mask; - // Adjust exponent of the value l fetched from the table to get the final s value. - const float vs0 = uint32_as_float(xnn_table_exp2_k_over_64[vidx0] + ve0); - const float vs1 = uint32_as_float(xnn_table_exp2_k_over_64[vidx1] + ve1); - - // Subtract the large number back to get final n := round(x * 64 / log(2)) as a floating-point number. - vn0 -= vmagic_bias; - vn1 -= vmagic_bias; - - // Compute reduced argument t := x - n * log(2) / 64. - // Use Cody-Waite range reduction method (note the two constants representing log(2) / 64) to improve accuracy. - float vt0 = vn0 * vminus_ln2_hi + vx0; - float vt1 = vn1 * vminus_ln2_hi + vx1; - - vt0 = vn0 * vminus_ln2_lo + vt0; - vt1 = vn1 * vminus_ln2_lo + vt1; - - // Compute degree-2 polynomial approximation for exp(t) on [-log(2)/128, log(2)/128]. 
- float vp0 = vt0 * vc2; - float vp1 = vt1 * vc2; - - vp0 = vp0 * vt0 + vt0; - vp1 = vp1 * vt1 + vt1; - - // Reconstruct the final f value: - // f = s * (1 + t * (1 + t * c2)) - // = s * (1 + t + t * (t * c2)) - // = s + s * (t + t * (t * c2)) - // = s + s * p - float vf0 = vp0 * vs0 + vs0; - float vf1 = vp1 * vs1 + vs1; - - // For inputs below denormal cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - if XNN_UNPREDICTABLE(vx0 < vdenorm_cutoff) { - vf0 = 0.0f; - } - if XNN_UNPREDICTABLE(vx1 < vdenorm_cutoff) { - vf1 = 0.0f; - } - - // Store 2 outputs at a time. - output[0] = vf0; - output[1] = vf1; - output += 2; - - // Accumulate computed exponents. - vacc0 += vf0; - vacc0 += vf1; - } - - float vacc = vacc0; - for (; batch >= sizeof(float); batch -= sizeof(float)) { - // Load 1 input at a time. - const float vi = *input++; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const float vx = vi - vi_max; - - // Compute reduced argument n := round(x * 64 / log(2)). - // We do it by adding a large number (magic bias), which cause rounding of the result to an integer, then subtracing - // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction. - // The trick with adding large number is valid only within certain bounds (|x * 64 / log(2)| <= 2**22, i.e. - // |x| <= 0x1.62E43p+15 = 45426.09375), but that is acceptable, because inputs outside of [-87.336540, 0.0] - // result in denormalized or underflown expf(x). We fixup the result for such inputs at the very end of the - // algorithm. - float vn = vx * vlog2e + vmagic_bias; - - // Create a floating-point number s (scale) such that s := 2**(n / 64) for such inputs that expf(x) is normalized, - // i.e. -87.33642 <= x <= 0.0. As n has 6 fractional bits, we split s == 2**(n / 64) = 2**e * 2**(n / 64 - e), where - // e := int(n / 64). We create s in two steps: - // 1. 
Fetch 2**(n / 64 - e) = 2**(n % 64) from the table using the 6 low bits of n, as integer. Note that the - // fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0. - // 2. Adjust fecthed value by addition of e to its floating-point exponent. The result is always a normalized - // number, because for -87.33642 <= x <= 0.0 (inputs for which expf(x) is normalized) we have -126 <= e <= 0, - // and thus the adjusted exponent is not lower than -126. - // - // Extract e from bits 6:14 of n and shift it into bits 23:31 (position of floating-point exponent). - const uint32_t ve = (float_as_uint32(vn) & UINT32_C(0xFFFFFFC0)) << 17; - - // Use bits 0:6 bits of n, as integer, as an index for table lookup of l := 2**(n % 64). - const uint32_t vidx = float_as_uint32(vn) & vindex_mask; - // Adjust exponent of the value l fetched from the table to get the final s value. - const float vs = uint32_as_float(xnn_table_exp2_k_over_64[vidx] + ve); - - // Subtract the large number back to get final n := round(x * 64 / log(2)) as a floating-point number. - vn -= vmagic_bias; - - // Compute reduced argument t := x - n * log(2) / 64. - // Use Cody-Waite range reduction method (note the two constants representing log(2) / 64) to improve accuracy. - float vt = vn * vminus_ln2_hi + vx; - vt = vn * vminus_ln2_lo + vt; - - // Compute degree-2 polynomial approximation for exp(t) on [-log(2)/128, log(2)/128]. - float vp = vt * vc2; - vp = vp * vt + vt; - - // Reconstruct the final f value: - // f = s * (1 + t * (1 + t * c2)) - // = s * (1 + t + t * (t * c2)) - // = s + s * (t + t * (t * c2)) - // = s + s * p - float vf = vp * vs + vs; - - // For inputs below denormal cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) { - vf = 0.0f; - } - - // Store 1 output at a time. - *output++ = vf; - - // Accumulate computed exponents. 
- vacc += vf; - } - *sum = vacc; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u4.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u4.c deleted file mode 100644 index 1a27918f912..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-u4.c +++ /dev/null @@ -1,232 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/scalar-rr2-lut64-p2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -// Note redefine as uint32[] to avoid redundant bitcasts. -extern XNN_INTERNAL const uint32_t xnn_table_exp2_k_over_64[64]; - -void xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u4( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float vlog2e = 0x1.715476p0f; - const float vmagic_bias = 0x1.800000p17f; - const uint32_t vindex_mask = UINT32_C(0x3F); - const float vminus_ln2_hi = -0x1.630000p-1f; - const float vminus_ln2_lo = 0x1.BD0106p-13f; - const float vc2 = 0x1.FFFF0Ap-2f; - const float vdenorm_cutoff = -0x1.5D589Ep6f; - - const float vi_max = *max; - - float vacc0 = 0.0f; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const float vi0 = input[0]; - const float vi1 = input[1]; - const float vi2 = input[2]; - const float vi3 = input[3]; - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. 
- const float vx0 = vi0 - vi_max; - const float vx1 = vi1 - vi_max; - const float vx2 = vi2 - vi_max; - const float vx3 = vi3 - vi_max; - - // Compute reduced argument n := round(x * 64 / log(2)). - // We do it by adding a large number (magic bias), which cause rounding of the result to an integer, then subtracing - // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction. - // The trick with adding large number is valid only within certain bounds (|x * 64 / log(2)| <= 2**22, i.e. - // |x| <= 0x1.62E43p+15 = 45426.09375), but that is acceptable, because inputs outside of [-87.336540, 0.0] - // result in denormalized or underflown expf(x). We fixup the result for such inputs at the very end of the - // algorithm. - float vn0 = vx0 * vlog2e + vmagic_bias; - float vn1 = vx1 * vlog2e + vmagic_bias; - float vn2 = vx2 * vlog2e + vmagic_bias; - float vn3 = vx3 * vlog2e + vmagic_bias; - - // Create a floating-point number s (scale) such that s := 2**(n / 64) for such inputs that expf(x) is normalized, - // i.e. -87.33642 <= x <= 0.0. As n has 6 fractional bits, we split s == 2**(n / 64) = 2**e * 2**(n / 64 - e), where - // e := int(n / 64). We create s in two steps: - // 1. Fetch 2**(n / 64 - e) = 2**(n % 64) from the table using the 6 low bits of n, as integer. Note that the - // fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0. - // 2. Adjust fecthed value by addition of e to its floating-point exponent. The result is always a normalized - // number, because for -87.33642 <= x <= 0.0 (inputs for which expf(x) is normalized) we have -126 <= e <= 0, - // and thus the adjusted exponent is not lower than -126. - // - // Extract e from bits 6:14 of n and shift it into bits 23:31 (position of floating-point exponent). 
- const uint32_t ve0 = (float_as_uint32(vn0) & UINT32_C(0xFFFFFFC0)) << 17; - const uint32_t ve1 = (float_as_uint32(vn1) & UINT32_C(0xFFFFFFC0)) << 17; - const uint32_t ve2 = (float_as_uint32(vn2) & UINT32_C(0xFFFFFFC0)) << 17; - const uint32_t ve3 = (float_as_uint32(vn3) & UINT32_C(0xFFFFFFC0)) << 17; - - // Use bits 0:6 bits of n, as integer, as an index for table lookup of l := 2**(n % 64). - const uint32_t vidx0 = float_as_uint32(vn0) & vindex_mask; - const uint32_t vidx1 = float_as_uint32(vn1) & vindex_mask; - const uint32_t vidx2 = float_as_uint32(vn2) & vindex_mask; - const uint32_t vidx3 = float_as_uint32(vn3) & vindex_mask; - // Adjust exponent of the value l fetched from the table to get the final s value. - const float vs0 = uint32_as_float(xnn_table_exp2_k_over_64[vidx0] + ve0); - const float vs1 = uint32_as_float(xnn_table_exp2_k_over_64[vidx1] + ve1); - const float vs2 = uint32_as_float(xnn_table_exp2_k_over_64[vidx2] + ve2); - const float vs3 = uint32_as_float(xnn_table_exp2_k_over_64[vidx3] + ve3); - - // Subtract the large number back to get final n := round(x * 64 / log(2)) as a floating-point number. - vn0 -= vmagic_bias; - vn1 -= vmagic_bias; - vn2 -= vmagic_bias; - vn3 -= vmagic_bias; - - // Compute reduced argument t := x - n * log(2) / 64. - // Use Cody-Waite range reduction method (note the two constants representing log(2) / 64) to improve accuracy. - float vt0 = vn0 * vminus_ln2_hi + vx0; - float vt1 = vn1 * vminus_ln2_hi + vx1; - float vt2 = vn2 * vminus_ln2_hi + vx2; - float vt3 = vn3 * vminus_ln2_hi + vx3; - - vt0 = vn0 * vminus_ln2_lo + vt0; - vt1 = vn1 * vminus_ln2_lo + vt1; - vt2 = vn2 * vminus_ln2_lo + vt2; - vt3 = vn3 * vminus_ln2_lo + vt3; - - // Compute degree-2 polynomial approximation for exp(t) on [-log(2)/128, log(2)/128]. 
- float vp0 = vt0 * vc2; - float vp1 = vt1 * vc2; - float vp2 = vt2 * vc2; - float vp3 = vt3 * vc2; - - vp0 = vp0 * vt0 + vt0; - vp1 = vp1 * vt1 + vt1; - vp2 = vp2 * vt2 + vt2; - vp3 = vp3 * vt3 + vt3; - - // Reconstruct the final f value: - // f = s * (1 + t * (1 + t * c2)) - // = s * (1 + t + t * (t * c2)) - // = s + s * (t + t * (t * c2)) - // = s + s * p - float vf0 = vp0 * vs0 + vs0; - float vf1 = vp1 * vs1 + vs1; - float vf2 = vp2 * vs2 + vs2; - float vf3 = vp3 * vs3 + vs3; - - // For inputs below denormal cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - if XNN_UNPREDICTABLE(vx0 < vdenorm_cutoff) { - vf0 = 0.0f; - } - if XNN_UNPREDICTABLE(vx1 < vdenorm_cutoff) { - vf1 = 0.0f; - } - if XNN_UNPREDICTABLE(vx2 < vdenorm_cutoff) { - vf2 = 0.0f; - } - if XNN_UNPREDICTABLE(vx3 < vdenorm_cutoff) { - vf3 = 0.0f; - } - - // Store 4 outputs at a time. - output[0] = vf0; - output[1] = vf1; - output[2] = vf2; - output[3] = vf3; - output += 4; - - // Accumulate computed exponents. - vacc0 += vf0; - vacc0 += vf1; - vacc0 += vf2; - vacc0 += vf3; - } - - float vacc = vacc0; - for (; batch >= sizeof(float); batch -= sizeof(float)) { - // Load 1 input at a time. - const float vi = *input++; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const float vx = vi - vi_max; - - // Compute reduced argument n := round(x * 64 / log(2)). - // We do it by adding a large number (magic bias), which cause rounding of the result to an integer, then subtracing - // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction. - // The trick with adding large number is valid only within certain bounds (|x * 64 / log(2)| <= 2**22, i.e. - // |x| <= 0x1.62E43p+15 = 45426.09375), but that is acceptable, because inputs outside of [-87.336540, 0.0] - // result in denormalized or underflown expf(x). 
We fixup the result for such inputs at the very end of the - // algorithm. - float vn = vx * vlog2e + vmagic_bias; - - // Create a floating-point number s (scale) such that s := 2**(n / 64) for such inputs that expf(x) is normalized, - // i.e. -87.33642 <= x <= 0.0. As n has 6 fractional bits, we split s == 2**(n / 64) = 2**e * 2**(n / 64 - e), where - // e := int(n / 64). We create s in two steps: - // 1. Fetch 2**(n / 64 - e) = 2**(n % 64) from the table using the 6 low bits of n, as integer. Note that the - // fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0. - // 2. Adjust fecthed value by addition of e to its floating-point exponent. The result is always a normalized - // number, because for -87.33642 <= x <= 0.0 (inputs for which expf(x) is normalized) we have -126 <= e <= 0, - // and thus the adjusted exponent is not lower than -126. - // - // Extract e from bits 6:14 of n and shift it into bits 23:31 (position of floating-point exponent). - const uint32_t ve = (float_as_uint32(vn) & UINT32_C(0xFFFFFFC0)) << 17; - - // Use bits 0:6 bits of n, as integer, as an index for table lookup of l := 2**(n % 64). - const uint32_t vidx = float_as_uint32(vn) & vindex_mask; - // Adjust exponent of the value l fetched from the table to get the final s value. - const float vs = uint32_as_float(xnn_table_exp2_k_over_64[vidx] + ve); - - // Subtract the large number back to get final n := round(x * 64 / log(2)) as a floating-point number. - vn -= vmagic_bias; - - // Compute reduced argument t := x - n * log(2) / 64. - // Use Cody-Waite range reduction method (note the two constants representing log(2) / 64) to improve accuracy. - float vt = vn * vminus_ln2_hi + vx; - vt = vn * vminus_ln2_lo + vt; - - // Compute degree-2 polynomial approximation for exp(t) on [-log(2)/128, log(2)/128]. 
- float vp = vt * vc2; - vp = vp * vt + vt; - - // Reconstruct the final f value: - // f = s * (1 + t * (1 + t * c2)) - // = s * (1 + t + t * (t * c2)) - // = s + s * (t + t * (t * c2)) - // = s + s * p - float vf = vp * vs + vs; - - // For inputs below denormal cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) { - vf = 0.0f; - } - - // Store 1 output at a time. - *output++ = vf; - - // Accumulate computed exponents. - vacc += vf; - } - *sum = vacc; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u2.c deleted file mode 100644 index 8be92fcaf74..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u2.c +++ /dev/null @@ -1,176 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/scalar-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float vlog2e = 0x1.715476p+0f; - const float vmagic_bias = 0x1.8000FEp23f; - const float vminus_ln2_hi = -0x1.62E400p-1f; - const float vminus_ln2_lo = -0x1.7F7D1Cp-20f; - const float vc5 = 0x1.0F9F9Cp-7f; - const float vc4 = 0x1.573A1Ap-5f; - const float vc3 = 0x1.555A80p-3f; - const float vc2 = 0x1.FFFDC6p-2f; - const float vc1 = 0x1.FFFFF6p-1f; - const float vdenorm_cutoff = -0x1.5D589Ep6f; - - const float vi_max = *max; - - float vacc0 = 0.0f; - for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { - // Load 2 inputs at a time. - const float vi0 = input[0]; - const float vi1 = input[1]; - input += 2; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const float vx0 = vi0 - vi_max; - const float vx1 = vi1 - vi_max; - - // Compute reduced argument n := round(x / log(2)). - // We do it by adding a large number (magic bias) to the product x * (1/log(2)), which cause rounding of the result - // to an integer, then subtracing the large number back. The trick with adding large number is valid only within - // certain bounds (|x| <= 2**22), but that's ok, because inputs outside of [-87.336540, 0.0] underflow expf(x) - // anyway. We fixup the result for such inputs at the very end of the algorithm. - float vn0 = vx0 * vlog2e + vmagic_bias; - float vn1 = vx1 * vlog2e + vmagic_bias; - - // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e. 
- // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly. - const float vs0 = uint32_as_float(float_as_uint32(vn0) << 23); - const float vs1 = uint32_as_float(float_as_uint32(vn1) << 23); - - // Subtract the large number back to get final n := round(x / log(2)). - vn0 -= vmagic_bias; - vn1 -= vmagic_bias; - - // Compute reduced argument t := x - n * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - float vt0 = vn0 * vminus_ln2_hi + vx0; - float vt1 = vn1 * vminus_ln2_hi + vx1; - - vt0 = vn0 * vminus_ln2_lo + vt0; - vt1 = vn1 * vminus_ln2_lo + vt1; - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - float vp0 = vc5 * vt0 + vc4; - float vp1 = vc5 * vt1 + vc4; - - vp0 = vp0 * vt0 + vc3; - vp1 = vp1 * vt1 + vc3; - - vp0 = vp0 * vt0 + vc2; - vp1 = vp1 * vt1 + vc2; - - vp0 = vp0 * vt0 + vc1; - vp1 = vp1 * vt1 + vc1; - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 *= vs0; - vt1 *= vs1; - - float vf0 = vt0 * vp0 + vs0; - float vf1 = vt1 * vp1 + vs1; - - // For inputs below denormal cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - if XNN_UNPREDICTABLE(vx0 < vdenorm_cutoff) { - vf0 = 0.0f; - } - if XNN_UNPREDICTABLE(vx1 < vdenorm_cutoff) { - vf1 = 0.0f; - } - - // Store 2 outputs at a time. - output[0] = vf0; - output[1] = vf1; - output += 2; - - // Accumulate computed exponents. - vacc0 += vf0; - vacc0 += vf1; - } - - float vacc = vacc0; - for (; batch >= sizeof(float); batch -= sizeof(float)) { - // Load 1 input at a time. - const float vi = *input++; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const float vx = vi - vi_max; - - // Compute reduced argument n := round(x / log(2)). 
- // We do it by adding a large number (magic bias) to the product x * (1/log(2)), which cause rounding of the result - // to an integer, then subtracing the large number back. The trick with adding large number is valid only within - // certain bounds (|x| <= 2**22), but that's ok, because inputs outside of [-87.336540, 0.0] underflow expf(x) - // anyway. We fixup the result for such inputs at the very end of the algorithm. - float vn = vx * vlog2e + vmagic_bias; - - // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly. - const float vs = uint32_as_float(float_as_uint32(vn) << 23); - - // Subtract the large number back to get final n := round(x / log(2)). - vn -= vmagic_bias; - - // Compute reduced argument t := x - n * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - float vt = vn * vminus_ln2_hi + vx; - vt = vn * vminus_ln2_lo + vt; - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - float vp = vc5 * vt + vc4; - vp = vp * vt + vc3; - vp = vp * vt + vc2; - vp = vp * vt + vc1; - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt *= vs; - float vf = vt * vp + vs; - - // For inputs below denormal cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) { - vf = 0.0f; - } - - // Store 1 output at a time. - *output++ = vf; - - // Accumulate computed exponents. 
- vacc += vf; - } - *sum = vacc; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u4.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u4.c deleted file mode 100644 index 00c9e26c68e..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-u4.c +++ /dev/null @@ -1,212 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/scalar-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u4( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const float vlog2e = 0x1.715476p+0f; - const float vmagic_bias = 0x1.8000FEp23f; - const float vminus_ln2_hi = -0x1.62E400p-1f; - const float vminus_ln2_lo = -0x1.7F7D1Cp-20f; - const float vc5 = 0x1.0F9F9Cp-7f; - const float vc4 = 0x1.573A1Ap-5f; - const float vc3 = 0x1.555A80p-3f; - const float vc2 = 0x1.FFFDC6p-2f; - const float vc1 = 0x1.FFFFF6p-1f; - const float vdenorm_cutoff = -0x1.5D589Ep6f; - - const float vi_max = *max; - - float vacc0 = 0.0f; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const float vi0 = input[0]; - const float vi1 = input[1]; - const float vi2 = input[2]; - const float vi3 = input[3]; - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. 
- const float vx0 = vi0 - vi_max; - const float vx1 = vi1 - vi_max; - const float vx2 = vi2 - vi_max; - const float vx3 = vi3 - vi_max; - - // Compute reduced argument n := round(x / log(2)). - // We do it by adding a large number (magic bias) to the product x * (1/log(2)), which cause rounding of the result - // to an integer, then subtracing the large number back. The trick with adding large number is valid only within - // certain bounds (|x| <= 2**22), but that's ok, because inputs outside of [-87.336540, 0.0] underflow expf(x) - // anyway. We fixup the result for such inputs at the very end of the algorithm. - float vn0 = vx0 * vlog2e + vmagic_bias; - float vn1 = vx1 * vlog2e + vmagic_bias; - float vn2 = vx2 * vlog2e + vmagic_bias; - float vn3 = vx3 * vlog2e + vmagic_bias; - - // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly. - const float vs0 = uint32_as_float(float_as_uint32(vn0) << 23); - const float vs1 = uint32_as_float(float_as_uint32(vn1) << 23); - const float vs2 = uint32_as_float(float_as_uint32(vn2) << 23); - const float vs3 = uint32_as_float(float_as_uint32(vn3) << 23); - - // Subtract the large number back to get final n := round(x / log(2)). - vn0 -= vmagic_bias; - vn1 -= vmagic_bias; - vn2 -= vmagic_bias; - vn3 -= vmagic_bias; - - // Compute reduced argument t := x - n * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - float vt0 = vn0 * vminus_ln2_hi + vx0; - float vt1 = vn1 * vminus_ln2_hi + vx1; - float vt2 = vn2 * vminus_ln2_hi + vx2; - float vt3 = vn3 * vminus_ln2_hi + vx3; - - vt0 = vn0 * vminus_ln2_lo + vt0; - vt1 = vn1 * vminus_ln2_lo + vt1; - vt2 = vn2 * vminus_ln2_lo + vt2; - vt3 = vn3 * vminus_ln2_lo + vt3; - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- float vp0 = vc5 * vt0 + vc4; - float vp1 = vc5 * vt1 + vc4; - float vp2 = vc5 * vt2 + vc4; - float vp3 = vc5 * vt3 + vc4; - - vp0 = vp0 * vt0 + vc3; - vp1 = vp1 * vt1 + vc3; - vp2 = vp2 * vt2 + vc3; - vp3 = vp3 * vt3 + vc3; - - vp0 = vp0 * vt0 + vc2; - vp1 = vp1 * vt1 + vc2; - vp2 = vp2 * vt2 + vc2; - vp3 = vp3 * vt3 + vc2; - - vp0 = vp0 * vt0 + vc1; - vp1 = vp1 * vt1 + vc1; - vp2 = vp2 * vt2 + vc1; - vp3 = vp3 * vt3 + vc1; - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 *= vs0; - vt1 *= vs1; - vt2 *= vs2; - vt3 *= vs3; - - float vf0 = vt0 * vp0 + vs0; - float vf1 = vt1 * vp1 + vs1; - float vf2 = vt2 * vp2 + vs2; - float vf3 = vt3 * vp3 + vs3; - - // For inputs below denormal cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - if XNN_UNPREDICTABLE(vx0 < vdenorm_cutoff) { - vf0 = 0.0f; - } - if XNN_UNPREDICTABLE(vx1 < vdenorm_cutoff) { - vf1 = 0.0f; - } - if XNN_UNPREDICTABLE(vx2 < vdenorm_cutoff) { - vf2 = 0.0f; - } - if XNN_UNPREDICTABLE(vx3 < vdenorm_cutoff) { - vf3 = 0.0f; - } - - // Store 4 outputs at a time. - output[0] = vf0; - output[1] = vf1; - output[2] = vf2; - output[3] = vf3; - output += 4; - - // Accumulate computed exponents. - vacc0 += vf0; - vacc0 += vf1; - vacc0 += vf2; - vacc0 += vf3; - } - - float vacc = vacc0; - for (; batch >= sizeof(float); batch -= sizeof(float)) { - // Load 1 input at a time. - const float vi = *input++; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const float vx = vi - vi_max; - - // Compute reduced argument n := round(x / log(2)). - // We do it by adding a large number (magic bias) to the product x * (1/log(2)), which cause rounding of the result - // to an integer, then subtracing the large number back. 
The trick with adding large number is valid only within - // certain bounds (|x| <= 2**22), but that's ok, because inputs outside of [-87.336540, 0.0] underflow expf(x) - // anyway. We fixup the result for such inputs at the very end of the algorithm. - float vn = vx * vlog2e + vmagic_bias; - - // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly. - const float vs = uint32_as_float(float_as_uint32(vn) << 23); - - // Subtract the large number back to get final n := round(x / log(2)). - vn -= vmagic_bias; - - // Compute reduced argument t := x - n * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - float vt = vn * vminus_ln2_hi + vx; - vt = vn * vminus_ln2_lo + vt; - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - float vp = vc5 * vt + vc4; - vp = vp * vt + vc3; - vp = vp * vt + vc2; - vp = vp * vt + vc1; - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt *= vs; - float vf = vt * vp + vs; - - // For inputs below denormal cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) { - vf = 0.0f; - } - - // Store 1 output at a time. - *output++ = vf; - - // Accumulate computed exponents. - vacc += vf; - } - *sum = vacc; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc2.c deleted file mode 100644 index 2ff1d6bfb9b..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc2.c +++ /dev/null @@ -1,257 +0,0 @@ -// Auto-generated file. 
Do not edit! -// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); - const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); - const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); - const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); - const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); - const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); - const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); - const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); - const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); - const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m128 vi_max = _mm_load1_ps(max); - - __m128 vacc0 = _mm_setzero_ps(); - __m128 vacc1 = _mm_setzero_ps(); - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - // Load 12 (3x4) inputs at a time. 
- const __m128 vi0 = _mm_loadu_ps(input); - const __m128 vi1 = _mm_loadu_ps(input + 4); - const __m128 vi2 = _mm_loadu_ps(input + 8); - input += 12; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0 = _mm_sub_ps(vi0, vi_max); - const __m128 vx1 = _mm_sub_ps(vi1, vi_max); - const __m128 vx2 = _mm_sub_ps(vi2, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); - __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); - __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); - const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); - const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm_sub_ps(vn0, vmagic_bias); - vn1 = _mm_sub_ps(vn1, vmagic_bias); - vn2 = _mm_sub_ps(vn2, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); - - vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm_mul_ps(vt0, vs0); - vt1 = _mm_mul_ps(vt1, vs1); - vt2 = _mm_mul_ps(vt2, vs2); - - __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); - __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); - __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); - vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); - vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); - - // Store 12 (3x4) outputs at a time. - _mm_storeu_ps(output, vf0); - _mm_storeu_ps(output + 4, vf1); - _mm_storeu_ps(output + 8, vf2); - output += 12; - - // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0); - vacc1 = _mm_add_ps(vacc1, vf1); - vacc0 = _mm_add_ps(vacc0, vf2); - } - // Add up all accumulators to vacc0 - vacc0 = _mm_add_ps(vacc0, vacc1); - - __m128 vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. 
This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - // Store 4 outputs at a time. - _mm_storeu_ps(output, vf); - output += 4; - - // Accumulate computed exponents. - vacc = _mm_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - - // Subtract maximum input x := i - i_max. This implies x <= 0. 
- const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - if (batch & (2 * sizeof(float))) { - // Store 2 outputs at a time. - _mm_storel_pi((__m64*) output, vf); - output += 2; - - // Accumulate 2 computed exponents. - vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); - - vf = _mm_movehl_ps(vf, vf); - } - if (batch & (1 * sizeof(float))) { - // Store 1 output at a time. - _mm_store_ss(output, vf); - - // Accumulate 1 computed exponent. 
- vacc = _mm_add_ss(vacc, vf); - } - } - // Reduce 4 batch in the SIMD register - vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); - vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); - _mm_store_ss(sum, vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc3.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc3.c deleted file mode 100644 index d67d9fb7caf..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12-acc3.c +++ /dev/null @@ -1,259 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc3( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); - const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); - const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); - const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); - const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); - const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); - const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); - const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); - const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); - const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - 
XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m128 vi_max = _mm_load1_ps(max); - - __m128 vacc0 = _mm_setzero_ps(); - __m128 vacc1 = _mm_setzero_ps(); - __m128 vacc2 = _mm_setzero_ps(); - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - // Load 12 (3x4) inputs at a time. - const __m128 vi0 = _mm_loadu_ps(input); - const __m128 vi1 = _mm_loadu_ps(input + 4); - const __m128 vi2 = _mm_loadu_ps(input + 8); - input += 12; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0 = _mm_sub_ps(vi0, vi_max); - const __m128 vx1 = _mm_sub_ps(vi1, vi_max); - const __m128 vx2 = _mm_sub_ps(vi2, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); - __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); - __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); - const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); - const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm_sub_ps(vn0, vmagic_bias); - vn1 = _mm_sub_ps(vn1, vmagic_bias); - vn2 = _mm_sub_ps(vn2, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); - - vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm_mul_ps(vt0, vs0); - vt1 = _mm_mul_ps(vt1, vs1); - vt2 = _mm_mul_ps(vt2, vs2); - - __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); - __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); - __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); - vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); - vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); - - // Store 12 (3x4) outputs at a time. 
- _mm_storeu_ps(output, vf0); - _mm_storeu_ps(output + 4, vf1); - _mm_storeu_ps(output + 8, vf2); - output += 12; - - // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0); - vacc1 = _mm_add_ps(vacc1, vf1); - vacc2 = _mm_add_ps(vacc2, vf2); - } - // Add up all accumulators to vacc0 - vacc0 = _mm_add_ps(vacc0, vacc1); - vacc0 = _mm_add_ps(vacc0, vacc2); - - __m128 vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - // Store 4 outputs at a time. - _mm_storeu_ps(output, vf); - output += 4; - - // Accumulate computed exponents. - vacc = _mm_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - if (batch & (2 * sizeof(float))) { - // Store 2 outputs at a time. - _mm_storel_pi((__m64*) output, vf); - output += 2; - - // Accumulate 2 computed exponents. - vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); - - vf = _mm_movehl_ps(vf, vf); - } - if (batch & (1 * sizeof(float))) { - // Store 1 output at a time. - _mm_store_ss(output, vf); - - // Accumulate 1 computed exponent. - vacc = _mm_add_ss(vacc, vf); - } - } - // Reduce 4 batch in the SIMD register - vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); - vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); - _mm_store_ss(sum, vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12.c deleted file mode 100644 index 227262dc565..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u12.c +++ /dev/null @@ -1,254 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); - const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); - const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); - const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); - const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); - const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); - const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); - const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); - const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); - const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m128 vi_max = _mm_load1_ps(max); - - __m128 vacc0 = _mm_setzero_ps(); - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - // Load 12 (3x4) inputs at a time. - const __m128 vi0 = _mm_loadu_ps(input); - const __m128 vi1 = _mm_loadu_ps(input + 4); - const __m128 vi2 = _mm_loadu_ps(input + 8); - input += 12; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0 = _mm_sub_ps(vi0, vi_max); - const __m128 vx1 = _mm_sub_ps(vi1, vi_max); - const __m128 vx2 = _mm_sub_ps(vi2, vi_max); - - // Compute reduced argument batch := round(x / log(2)). 
- __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); - __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); - __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); - const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); - const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm_sub_ps(vn0, vmagic_bias); - vn1 = _mm_sub_ps(vn1, vmagic_bias); - vn2 = _mm_sub_ps(vn2, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); - - vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm_mul_ps(vt0, vs0); - vt1 = _mm_mul_ps(vt1, vs1); - vt2 = _mm_mul_ps(vt2, vs2); - - __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); - __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); - __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); - vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); - vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); - - // Store 12 (3x4) outputs at a time. - _mm_storeu_ps(output, vf0); - _mm_storeu_ps(output + 4, vf1); - _mm_storeu_ps(output + 8, vf2); - output += 12; - - // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0); - vacc0 = _mm_add_ps(vacc0, vf1); - vacc0 = _mm_add_ps(vacc0, vf2); - } - - __m128 vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. 
- const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - // Store 4 outputs at a time. - _mm_storeu_ps(output, vf); - output += 4; - - // Accumulate computed exponents. - vacc = _mm_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - - // Subtract maximum input x := i - i_max. This implies x <= 0. 
- const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - if (batch & (2 * sizeof(float))) { - // Store 2 outputs at a time. - _mm_storel_pi((__m64*) output, vf); - output += 2; - - // Accumulate 2 computed exponents. - vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); - - vf = _mm_movehl_ps(vf, vf); - } - if (batch & (1 * sizeof(float))) { - // Store 1 output at a time. - _mm_store_ss(output, vf); - - // Accumulate 1 computed exponent. 
- vacc = _mm_add_ss(vacc, vf); - } - } - // Reduce 4 batch in the SIMD register - vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); - vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); - _mm_store_ss(sum, vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16.c deleted file mode 100644 index 79cabdc774d..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16.c +++ /dev/null @@ -1,270 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); - const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); - const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); - const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); - const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); - const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); - const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); - const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); - const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); - const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - 
XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m128 vi_max = _mm_load1_ps(max); - - __m128 vacc0 = _mm_setzero_ps(); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - // Load 16 (4x4) inputs at a time. - const __m128 vi0 = _mm_loadu_ps(input); - const __m128 vi1 = _mm_loadu_ps(input + 4); - const __m128 vi2 = _mm_loadu_ps(input + 8); - const __m128 vi3 = _mm_loadu_ps(input + 12); - input += 16; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0 = _mm_sub_ps(vi0, vi_max); - const __m128 vx1 = _mm_sub_ps(vi1, vi_max); - const __m128 vx2 = _mm_sub_ps(vi2, vi_max); - const __m128 vx3 = _mm_sub_ps(vi3, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); - __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); - __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); - __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); - const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); - const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); - const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). 
- vn0 = _mm_sub_ps(vn0, vmagic_bias); - vn1 = _mm_sub_ps(vn1, vmagic_bias); - vn2 = _mm_sub_ps(vn2, vmagic_bias); - vn3 = _mm_sub_ps(vn3, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); - __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); - - vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); - __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm_mul_ps(vt0, vs0); - vt1 = _mm_mul_ps(vt1, vs1); - vt2 = _mm_mul_ps(vt2, vs2); - vt3 = _mm_mul_ps(vt3, vs3); - - 
__m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); - __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); - __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); - __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); - vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); - vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); - vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); - - // Store 16 (4x4) outputs at a time. - _mm_storeu_ps(output, vf0); - _mm_storeu_ps(output + 4, vf1); - _mm_storeu_ps(output + 8, vf2); - _mm_storeu_ps(output + 12, vf3); - output += 16; - - // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0); - vacc0 = _mm_add_ps(vacc0, vf1); - vacc0 = _mm_add_ps(vacc0, vf2); - vacc0 = _mm_add_ps(vacc0, vf3); - } - - __m128 vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - // Store 4 outputs at a time. - _mm_storeu_ps(output, vf); - output += 4; - - // Accumulate computed exponents. - vacc = _mm_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - if (batch & (2 * sizeof(float))) { - // Store 2 outputs at a time. - _mm_storel_pi((__m64*) output, vf); - output += 2; - - // Accumulate 2 computed exponents. - vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); - - vf = _mm_movehl_ps(vf, vf); - } - if (batch & (1 * sizeof(float))) { - // Store 1 output at a time. - _mm_store_ss(output, vf); - - // Accumulate 1 computed exponent. - vacc = _mm_add_ss(vacc, vf); - } - } - // Reduce 4 batch in the SIMD register - vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); - vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); - _mm_store_ss(sum, vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc2.c deleted file mode 100644 index efc2cc73c8b..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc2.c +++ /dev/null @@ -1,289 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); - const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); - const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); - const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); - const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); - const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); - const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); - const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); - const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); - const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m128 vi_max = _mm_load1_ps(max); - - __m128 vacc0 = _mm_setzero_ps(); - __m128 vacc1 = _mm_setzero_ps(); - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - // Load 20 (5x4) inputs at a time. 
- const __m128 vi0 = _mm_loadu_ps(input); - const __m128 vi1 = _mm_loadu_ps(input + 4); - const __m128 vi2 = _mm_loadu_ps(input + 8); - const __m128 vi3 = _mm_loadu_ps(input + 12); - const __m128 vi4 = _mm_loadu_ps(input + 16); - input += 20; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0 = _mm_sub_ps(vi0, vi_max); - const __m128 vx1 = _mm_sub_ps(vi1, vi_max); - const __m128 vx2 = _mm_sub_ps(vi2, vi_max); - const __m128 vx3 = _mm_sub_ps(vi3, vi_max); - const __m128 vx4 = _mm_sub_ps(vi4, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); - __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); - __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); - __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); - __m128 vn4 = _mm_add_ps(_mm_mul_ps(vx4, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); - const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); - const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); - const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); - const __m128 vs4 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm_sub_ps(vn0, vmagic_bias); - vn1 = _mm_sub_ps(vn1, vmagic_bias); - vn2 = _mm_sub_ps(vn2, vmagic_bias); - vn3 = _mm_sub_ps(vn3, vmagic_bias); - vn4 = _mm_sub_ps(vn4, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); - __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); - __m128 vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_hi), vx4); - - vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); - vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_lo), vt4); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); - __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); - __m128 vp4 = _mm_add_ps(_mm_mul_ps(vc5, vt4), vc4); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); - vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc3); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); - vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc2); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); - vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm_mul_ps(vt0, vs0); - vt1 = _mm_mul_ps(vt1, vs1); - vt2 = _mm_mul_ps(vt2, vs2); - vt3 = _mm_mul_ps(vt3, vs3); - vt4 
= _mm_mul_ps(vt4, vs4); - - __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); - __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); - __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); - __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); - __m128 vf4 = _mm_add_ps(_mm_mul_ps(vt4, vp4), vs4); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); - vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); - vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); - vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); - vf4 = _mm_andnot_ps(_mm_cmplt_ps(vx4, vdenorm_cutoff), vf4); - - // Store 20 (5x4) outputs at a time. - _mm_storeu_ps(output, vf0); - _mm_storeu_ps(output + 4, vf1); - _mm_storeu_ps(output + 8, vf2); - _mm_storeu_ps(output + 12, vf3); - _mm_storeu_ps(output + 16, vf4); - output += 20; - - // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0); - vacc1 = _mm_add_ps(vacc1, vf1); - vacc0 = _mm_add_ps(vacc0, vf2); - vacc1 = _mm_add_ps(vacc1, vf3); - vacc0 = _mm_add_ps(vacc0, vf4); - } - // Add up all accumulators to vacc0 - vacc0 = _mm_add_ps(vacc0, vacc1); - - __m128 vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - // Store 4 outputs at a time. - _mm_storeu_ps(output, vf); - output += 4; - - // Accumulate computed exponents. - vacc = _mm_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - if (batch & (2 * sizeof(float))) { - // Store 2 outputs at a time. - _mm_storel_pi((__m64*) output, vf); - output += 2; - - // Accumulate 2 computed exponents. - vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); - - vf = _mm_movehl_ps(vf, vf); - } - if (batch & (1 * sizeof(float))) { - // Store 1 output at a time. - _mm_store_ss(output, vf); - - // Accumulate 1 computed exponent. 
- vacc = _mm_add_ss(vacc, vf); - } - } - // Reduce 4 batch in the SIMD register - vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); - vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); - _mm_store_ss(sum, vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc5.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc5.c deleted file mode 100644 index 97aaf1ab143..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20-acc5.c +++ /dev/null @@ -1,295 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc5( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); - const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); - const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); - const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); - const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); - const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); - const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); - const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); - const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); - const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - 
XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m128 vi_max = _mm_load1_ps(max); - - __m128 vacc0 = _mm_setzero_ps(); - __m128 vacc1 = _mm_setzero_ps(); - __m128 vacc2 = _mm_setzero_ps(); - __m128 vacc3 = _mm_setzero_ps(); - __m128 vacc4 = _mm_setzero_ps(); - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - // Load 20 (5x4) inputs at a time. - const __m128 vi0 = _mm_loadu_ps(input); - const __m128 vi1 = _mm_loadu_ps(input + 4); - const __m128 vi2 = _mm_loadu_ps(input + 8); - const __m128 vi3 = _mm_loadu_ps(input + 12); - const __m128 vi4 = _mm_loadu_ps(input + 16); - input += 20; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0 = _mm_sub_ps(vi0, vi_max); - const __m128 vx1 = _mm_sub_ps(vi1, vi_max); - const __m128 vx2 = _mm_sub_ps(vi2, vi_max); - const __m128 vx3 = _mm_sub_ps(vi3, vi_max); - const __m128 vx4 = _mm_sub_ps(vi4, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); - __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); - __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); - __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); - __m128 vn4 = _mm_add_ps(_mm_mul_ps(vx4, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); - const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); - const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); - const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); - const __m128 vs4 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm_sub_ps(vn0, vmagic_bias); - vn1 = _mm_sub_ps(vn1, vmagic_bias); - vn2 = _mm_sub_ps(vn2, vmagic_bias); - vn3 = _mm_sub_ps(vn3, vmagic_bias); - vn4 = _mm_sub_ps(vn4, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); - __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); - __m128 vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_hi), vx4); - - vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); - vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_lo), vt4); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); - __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); - __m128 vp4 = _mm_add_ps(_mm_mul_ps(vc5, vt4), vc4); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); - vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc3); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); - vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc2); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); - vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm_mul_ps(vt0, vs0); - vt1 = _mm_mul_ps(vt1, vs1); - vt2 = _mm_mul_ps(vt2, vs2); - vt3 = _mm_mul_ps(vt3, vs3); - vt4 = _mm_mul_ps(vt4, vs4); - - __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); - __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); - __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); - __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); - __m128 vf4 = _mm_add_ps(_mm_mul_ps(vt4, vp4), vs4); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
- vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); - vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); - vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); - vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); - vf4 = _mm_andnot_ps(_mm_cmplt_ps(vx4, vdenorm_cutoff), vf4); - - // Store 20 (5x4) outputs at a time. - _mm_storeu_ps(output, vf0); - _mm_storeu_ps(output + 4, vf1); - _mm_storeu_ps(output + 8, vf2); - _mm_storeu_ps(output + 12, vf3); - _mm_storeu_ps(output + 16, vf4); - output += 20; - - // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0); - vacc1 = _mm_add_ps(vacc1, vf1); - vacc2 = _mm_add_ps(vacc2, vf2); - vacc3 = _mm_add_ps(vacc3, vf3); - vacc4 = _mm_add_ps(vacc4, vf4); - } - // Add up all accumulators to vacc0 - vacc0 = _mm_add_ps(vacc0, vacc1); - vacc2 = _mm_add_ps(vacc2, vacc3); - vacc0 = _mm_add_ps(vacc0, vacc2); - vacc0 = _mm_add_ps(vacc0, vacc4); - - __m128 vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - // Store 4 outputs at a time. - _mm_storeu_ps(output, vf); - output += 4; - - // Accumulate computed exponents. - vacc = _mm_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - if (batch & (2 * sizeof(float))) { - // Store 2 outputs at a time. - _mm_storel_pi((__m64*) output, vf); - output += 2; - - // Accumulate 2 computed exponents. - vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); - - vf = _mm_movehl_ps(vf, vf); - } - if (batch & (1 * sizeof(float))) { - // Store 1 output at a time. - _mm_store_ss(output, vf); - - // Accumulate 1 computed exponent. - vacc = _mm_add_ss(vacc, vf); - } - } - // Reduce 4 batch in the SIMD register - vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); - vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); - _mm_store_ss(sum, vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20.c deleted file mode 100644 index 8d337ced608..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u20.c +++ /dev/null @@ -1,286 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); - const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); - const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); - const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); - const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); - const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); - const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); - const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); - const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); - const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m128 vi_max = _mm_load1_ps(max); - - __m128 vacc0 = _mm_setzero_ps(); - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - // Load 20 (5x4) inputs at a time. 
- const __m128 vi0 = _mm_loadu_ps(input); - const __m128 vi1 = _mm_loadu_ps(input + 4); - const __m128 vi2 = _mm_loadu_ps(input + 8); - const __m128 vi3 = _mm_loadu_ps(input + 12); - const __m128 vi4 = _mm_loadu_ps(input + 16); - input += 20; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0 = _mm_sub_ps(vi0, vi_max); - const __m128 vx1 = _mm_sub_ps(vi1, vi_max); - const __m128 vx2 = _mm_sub_ps(vi2, vi_max); - const __m128 vx3 = _mm_sub_ps(vi3, vi_max); - const __m128 vx4 = _mm_sub_ps(vi4, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); - __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); - __m128 vn2 = _mm_add_ps(_mm_mul_ps(vx2, vlog2e), vmagic_bias); - __m128 vn3 = _mm_add_ps(_mm_mul_ps(vx3, vlog2e), vmagic_bias); - __m128 vn4 = _mm_add_ps(_mm_mul_ps(vx4, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); - const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); - const __m128 vs2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn2), 23)); - const __m128 vs3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn3), 23)); - const __m128 vs4 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm_sub_ps(vn0, vmagic_bias); - vn1 = _mm_sub_ps(vn1, vmagic_bias); - vn2 = _mm_sub_ps(vn2, vmagic_bias); - vn3 = _mm_sub_ps(vn3, vmagic_bias); - vn4 = _mm_sub_ps(vn4, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
- __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - __m128 vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_hi), vx2); - __m128 vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_hi), vx3); - __m128 vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_hi), vx4); - - vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); - vt2 = _mm_add_ps(_mm_mul_ps(vn2, vminus_ln2_lo), vt2); - vt3 = _mm_add_ps(_mm_mul_ps(vn3, vminus_ln2_lo), vt3); - vt4 = _mm_add_ps(_mm_mul_ps(vn4, vminus_ln2_lo), vt4); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - __m128 vp2 = _mm_add_ps(_mm_mul_ps(vc5, vt2), vc4); - __m128 vp3 = _mm_add_ps(_mm_mul_ps(vc5, vt3), vc4); - __m128 vp4 = _mm_add_ps(_mm_mul_ps(vc5, vt4), vc4); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc3); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc3); - vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc3); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc2); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc2); - vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc2); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); - vp2 = _mm_add_ps(_mm_mul_ps(vp2, vt2), vc1); - vp3 = _mm_add_ps(_mm_mul_ps(vp3, vt3), vc1); - vp4 = _mm_add_ps(_mm_mul_ps(vp4, vt4), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm_mul_ps(vt0, vs0); - vt1 = _mm_mul_ps(vt1, vs1); - vt2 = _mm_mul_ps(vt2, vs2); - vt3 = _mm_mul_ps(vt3, vs3); - vt4 
= _mm_mul_ps(vt4, vs4); - - __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); - __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); - __m128 vf2 = _mm_add_ps(_mm_mul_ps(vt2, vp2), vs2); - __m128 vf3 = _mm_add_ps(_mm_mul_ps(vt3, vp3), vs3); - __m128 vf4 = _mm_add_ps(_mm_mul_ps(vt4, vp4), vs4); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); - vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); - vf2 = _mm_andnot_ps(_mm_cmplt_ps(vx2, vdenorm_cutoff), vf2); - vf3 = _mm_andnot_ps(_mm_cmplt_ps(vx3, vdenorm_cutoff), vf3); - vf4 = _mm_andnot_ps(_mm_cmplt_ps(vx4, vdenorm_cutoff), vf4); - - // Store 20 (5x4) outputs at a time. - _mm_storeu_ps(output, vf0); - _mm_storeu_ps(output + 4, vf1); - _mm_storeu_ps(output + 8, vf2); - _mm_storeu_ps(output + 12, vf3); - _mm_storeu_ps(output + 16, vf4); - output += 20; - - // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0); - vacc0 = _mm_add_ps(vacc0, vf1); - vacc0 = _mm_add_ps(vacc0, vf2); - vacc0 = _mm_add_ps(vacc0, vf3); - vacc0 = _mm_add_ps(vacc0, vf4); - } - - __m128 vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). 
- vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - // Store 4 outputs at a time. - _mm_storeu_ps(output, vf); - output += 4; - - // Accumulate computed exponents. - vacc = _mm_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). 
- vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - if (batch & (2 * sizeof(float))) { - // Store 2 outputs at a time. - _mm_storel_pi((__m64*) output, vf); - output += 2; - - // Accumulate 2 computed exponents. - vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); - - vf = _mm_movehl_ps(vf, vf); - } - if (batch & (1 * sizeof(float))) { - // Store 1 output at a time. - _mm_store_ss(output, vf); - - // Accumulate 1 computed exponent. 
- vacc = _mm_add_ss(vacc, vf); - } - } - // Reduce 4 batch in the SIMD register - vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); - vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); - _mm_store_ss(sum, vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8.c deleted file mode 100644 index f645a973b79..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8.c +++ /dev/null @@ -1,238 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/sse2-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f); - const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f); - const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f); - const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f); - const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f); - const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f); - const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f); - const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f); - const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f); - const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - 
XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const __m128 vi_max = _mm_load1_ps(max); - - __m128 vacc0 = _mm_setzero_ps(); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - // Load 8 (2x4) inputs at a time. - const __m128 vi0 = _mm_loadu_ps(input); - const __m128 vi1 = _mm_loadu_ps(input + 4); - input += 8; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx0 = _mm_sub_ps(vi0, vi_max); - const __m128 vx1 = _mm_sub_ps(vi1, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn0 = _mm_add_ps(_mm_mul_ps(vx0, vlog2e), vmagic_bias); - __m128 vn1 = _mm_add_ps(_mm_mul_ps(vx1, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. - const __m128 vs0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0), 23)); - const __m128 vs1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn1), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn0 = _mm_sub_ps(vn0, vmagic_bias); - vn1 = _mm_sub_ps(vn1, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_hi), vx0); - __m128 vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_hi), vx1); - - vt0 = _mm_add_ps(_mm_mul_ps(vn0, vminus_ln2_lo), vt0); - vt1 = _mm_add_ps(_mm_mul_ps(vn1, vminus_ln2_lo), vt1); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
- __m128 vp0 = _mm_add_ps(_mm_mul_ps(vc5, vt0), vc4); - __m128 vp1 = _mm_add_ps(_mm_mul_ps(vc5, vt1), vc4); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc3); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc3); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc2); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc2); - - vp0 = _mm_add_ps(_mm_mul_ps(vp0, vt0), vc1); - vp1 = _mm_add_ps(_mm_mul_ps(vp1, vt1), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt0 = _mm_mul_ps(vt0, vs0); - vt1 = _mm_mul_ps(vt1, vs1); - - __m128 vf0 = _mm_add_ps(_mm_mul_ps(vt0, vp0), vs0); - __m128 vf1 = _mm_add_ps(_mm_mul_ps(vt1, vp1), vs1); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf0 = _mm_andnot_ps(_mm_cmplt_ps(vx0, vdenorm_cutoff), vf0); - vf1 = _mm_andnot_ps(_mm_cmplt_ps(vx1, vdenorm_cutoff), vf1); - - // Store 8 (2x4) outputs at a time. - _mm_storeu_ps(output, vf0); - _mm_storeu_ps(output + 4, vf1); - output += 8; - - // Accumulate computed exponents. - vacc0 = _mm_add_ps(vacc0, vf0); - vacc0 = _mm_add_ps(vacc0, vf1); - } - - __m128 vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - input += 4; - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - // Store 4 outputs at a time. - _mm_storeu_ps(output, vf); - output += 4; - - // Accumulate computed exponents. - vacc = _mm_add_ps(vacc, vf); - } - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - // Load 4 inputs at a time. - const __m128 vi = _mm_loadu_ps(input); - - // Subtract maximum input x := i - i_max. This implies x <= 0. - const __m128 vx = _mm_sub_ps(vi, vi_max); - - // Compute reduced argument batch := round(x / log(2)). - __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias); - - // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. - // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
- const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23)); - - // Subtract the large number back to get final batch := round(x / log(2)). - vn = _mm_sub_ps(vn, vmagic_bias); - - // Compute reduced argument t := x - batch * log(2). - // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. - __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx); - vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt); - - // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. - __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2); - vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1); - - // Reconstruct the final f value: - // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) - // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) - // = s + (t * s) * p - vt = _mm_mul_ps(vt, vs); - __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs); - - // For inputs below zero cutoff, replace output with +0.0f. - // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. - vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf); - - if (batch & (2 * sizeof(float))) { - // Store 2 outputs at a time. - _mm_storel_pi((__m64*) output, vf); - output += 2; - - // Accumulate 2 computed exponents. - vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps())); - - vf = _mm_movehl_ps(vf, vf); - } - if (batch & (1 * sizeof(float))) { - // Store 1 output at a time. - _mm_store_ss(output, vf); - - // Accumulate 1 computed exponent. 
- vacc = _mm_add_ss(vacc, vf); - } - } - // Reduce 4 batch in the SIMD register - vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); - vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1))); - _mm_store_ss(sum, vacc); -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12-acc2.c deleted file mode 100644 index 2ae500f5fe2..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12-acc2.c +++ /dev/null @@ -1,203 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const v128_t vlog2e = wasm_f32x4_const_splat(0x1.715476p+0f); - const v128_t vmagic_bias = wasm_f32x4_const_splat(0x1.8000FEp23f); - const v128_t vminus_ln2_hi = wasm_f32x4_const_splat(-0x1.62E400p-1f); - const v128_t vminus_ln2_lo = wasm_f32x4_const_splat(-0x1.7F7D1Cp-20f); - const v128_t vc5 = wasm_f32x4_const_splat(0x1.0F9F9Cp-7f); - const v128_t vc4 = wasm_f32x4_const_splat(0x1.573A1Ap-5f); - const v128_t vc3 = wasm_f32x4_const_splat(0x1.555A80p-3f); - const v128_t vc2 = wasm_f32x4_const_splat(0x1.FFFDC6p-2f); - const v128_t vc1 = 
wasm_f32x4_const_splat(0x1.FFFFF6p-1f); - const v128_t vdenorm_cutoff = wasm_f32x4_const_splat(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const v128_t vi_max = wasm_v128_load32_splat(max); - - v128_t vacc0 = wasm_f32x4_const_splat(0.0f); - v128_t vacc1 = vacc0; - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - // Load 12 (3x4) inputs at a time. - const v128_t vi0123 = wasm_v128_load(input); - const v128_t vi4567 = wasm_v128_load(input + 4); - const v128_t vi89AB = wasm_v128_load(input + 8); - input += 12; - - const v128_t vx0123 = wasm_f32x4_sub(vi0123, vi_max); - const v128_t vx4567 = wasm_f32x4_sub(vi4567, vi_max); - const v128_t vx89AB = wasm_f32x4_sub(vi89AB, vi_max); - - v128_t vn0123 = wasm_f32x4_relaxed_madd(vx0123, vlog2e, vmagic_bias); - v128_t vn4567 = wasm_f32x4_relaxed_madd(vx4567, vlog2e, vmagic_bias); - v128_t vn89AB = wasm_f32x4_relaxed_madd(vx89AB, vlog2e, vmagic_bias); - - const v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); - const v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); - const v128_t vs89AB = wasm_i32x4_shl(vn89AB, 23); - - vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); - vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); - vn89AB = wasm_f32x4_sub(vn89AB, vmagic_bias); - - v128_t vt0123 = wasm_f32x4_relaxed_madd(vn0123, vminus_ln2_hi, vx0123); - v128_t vt4567 = wasm_f32x4_relaxed_madd(vn4567, vminus_ln2_hi, vx4567); - v128_t vt89AB = wasm_f32x4_relaxed_madd(vn89AB, vminus_ln2_hi, vx89AB); - - vt0123 = wasm_f32x4_relaxed_madd(vn0123, vminus_ln2_lo, vt0123); - vt4567 = wasm_f32x4_relaxed_madd(vn4567, vminus_ln2_lo, vt4567); - vt89AB = wasm_f32x4_relaxed_madd(vn89AB, vminus_ln2_lo, vt89AB); - - v128_t vp0123 = 
wasm_f32x4_relaxed_madd(vc5, vt0123, vc4); - v128_t vp4567 = wasm_f32x4_relaxed_madd(vc5, vt4567, vc4); - v128_t vp89AB = wasm_f32x4_relaxed_madd(vc5, vt89AB, vc4); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc3); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc3); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc3); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc2); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc2); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc2); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc1); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc1); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc1); - - vt0123 = wasm_f32x4_mul(vt0123, vs0123); - vt4567 = wasm_f32x4_mul(vt4567, vs4567); - vt89AB = wasm_f32x4_mul(vt89AB, vs89AB); - - v128_t vf0123 = wasm_f32x4_relaxed_madd(vt0123, vp0123, vs0123); - v128_t vf4567 = wasm_f32x4_relaxed_madd(vt4567, vp4567, vs4567); - v128_t vf89AB = wasm_f32x4_relaxed_madd(vt89AB, vp89AB, vs89AB); - - vf0123 = wasm_v128_andnot(vf0123, wasm_f32x4_lt(vx0123, vdenorm_cutoff)); - vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_lt(vx4567, vdenorm_cutoff)); - vf89AB = wasm_v128_andnot(vf89AB, wasm_f32x4_lt(vx89AB, vdenorm_cutoff)); - - wasm_v128_store(output, vf0123); - wasm_v128_store(output + 4, vf4567); - wasm_v128_store(output + 8, vf89AB); - output += 12; - - vacc0 = wasm_f32x4_add(vacc0, vf0123); - vacc0 = wasm_f32x4_add(vacc0, vf4567); - vacc0 = wasm_f32x4_add(vacc0, vf89AB); - } - // Add up all accumulators to vacc0 - vacc0 = wasm_f32x4_add(vacc0, vacc1); - - v128_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t vi = wasm_v128_load(input); - input += 4; - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_relaxed_madd(vx, vlog2e, vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_hi, 
vx); - vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_lo, vt); - - v128_t vp = wasm_f32x4_relaxed_madd(vc5, vt, vc4); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc3); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc2); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_relaxed_madd(vt, vp, vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - wasm_v128_store(output, vf); - output += 4; - - vacc = wasm_f32x4_add(vacc, vf); - } - vacc = wasm_f32x4_add(vacc, wasm_v64x2_shuffle(vacc, vacc, 1, 1)); - float vsum = wasm_f32x4_extract_lane(vacc, 0) + wasm_f32x4_extract_lane(vacc, 1); - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - - const v128_t vi = wasm_v128_load(input); - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_relaxed_madd(vx, vlog2e, vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_hi, vx); - vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_lo, vt); - - v128_t vp = wasm_f32x4_relaxed_madd(vc5, vt, vc4); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc3); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc2); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_relaxed_madd(vt, vp, vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vf, 0); - output += 2; - - vsum += wasm_f32x4_extract_lane(vf, 0) + wasm_f32x4_extract_lane(vf, 1); - vf = wasm_v64x2_shuffle(vf, vf, 1, 1); - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vf, 0); - vsum += wasm_f32x4_extract_lane(vf, 0); - } - } - *sum = vsum; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12-acc3.c 
b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12-acc3.c deleted file mode 100644 index 1849c5183ff..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12-acc3.c +++ /dev/null @@ -1,205 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12_acc3( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const v128_t vlog2e = wasm_f32x4_const_splat(0x1.715476p+0f); - const v128_t vmagic_bias = wasm_f32x4_const_splat(0x1.8000FEp23f); - const v128_t vminus_ln2_hi = wasm_f32x4_const_splat(-0x1.62E400p-1f); - const v128_t vminus_ln2_lo = wasm_f32x4_const_splat(-0x1.7F7D1Cp-20f); - const v128_t vc5 = wasm_f32x4_const_splat(0x1.0F9F9Cp-7f); - const v128_t vc4 = wasm_f32x4_const_splat(0x1.573A1Ap-5f); - const v128_t vc3 = wasm_f32x4_const_splat(0x1.555A80p-3f); - const v128_t vc2 = wasm_f32x4_const_splat(0x1.FFFDC6p-2f); - const v128_t vc1 = wasm_f32x4_const_splat(0x1.FFFFF6p-1f); - const v128_t vdenorm_cutoff = wasm_f32x4_const_splat(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); 
- XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const v128_t vi_max = wasm_v128_load32_splat(max); - - v128_t vacc0 = wasm_f32x4_const_splat(0.0f); - v128_t vacc1 = vacc0; - v128_t vacc2 = vacc0; - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - // Load 12 (3x4) inputs at a time. - const v128_t vi0123 = wasm_v128_load(input); - const v128_t vi4567 = wasm_v128_load(input + 4); - const v128_t vi89AB = wasm_v128_load(input + 8); - input += 12; - - const v128_t vx0123 = wasm_f32x4_sub(vi0123, vi_max); - const v128_t vx4567 = wasm_f32x4_sub(vi4567, vi_max); - const v128_t vx89AB = wasm_f32x4_sub(vi89AB, vi_max); - - v128_t vn0123 = wasm_f32x4_relaxed_madd(vx0123, vlog2e, vmagic_bias); - v128_t vn4567 = wasm_f32x4_relaxed_madd(vx4567, vlog2e, vmagic_bias); - v128_t vn89AB = wasm_f32x4_relaxed_madd(vx89AB, vlog2e, vmagic_bias); - - const v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); - const v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); - const v128_t vs89AB = wasm_i32x4_shl(vn89AB, 23); - - vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); - vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); - vn89AB = wasm_f32x4_sub(vn89AB, vmagic_bias); - - v128_t vt0123 = wasm_f32x4_relaxed_madd(vn0123, vminus_ln2_hi, vx0123); - v128_t vt4567 = wasm_f32x4_relaxed_madd(vn4567, vminus_ln2_hi, vx4567); - v128_t vt89AB = wasm_f32x4_relaxed_madd(vn89AB, vminus_ln2_hi, vx89AB); - - vt0123 = wasm_f32x4_relaxed_madd(vn0123, vminus_ln2_lo, vt0123); - vt4567 = wasm_f32x4_relaxed_madd(vn4567, vminus_ln2_lo, vt4567); - vt89AB = wasm_f32x4_relaxed_madd(vn89AB, vminus_ln2_lo, vt89AB); - - v128_t vp0123 = wasm_f32x4_relaxed_madd(vc5, vt0123, vc4); - v128_t vp4567 = wasm_f32x4_relaxed_madd(vc5, vt4567, vc4); - v128_t vp89AB = wasm_f32x4_relaxed_madd(vc5, vt89AB, vc4); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc3); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc3); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc3); 
- - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc2); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc2); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc2); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc1); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc1); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc1); - - vt0123 = wasm_f32x4_mul(vt0123, vs0123); - vt4567 = wasm_f32x4_mul(vt4567, vs4567); - vt89AB = wasm_f32x4_mul(vt89AB, vs89AB); - - v128_t vf0123 = wasm_f32x4_relaxed_madd(vt0123, vp0123, vs0123); - v128_t vf4567 = wasm_f32x4_relaxed_madd(vt4567, vp4567, vs4567); - v128_t vf89AB = wasm_f32x4_relaxed_madd(vt89AB, vp89AB, vs89AB); - - vf0123 = wasm_v128_andnot(vf0123, wasm_f32x4_lt(vx0123, vdenorm_cutoff)); - vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_lt(vx4567, vdenorm_cutoff)); - vf89AB = wasm_v128_andnot(vf89AB, wasm_f32x4_lt(vx89AB, vdenorm_cutoff)); - - wasm_v128_store(output, vf0123); - wasm_v128_store(output + 4, vf4567); - wasm_v128_store(output + 8, vf89AB); - output += 12; - - vacc0 = wasm_f32x4_add(vacc0, vf0123); - vacc1 = wasm_f32x4_add(vacc1, vf4567); - vacc2 = wasm_f32x4_add(vacc2, vf89AB); - } - // Add up all accumulators to vacc0 - vacc0 = wasm_f32x4_add(vacc0, vacc1); - vacc0 = wasm_f32x4_add(vacc0, vacc2); - - v128_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t vi = wasm_v128_load(input); - input += 4; - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_relaxed_madd(vx, vlog2e, vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_hi, vx); - vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_lo, vt); - - v128_t vp = wasm_f32x4_relaxed_madd(vc5, vt, vc4); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc3); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc2); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = 
wasm_f32x4_relaxed_madd(vt, vp, vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - wasm_v128_store(output, vf); - output += 4; - - vacc = wasm_f32x4_add(vacc, vf); - } - vacc = wasm_f32x4_add(vacc, wasm_v64x2_shuffle(vacc, vacc, 1, 1)); - float vsum = wasm_f32x4_extract_lane(vacc, 0) + wasm_f32x4_extract_lane(vacc, 1); - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - - const v128_t vi = wasm_v128_load(input); - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_relaxed_madd(vx, vlog2e, vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_hi, vx); - vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_lo, vt); - - v128_t vp = wasm_f32x4_relaxed_madd(vc5, vt, vc4); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc3); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc2); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_relaxed_madd(vt, vp, vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vf, 0); - output += 2; - - vsum += wasm_f32x4_extract_lane(vf, 0) + wasm_f32x4_extract_lane(vf, 1); - vf = wasm_v64x2_shuffle(vf, vf, 1, 1); - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vf, 0); - vsum += wasm_f32x4_extract_lane(vf, 0); - } - } - *sum = vsum; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12.c deleted file mode 100644 index ccac3bd9a85..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u12.c +++ /dev/null @@ -1,200 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const v128_t vlog2e = wasm_f32x4_const_splat(0x1.715476p+0f); - const v128_t vmagic_bias = wasm_f32x4_const_splat(0x1.8000FEp23f); - const v128_t vminus_ln2_hi = wasm_f32x4_const_splat(-0x1.62E400p-1f); - const v128_t vminus_ln2_lo = wasm_f32x4_const_splat(-0x1.7F7D1Cp-20f); - const v128_t vc5 = wasm_f32x4_const_splat(0x1.0F9F9Cp-7f); - const v128_t vc4 = wasm_f32x4_const_splat(0x1.573A1Ap-5f); - const v128_t vc3 = wasm_f32x4_const_splat(0x1.555A80p-3f); - const v128_t vc2 = wasm_f32x4_const_splat(0x1.FFFDC6p-2f); - const v128_t vc1 = wasm_f32x4_const_splat(0x1.FFFFF6p-1f); - const v128_t vdenorm_cutoff = wasm_f32x4_const_splat(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const v128_t vi_max = wasm_v128_load32_splat(max); - - v128_t vacc0 = wasm_f32x4_const_splat(0.0f); - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - // Load 12 (3x4) inputs at a time. 
- const v128_t vi0123 = wasm_v128_load(input); - const v128_t vi4567 = wasm_v128_load(input + 4); - const v128_t vi89AB = wasm_v128_load(input + 8); - input += 12; - - const v128_t vx0123 = wasm_f32x4_sub(vi0123, vi_max); - const v128_t vx4567 = wasm_f32x4_sub(vi4567, vi_max); - const v128_t vx89AB = wasm_f32x4_sub(vi89AB, vi_max); - - v128_t vn0123 = wasm_f32x4_relaxed_madd(vx0123, vlog2e, vmagic_bias); - v128_t vn4567 = wasm_f32x4_relaxed_madd(vx4567, vlog2e, vmagic_bias); - v128_t vn89AB = wasm_f32x4_relaxed_madd(vx89AB, vlog2e, vmagic_bias); - - const v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); - const v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); - const v128_t vs89AB = wasm_i32x4_shl(vn89AB, 23); - - vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); - vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); - vn89AB = wasm_f32x4_sub(vn89AB, vmagic_bias); - - v128_t vt0123 = wasm_f32x4_relaxed_madd(vn0123, vminus_ln2_hi, vx0123); - v128_t vt4567 = wasm_f32x4_relaxed_madd(vn4567, vminus_ln2_hi, vx4567); - v128_t vt89AB = wasm_f32x4_relaxed_madd(vn89AB, vminus_ln2_hi, vx89AB); - - vt0123 = wasm_f32x4_relaxed_madd(vn0123, vminus_ln2_lo, vt0123); - vt4567 = wasm_f32x4_relaxed_madd(vn4567, vminus_ln2_lo, vt4567); - vt89AB = wasm_f32x4_relaxed_madd(vn89AB, vminus_ln2_lo, vt89AB); - - v128_t vp0123 = wasm_f32x4_relaxed_madd(vc5, vt0123, vc4); - v128_t vp4567 = wasm_f32x4_relaxed_madd(vc5, vt4567, vc4); - v128_t vp89AB = wasm_f32x4_relaxed_madd(vc5, vt89AB, vc4); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc3); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc3); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc3); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc2); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc2); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc2); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc1); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc1); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc1); - - vt0123 = 
wasm_f32x4_mul(vt0123, vs0123); - vt4567 = wasm_f32x4_mul(vt4567, vs4567); - vt89AB = wasm_f32x4_mul(vt89AB, vs89AB); - - v128_t vf0123 = wasm_f32x4_relaxed_madd(vt0123, vp0123, vs0123); - v128_t vf4567 = wasm_f32x4_relaxed_madd(vt4567, vp4567, vs4567); - v128_t vf89AB = wasm_f32x4_relaxed_madd(vt89AB, vp89AB, vs89AB); - - vf0123 = wasm_v128_andnot(vf0123, wasm_f32x4_lt(vx0123, vdenorm_cutoff)); - vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_lt(vx4567, vdenorm_cutoff)); - vf89AB = wasm_v128_andnot(vf89AB, wasm_f32x4_lt(vx89AB, vdenorm_cutoff)); - - wasm_v128_store(output, vf0123); - wasm_v128_store(output + 4, vf4567); - wasm_v128_store(output + 8, vf89AB); - output += 12; - - vacc0 = wasm_f32x4_add(vacc0, vf0123); - vacc0 = wasm_f32x4_add(vacc0, vf4567); - vacc0 = wasm_f32x4_add(vacc0, vf89AB); - } - - v128_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t vi = wasm_v128_load(input); - input += 4; - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_relaxed_madd(vx, vlog2e, vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_hi, vx); - vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_lo, vt); - - v128_t vp = wasm_f32x4_relaxed_madd(vc5, vt, vc4); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc3); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc2); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_relaxed_madd(vt, vp, vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - wasm_v128_store(output, vf); - output += 4; - - vacc = wasm_f32x4_add(vacc, vf); - } - vacc = wasm_f32x4_add(vacc, wasm_v64x2_shuffle(vacc, vacc, 1, 1)); - float vsum = wasm_f32x4_extract_lane(vacc, 0) + wasm_f32x4_extract_lane(vacc, 1); - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - - const v128_t vi = 
wasm_v128_load(input); - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_relaxed_madd(vx, vlog2e, vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_hi, vx); - vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_lo, vt); - - v128_t vp = wasm_f32x4_relaxed_madd(vc5, vt, vc4); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc3); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc2); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_relaxed_madd(vt, vp, vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vf, 0); - output += 2; - - vsum += wasm_f32x4_extract_lane(vf, 0) + wasm_f32x4_extract_lane(vf, 1); - vf = wasm_v64x2_shuffle(vf, vf, 1, 1); - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vf, 0); - vsum += wasm_f32x4_extract_lane(vf, 0); - } - } - *sum = vsum; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u16.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u16.c deleted file mode 100644 index 280f8fcb996..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u16.c +++ /dev/null @@ -1,216 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const v128_t vlog2e = wasm_f32x4_const_splat(0x1.715476p+0f); - const v128_t vmagic_bias = wasm_f32x4_const_splat(0x1.8000FEp23f); - const v128_t vminus_ln2_hi = wasm_f32x4_const_splat(-0x1.62E400p-1f); - const v128_t vminus_ln2_lo = wasm_f32x4_const_splat(-0x1.7F7D1Cp-20f); - const v128_t vc5 = wasm_f32x4_const_splat(0x1.0F9F9Cp-7f); - const v128_t vc4 = wasm_f32x4_const_splat(0x1.573A1Ap-5f); - const v128_t vc3 = wasm_f32x4_const_splat(0x1.555A80p-3f); - const v128_t vc2 = wasm_f32x4_const_splat(0x1.FFFDC6p-2f); - const v128_t vc1 = wasm_f32x4_const_splat(0x1.FFFFF6p-1f); - const v128_t vdenorm_cutoff = wasm_f32x4_const_splat(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const v128_t vi_max = wasm_v128_load32_splat(max); - - v128_t vacc0 = wasm_f32x4_const_splat(0.0f); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - // Load 16 (4x4) inputs at a time. 
- const v128_t vi0123 = wasm_v128_load(input); - const v128_t vi4567 = wasm_v128_load(input + 4); - const v128_t vi89AB = wasm_v128_load(input + 8); - const v128_t viCDEF = wasm_v128_load(input + 12); - input += 16; - - const v128_t vx0123 = wasm_f32x4_sub(vi0123, vi_max); - const v128_t vx4567 = wasm_f32x4_sub(vi4567, vi_max); - const v128_t vx89AB = wasm_f32x4_sub(vi89AB, vi_max); - const v128_t vxCDEF = wasm_f32x4_sub(viCDEF, vi_max); - - v128_t vn0123 = wasm_f32x4_relaxed_madd(vx0123, vlog2e, vmagic_bias); - v128_t vn4567 = wasm_f32x4_relaxed_madd(vx4567, vlog2e, vmagic_bias); - v128_t vn89AB = wasm_f32x4_relaxed_madd(vx89AB, vlog2e, vmagic_bias); - v128_t vnCDEF = wasm_f32x4_relaxed_madd(vxCDEF, vlog2e, vmagic_bias); - - const v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); - const v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); - const v128_t vs89AB = wasm_i32x4_shl(vn89AB, 23); - const v128_t vsCDEF = wasm_i32x4_shl(vnCDEF, 23); - - vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); - vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); - vn89AB = wasm_f32x4_sub(vn89AB, vmagic_bias); - vnCDEF = wasm_f32x4_sub(vnCDEF, vmagic_bias); - - v128_t vt0123 = wasm_f32x4_relaxed_madd(vn0123, vminus_ln2_hi, vx0123); - v128_t vt4567 = wasm_f32x4_relaxed_madd(vn4567, vminus_ln2_hi, vx4567); - v128_t vt89AB = wasm_f32x4_relaxed_madd(vn89AB, vminus_ln2_hi, vx89AB); - v128_t vtCDEF = wasm_f32x4_relaxed_madd(vnCDEF, vminus_ln2_hi, vxCDEF); - - vt0123 = wasm_f32x4_relaxed_madd(vn0123, vminus_ln2_lo, vt0123); - vt4567 = wasm_f32x4_relaxed_madd(vn4567, vminus_ln2_lo, vt4567); - vt89AB = wasm_f32x4_relaxed_madd(vn89AB, vminus_ln2_lo, vt89AB); - vtCDEF = wasm_f32x4_relaxed_madd(vnCDEF, vminus_ln2_lo, vtCDEF); - - v128_t vp0123 = wasm_f32x4_relaxed_madd(vc5, vt0123, vc4); - v128_t vp4567 = wasm_f32x4_relaxed_madd(vc5, vt4567, vc4); - v128_t vp89AB = wasm_f32x4_relaxed_madd(vc5, vt89AB, vc4); - v128_t vpCDEF = wasm_f32x4_relaxed_madd(vc5, vtCDEF, vc4); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, 
vt0123, vc3); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc3); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc3); - vpCDEF = wasm_f32x4_relaxed_madd(vpCDEF, vtCDEF, vc3); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc2); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc2); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc2); - vpCDEF = wasm_f32x4_relaxed_madd(vpCDEF, vtCDEF, vc2); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc1); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc1); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc1); - vpCDEF = wasm_f32x4_relaxed_madd(vpCDEF, vtCDEF, vc1); - - vt0123 = wasm_f32x4_mul(vt0123, vs0123); - vt4567 = wasm_f32x4_mul(vt4567, vs4567); - vt89AB = wasm_f32x4_mul(vt89AB, vs89AB); - vtCDEF = wasm_f32x4_mul(vtCDEF, vsCDEF); - - v128_t vf0123 = wasm_f32x4_relaxed_madd(vt0123, vp0123, vs0123); - v128_t vf4567 = wasm_f32x4_relaxed_madd(vt4567, vp4567, vs4567); - v128_t vf89AB = wasm_f32x4_relaxed_madd(vt89AB, vp89AB, vs89AB); - v128_t vfCDEF = wasm_f32x4_relaxed_madd(vtCDEF, vpCDEF, vsCDEF); - - vf0123 = wasm_v128_andnot(vf0123, wasm_f32x4_lt(vx0123, vdenorm_cutoff)); - vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_lt(vx4567, vdenorm_cutoff)); - vf89AB = wasm_v128_andnot(vf89AB, wasm_f32x4_lt(vx89AB, vdenorm_cutoff)); - vfCDEF = wasm_v128_andnot(vfCDEF, wasm_f32x4_lt(vxCDEF, vdenorm_cutoff)); - - wasm_v128_store(output, vf0123); - wasm_v128_store(output + 4, vf4567); - wasm_v128_store(output + 8, vf89AB); - wasm_v128_store(output + 12, vfCDEF); - output += 16; - - vacc0 = wasm_f32x4_add(vacc0, vf0123); - vacc0 = wasm_f32x4_add(vacc0, vf4567); - vacc0 = wasm_f32x4_add(vacc0, vf89AB); - vacc0 = wasm_f32x4_add(vacc0, vfCDEF); - } - - v128_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t vi = wasm_v128_load(input); - input += 4; - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_relaxed_madd(vx, vlog2e, 
vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_hi, vx); - vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_lo, vt); - - v128_t vp = wasm_f32x4_relaxed_madd(vc5, vt, vc4); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc3); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc2); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_relaxed_madd(vt, vp, vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - wasm_v128_store(output, vf); - output += 4; - - vacc = wasm_f32x4_add(vacc, vf); - } - vacc = wasm_f32x4_add(vacc, wasm_v64x2_shuffle(vacc, vacc, 1, 1)); - float vsum = wasm_f32x4_extract_lane(vacc, 0) + wasm_f32x4_extract_lane(vacc, 1); - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - - const v128_t vi = wasm_v128_load(input); - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_relaxed_madd(vx, vlog2e, vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_hi, vx); - vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_lo, vt); - - v128_t vp = wasm_f32x4_relaxed_madd(vc5, vt, vc4); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc3); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc2); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_relaxed_madd(vt, vp, vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vf, 0); - output += 2; - - vsum += wasm_f32x4_extract_lane(vf, 0) + wasm_f32x4_extract_lane(vf, 1); - vf = wasm_v64x2_shuffle(vf, vf, 1, 1); - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vf, 0); - vsum += wasm_f32x4_extract_lane(vf, 0); - } - } - *sum = vsum; -} diff --git 
a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20-acc2.c deleted file mode 100644 index 712adfef9df..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20-acc2.c +++ /dev/null @@ -1,235 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const v128_t vlog2e = wasm_f32x4_const_splat(0x1.715476p+0f); - const v128_t vmagic_bias = wasm_f32x4_const_splat(0x1.8000FEp23f); - const v128_t vminus_ln2_hi = wasm_f32x4_const_splat(-0x1.62E400p-1f); - const v128_t vminus_ln2_lo = wasm_f32x4_const_splat(-0x1.7F7D1Cp-20f); - const v128_t vc5 = wasm_f32x4_const_splat(0x1.0F9F9Cp-7f); - const v128_t vc4 = wasm_f32x4_const_splat(0x1.573A1Ap-5f); - const v128_t vc3 = wasm_f32x4_const_splat(0x1.555A80p-3f); - const v128_t vc2 = wasm_f32x4_const_splat(0x1.FFFDC6p-2f); - const v128_t vc1 = wasm_f32x4_const_splat(0x1.FFFFF6p-1f); - const v128_t vdenorm_cutoff = wasm_f32x4_const_splat(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - 
XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const v128_t vi_max = wasm_v128_load32_splat(max); - - v128_t vacc0 = wasm_f32x4_const_splat(0.0f); - v128_t vacc1 = vacc0; - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - // Load 20 (5x4) inputs at a time. - const v128_t vi0123 = wasm_v128_load(input); - const v128_t vi4567 = wasm_v128_load(input + 4); - const v128_t vi89AB = wasm_v128_load(input + 8); - const v128_t viCDEF = wasm_v128_load(input + 12); - const v128_t viGHIJ = wasm_v128_load(input + 16); - input += 20; - - const v128_t vx0123 = wasm_f32x4_sub(vi0123, vi_max); - const v128_t vx4567 = wasm_f32x4_sub(vi4567, vi_max); - const v128_t vx89AB = wasm_f32x4_sub(vi89AB, vi_max); - const v128_t vxCDEF = wasm_f32x4_sub(viCDEF, vi_max); - const v128_t vxGHIJ = wasm_f32x4_sub(viGHIJ, vi_max); - - v128_t vn0123 = wasm_f32x4_relaxed_madd(vx0123, vlog2e, vmagic_bias); - v128_t vn4567 = wasm_f32x4_relaxed_madd(vx4567, vlog2e, vmagic_bias); - v128_t vn89AB = wasm_f32x4_relaxed_madd(vx89AB, vlog2e, vmagic_bias); - v128_t vnCDEF = wasm_f32x4_relaxed_madd(vxCDEF, vlog2e, vmagic_bias); - v128_t vnGHIJ = wasm_f32x4_relaxed_madd(vxGHIJ, vlog2e, vmagic_bias); - - const v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); - const v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); - const v128_t vs89AB = wasm_i32x4_shl(vn89AB, 23); - const v128_t vsCDEF = wasm_i32x4_shl(vnCDEF, 23); - const v128_t vsGHIJ = wasm_i32x4_shl(vnGHIJ, 23); - - vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); - vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); - vn89AB = wasm_f32x4_sub(vn89AB, vmagic_bias); - vnCDEF = wasm_f32x4_sub(vnCDEF, vmagic_bias); - vnGHIJ = wasm_f32x4_sub(vnGHIJ, vmagic_bias); - - v128_t vt0123 = wasm_f32x4_relaxed_madd(vn0123, vminus_ln2_hi, vx0123); - v128_t vt4567 = 
wasm_f32x4_relaxed_madd(vn4567, vminus_ln2_hi, vx4567); - v128_t vt89AB = wasm_f32x4_relaxed_madd(vn89AB, vminus_ln2_hi, vx89AB); - v128_t vtCDEF = wasm_f32x4_relaxed_madd(vnCDEF, vminus_ln2_hi, vxCDEF); - v128_t vtGHIJ = wasm_f32x4_relaxed_madd(vnGHIJ, vminus_ln2_hi, vxGHIJ); - - vt0123 = wasm_f32x4_relaxed_madd(vn0123, vminus_ln2_lo, vt0123); - vt4567 = wasm_f32x4_relaxed_madd(vn4567, vminus_ln2_lo, vt4567); - vt89AB = wasm_f32x4_relaxed_madd(vn89AB, vminus_ln2_lo, vt89AB); - vtCDEF = wasm_f32x4_relaxed_madd(vnCDEF, vminus_ln2_lo, vtCDEF); - vtGHIJ = wasm_f32x4_relaxed_madd(vnGHIJ, vminus_ln2_lo, vtGHIJ); - - v128_t vp0123 = wasm_f32x4_relaxed_madd(vc5, vt0123, vc4); - v128_t vp4567 = wasm_f32x4_relaxed_madd(vc5, vt4567, vc4); - v128_t vp89AB = wasm_f32x4_relaxed_madd(vc5, vt89AB, vc4); - v128_t vpCDEF = wasm_f32x4_relaxed_madd(vc5, vtCDEF, vc4); - v128_t vpGHIJ = wasm_f32x4_relaxed_madd(vc5, vtGHIJ, vc4); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc3); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc3); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc3); - vpCDEF = wasm_f32x4_relaxed_madd(vpCDEF, vtCDEF, vc3); - vpGHIJ = wasm_f32x4_relaxed_madd(vpGHIJ, vtGHIJ, vc3); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc2); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc2); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc2); - vpCDEF = wasm_f32x4_relaxed_madd(vpCDEF, vtCDEF, vc2); - vpGHIJ = wasm_f32x4_relaxed_madd(vpGHIJ, vtGHIJ, vc2); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc1); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc1); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc1); - vpCDEF = wasm_f32x4_relaxed_madd(vpCDEF, vtCDEF, vc1); - vpGHIJ = wasm_f32x4_relaxed_madd(vpGHIJ, vtGHIJ, vc1); - - vt0123 = wasm_f32x4_mul(vt0123, vs0123); - vt4567 = wasm_f32x4_mul(vt4567, vs4567); - vt89AB = wasm_f32x4_mul(vt89AB, vs89AB); - vtCDEF = wasm_f32x4_mul(vtCDEF, vsCDEF); - vtGHIJ = wasm_f32x4_mul(vtGHIJ, vsGHIJ); 
- - v128_t vf0123 = wasm_f32x4_relaxed_madd(vt0123, vp0123, vs0123); - v128_t vf4567 = wasm_f32x4_relaxed_madd(vt4567, vp4567, vs4567); - v128_t vf89AB = wasm_f32x4_relaxed_madd(vt89AB, vp89AB, vs89AB); - v128_t vfCDEF = wasm_f32x4_relaxed_madd(vtCDEF, vpCDEF, vsCDEF); - v128_t vfGHIJ = wasm_f32x4_relaxed_madd(vtGHIJ, vpGHIJ, vsGHIJ); - - vf0123 = wasm_v128_andnot(vf0123, wasm_f32x4_lt(vx0123, vdenorm_cutoff)); - vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_lt(vx4567, vdenorm_cutoff)); - vf89AB = wasm_v128_andnot(vf89AB, wasm_f32x4_lt(vx89AB, vdenorm_cutoff)); - vfCDEF = wasm_v128_andnot(vfCDEF, wasm_f32x4_lt(vxCDEF, vdenorm_cutoff)); - vfGHIJ = wasm_v128_andnot(vfGHIJ, wasm_f32x4_lt(vxGHIJ, vdenorm_cutoff)); - - wasm_v128_store(output, vf0123); - wasm_v128_store(output + 4, vf4567); - wasm_v128_store(output + 8, vf89AB); - wasm_v128_store(output + 12, vfCDEF); - wasm_v128_store(output + 16, vfGHIJ); - output += 20; - - vacc0 = wasm_f32x4_add(vacc0, vf0123); - vacc0 = wasm_f32x4_add(vacc0, vf4567); - vacc0 = wasm_f32x4_add(vacc0, vf89AB); - vacc0 = wasm_f32x4_add(vacc0, vfCDEF); - vacc0 = wasm_f32x4_add(vacc0, vfGHIJ); - } - // Add up all accumulators to vacc0 - vacc0 = wasm_f32x4_add(vacc0, vacc1); - - v128_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t vi = wasm_v128_load(input); - input += 4; - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_relaxed_madd(vx, vlog2e, vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_hi, vx); - vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_lo, vt); - - v128_t vp = wasm_f32x4_relaxed_madd(vc5, vt, vc4); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc3); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc2); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_relaxed_madd(vt, vp, vs); - - vf = wasm_v128_andnot(vf, 
wasm_f32x4_lt(vx, vdenorm_cutoff)); - - wasm_v128_store(output, vf); - output += 4; - - vacc = wasm_f32x4_add(vacc, vf); - } - vacc = wasm_f32x4_add(vacc, wasm_v64x2_shuffle(vacc, vacc, 1, 1)); - float vsum = wasm_f32x4_extract_lane(vacc, 0) + wasm_f32x4_extract_lane(vacc, 1); - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - - const v128_t vi = wasm_v128_load(input); - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_relaxed_madd(vx, vlog2e, vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_hi, vx); - vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_lo, vt); - - v128_t vp = wasm_f32x4_relaxed_madd(vc5, vt, vc4); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc3); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc2); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_relaxed_madd(vt, vp, vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vf, 0); - output += 2; - - vsum += wasm_f32x4_extract_lane(vf, 0) + wasm_f32x4_extract_lane(vf, 1); - vf = wasm_v64x2_shuffle(vf, vf, 1, 1); - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vf, 0); - vsum += wasm_f32x4_extract_lane(vf, 0); - } - } - *sum = vsum; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20-acc5.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20-acc5.c deleted file mode 100644 index 2d1e47dcf28..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20-acc5.c +++ /dev/null @@ -1,241 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20_acc5( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const v128_t vlog2e = wasm_f32x4_const_splat(0x1.715476p+0f); - const v128_t vmagic_bias = wasm_f32x4_const_splat(0x1.8000FEp23f); - const v128_t vminus_ln2_hi = wasm_f32x4_const_splat(-0x1.62E400p-1f); - const v128_t vminus_ln2_lo = wasm_f32x4_const_splat(-0x1.7F7D1Cp-20f); - const v128_t vc5 = wasm_f32x4_const_splat(0x1.0F9F9Cp-7f); - const v128_t vc4 = wasm_f32x4_const_splat(0x1.573A1Ap-5f); - const v128_t vc3 = wasm_f32x4_const_splat(0x1.555A80p-3f); - const v128_t vc2 = wasm_f32x4_const_splat(0x1.FFFDC6p-2f); - const v128_t vc1 = wasm_f32x4_const_splat(0x1.FFFFF6p-1f); - const v128_t vdenorm_cutoff = wasm_f32x4_const_splat(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const v128_t vi_max = wasm_v128_load32_splat(max); - - v128_t vacc0 = wasm_f32x4_const_splat(0.0f); - v128_t vacc1 = vacc0; - v128_t vacc2 = vacc0; - v128_t vacc3 = vacc0; - v128_t vacc4 = vacc0; - for (; batch 
>= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - // Load 20 (5x4) inputs at a time. - const v128_t vi0123 = wasm_v128_load(input); - const v128_t vi4567 = wasm_v128_load(input + 4); - const v128_t vi89AB = wasm_v128_load(input + 8); - const v128_t viCDEF = wasm_v128_load(input + 12); - const v128_t viGHIJ = wasm_v128_load(input + 16); - input += 20; - - const v128_t vx0123 = wasm_f32x4_sub(vi0123, vi_max); - const v128_t vx4567 = wasm_f32x4_sub(vi4567, vi_max); - const v128_t vx89AB = wasm_f32x4_sub(vi89AB, vi_max); - const v128_t vxCDEF = wasm_f32x4_sub(viCDEF, vi_max); - const v128_t vxGHIJ = wasm_f32x4_sub(viGHIJ, vi_max); - - v128_t vn0123 = wasm_f32x4_relaxed_madd(vx0123, vlog2e, vmagic_bias); - v128_t vn4567 = wasm_f32x4_relaxed_madd(vx4567, vlog2e, vmagic_bias); - v128_t vn89AB = wasm_f32x4_relaxed_madd(vx89AB, vlog2e, vmagic_bias); - v128_t vnCDEF = wasm_f32x4_relaxed_madd(vxCDEF, vlog2e, vmagic_bias); - v128_t vnGHIJ = wasm_f32x4_relaxed_madd(vxGHIJ, vlog2e, vmagic_bias); - - const v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); - const v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); - const v128_t vs89AB = wasm_i32x4_shl(vn89AB, 23); - const v128_t vsCDEF = wasm_i32x4_shl(vnCDEF, 23); - const v128_t vsGHIJ = wasm_i32x4_shl(vnGHIJ, 23); - - vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); - vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); - vn89AB = wasm_f32x4_sub(vn89AB, vmagic_bias); - vnCDEF = wasm_f32x4_sub(vnCDEF, vmagic_bias); - vnGHIJ = wasm_f32x4_sub(vnGHIJ, vmagic_bias); - - v128_t vt0123 = wasm_f32x4_relaxed_madd(vn0123, vminus_ln2_hi, vx0123); - v128_t vt4567 = wasm_f32x4_relaxed_madd(vn4567, vminus_ln2_hi, vx4567); - v128_t vt89AB = wasm_f32x4_relaxed_madd(vn89AB, vminus_ln2_hi, vx89AB); - v128_t vtCDEF = wasm_f32x4_relaxed_madd(vnCDEF, vminus_ln2_hi, vxCDEF); - v128_t vtGHIJ = wasm_f32x4_relaxed_madd(vnGHIJ, vminus_ln2_hi, vxGHIJ); - - vt0123 = wasm_f32x4_relaxed_madd(vn0123, vminus_ln2_lo, vt0123); - vt4567 = wasm_f32x4_relaxed_madd(vn4567, 
vminus_ln2_lo, vt4567); - vt89AB = wasm_f32x4_relaxed_madd(vn89AB, vminus_ln2_lo, vt89AB); - vtCDEF = wasm_f32x4_relaxed_madd(vnCDEF, vminus_ln2_lo, vtCDEF); - vtGHIJ = wasm_f32x4_relaxed_madd(vnGHIJ, vminus_ln2_lo, vtGHIJ); - - v128_t vp0123 = wasm_f32x4_relaxed_madd(vc5, vt0123, vc4); - v128_t vp4567 = wasm_f32x4_relaxed_madd(vc5, vt4567, vc4); - v128_t vp89AB = wasm_f32x4_relaxed_madd(vc5, vt89AB, vc4); - v128_t vpCDEF = wasm_f32x4_relaxed_madd(vc5, vtCDEF, vc4); - v128_t vpGHIJ = wasm_f32x4_relaxed_madd(vc5, vtGHIJ, vc4); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc3); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc3); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc3); - vpCDEF = wasm_f32x4_relaxed_madd(vpCDEF, vtCDEF, vc3); - vpGHIJ = wasm_f32x4_relaxed_madd(vpGHIJ, vtGHIJ, vc3); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc2); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc2); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc2); - vpCDEF = wasm_f32x4_relaxed_madd(vpCDEF, vtCDEF, vc2); - vpGHIJ = wasm_f32x4_relaxed_madd(vpGHIJ, vtGHIJ, vc2); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc1); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc1); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc1); - vpCDEF = wasm_f32x4_relaxed_madd(vpCDEF, vtCDEF, vc1); - vpGHIJ = wasm_f32x4_relaxed_madd(vpGHIJ, vtGHIJ, vc1); - - vt0123 = wasm_f32x4_mul(vt0123, vs0123); - vt4567 = wasm_f32x4_mul(vt4567, vs4567); - vt89AB = wasm_f32x4_mul(vt89AB, vs89AB); - vtCDEF = wasm_f32x4_mul(vtCDEF, vsCDEF); - vtGHIJ = wasm_f32x4_mul(vtGHIJ, vsGHIJ); - - v128_t vf0123 = wasm_f32x4_relaxed_madd(vt0123, vp0123, vs0123); - v128_t vf4567 = wasm_f32x4_relaxed_madd(vt4567, vp4567, vs4567); - v128_t vf89AB = wasm_f32x4_relaxed_madd(vt89AB, vp89AB, vs89AB); - v128_t vfCDEF = wasm_f32x4_relaxed_madd(vtCDEF, vpCDEF, vsCDEF); - v128_t vfGHIJ = wasm_f32x4_relaxed_madd(vtGHIJ, vpGHIJ, vsGHIJ); - - vf0123 = wasm_v128_andnot(vf0123, 
wasm_f32x4_lt(vx0123, vdenorm_cutoff)); - vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_lt(vx4567, vdenorm_cutoff)); - vf89AB = wasm_v128_andnot(vf89AB, wasm_f32x4_lt(vx89AB, vdenorm_cutoff)); - vfCDEF = wasm_v128_andnot(vfCDEF, wasm_f32x4_lt(vxCDEF, vdenorm_cutoff)); - vfGHIJ = wasm_v128_andnot(vfGHIJ, wasm_f32x4_lt(vxGHIJ, vdenorm_cutoff)); - - wasm_v128_store(output, vf0123); - wasm_v128_store(output + 4, vf4567); - wasm_v128_store(output + 8, vf89AB); - wasm_v128_store(output + 12, vfCDEF); - wasm_v128_store(output + 16, vfGHIJ); - output += 20; - - vacc0 = wasm_f32x4_add(vacc0, vf0123); - vacc4 = wasm_f32x4_add(vacc4, vf4567); - vacc3 = wasm_f32x4_add(vacc3, vf89AB); - vacc2 = wasm_f32x4_add(vacc2, vfCDEF); - vacc1 = wasm_f32x4_add(vacc1, vfGHIJ); - } - // Add up all accumulators to vacc0 - vacc0 = wasm_f32x4_add(vacc0, vacc1); - vacc2 = wasm_f32x4_add(vacc2, vacc3); - vacc0 = wasm_f32x4_add(vacc0, vacc2); - vacc0 = wasm_f32x4_add(vacc0, vacc4); - - v128_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t vi = wasm_v128_load(input); - input += 4; - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_relaxed_madd(vx, vlog2e, vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_hi, vx); - vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_lo, vt); - - v128_t vp = wasm_f32x4_relaxed_madd(vc5, vt, vc4); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc3); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc2); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_relaxed_madd(vt, vp, vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - wasm_v128_store(output, vf); - output += 4; - - vacc = wasm_f32x4_add(vacc, vf); - } - vacc = wasm_f32x4_add(vacc, wasm_v64x2_shuffle(vacc, vacc, 1, 1)); - float vsum = wasm_f32x4_extract_lane(vacc, 0) + 
wasm_f32x4_extract_lane(vacc, 1); - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - - const v128_t vi = wasm_v128_load(input); - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_relaxed_madd(vx, vlog2e, vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_hi, vx); - vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_lo, vt); - - v128_t vp = wasm_f32x4_relaxed_madd(vc5, vt, vc4); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc3); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc2); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_relaxed_madd(vt, vp, vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vf, 0); - output += 2; - - vsum += wasm_f32x4_extract_lane(vf, 0) + wasm_f32x4_extract_lane(vf, 1); - vf = wasm_v64x2_shuffle(vf, vf, 1, 1); - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vf, 0); - vsum += wasm_f32x4_extract_lane(vf, 0); - } - } - *sum = vsum; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20.c deleted file mode 100644 index 657c29f95cc..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u20.c +++ /dev/null @@ -1,232 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const v128_t vlog2e = wasm_f32x4_const_splat(0x1.715476p+0f); - const v128_t vmagic_bias = wasm_f32x4_const_splat(0x1.8000FEp23f); - const v128_t vminus_ln2_hi = wasm_f32x4_const_splat(-0x1.62E400p-1f); - const v128_t vminus_ln2_lo = wasm_f32x4_const_splat(-0x1.7F7D1Cp-20f); - const v128_t vc5 = wasm_f32x4_const_splat(0x1.0F9F9Cp-7f); - const v128_t vc4 = wasm_f32x4_const_splat(0x1.573A1Ap-5f); - const v128_t vc3 = wasm_f32x4_const_splat(0x1.555A80p-3f); - const v128_t vc2 = wasm_f32x4_const_splat(0x1.FFFDC6p-2f); - const v128_t vc1 = wasm_f32x4_const_splat(0x1.FFFFF6p-1f); - const v128_t vdenorm_cutoff = wasm_f32x4_const_splat(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const v128_t vi_max = wasm_v128_load32_splat(max); - - v128_t vacc0 = wasm_f32x4_const_splat(0.0f); - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - // Load 20 (5x4) inputs at a time. 
- const v128_t vi0123 = wasm_v128_load(input); - const v128_t vi4567 = wasm_v128_load(input + 4); - const v128_t vi89AB = wasm_v128_load(input + 8); - const v128_t viCDEF = wasm_v128_load(input + 12); - const v128_t viGHIJ = wasm_v128_load(input + 16); - input += 20; - - const v128_t vx0123 = wasm_f32x4_sub(vi0123, vi_max); - const v128_t vx4567 = wasm_f32x4_sub(vi4567, vi_max); - const v128_t vx89AB = wasm_f32x4_sub(vi89AB, vi_max); - const v128_t vxCDEF = wasm_f32x4_sub(viCDEF, vi_max); - const v128_t vxGHIJ = wasm_f32x4_sub(viGHIJ, vi_max); - - v128_t vn0123 = wasm_f32x4_relaxed_madd(vx0123, vlog2e, vmagic_bias); - v128_t vn4567 = wasm_f32x4_relaxed_madd(vx4567, vlog2e, vmagic_bias); - v128_t vn89AB = wasm_f32x4_relaxed_madd(vx89AB, vlog2e, vmagic_bias); - v128_t vnCDEF = wasm_f32x4_relaxed_madd(vxCDEF, vlog2e, vmagic_bias); - v128_t vnGHIJ = wasm_f32x4_relaxed_madd(vxGHIJ, vlog2e, vmagic_bias); - - const v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); - const v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); - const v128_t vs89AB = wasm_i32x4_shl(vn89AB, 23); - const v128_t vsCDEF = wasm_i32x4_shl(vnCDEF, 23); - const v128_t vsGHIJ = wasm_i32x4_shl(vnGHIJ, 23); - - vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); - vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); - vn89AB = wasm_f32x4_sub(vn89AB, vmagic_bias); - vnCDEF = wasm_f32x4_sub(vnCDEF, vmagic_bias); - vnGHIJ = wasm_f32x4_sub(vnGHIJ, vmagic_bias); - - v128_t vt0123 = wasm_f32x4_relaxed_madd(vn0123, vminus_ln2_hi, vx0123); - v128_t vt4567 = wasm_f32x4_relaxed_madd(vn4567, vminus_ln2_hi, vx4567); - v128_t vt89AB = wasm_f32x4_relaxed_madd(vn89AB, vminus_ln2_hi, vx89AB); - v128_t vtCDEF = wasm_f32x4_relaxed_madd(vnCDEF, vminus_ln2_hi, vxCDEF); - v128_t vtGHIJ = wasm_f32x4_relaxed_madd(vnGHIJ, vminus_ln2_hi, vxGHIJ); - - vt0123 = wasm_f32x4_relaxed_madd(vn0123, vminus_ln2_lo, vt0123); - vt4567 = wasm_f32x4_relaxed_madd(vn4567, vminus_ln2_lo, vt4567); - vt89AB = wasm_f32x4_relaxed_madd(vn89AB, vminus_ln2_lo, vt89AB); - vtCDEF 
= wasm_f32x4_relaxed_madd(vnCDEF, vminus_ln2_lo, vtCDEF); - vtGHIJ = wasm_f32x4_relaxed_madd(vnGHIJ, vminus_ln2_lo, vtGHIJ); - - v128_t vp0123 = wasm_f32x4_relaxed_madd(vc5, vt0123, vc4); - v128_t vp4567 = wasm_f32x4_relaxed_madd(vc5, vt4567, vc4); - v128_t vp89AB = wasm_f32x4_relaxed_madd(vc5, vt89AB, vc4); - v128_t vpCDEF = wasm_f32x4_relaxed_madd(vc5, vtCDEF, vc4); - v128_t vpGHIJ = wasm_f32x4_relaxed_madd(vc5, vtGHIJ, vc4); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc3); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc3); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc3); - vpCDEF = wasm_f32x4_relaxed_madd(vpCDEF, vtCDEF, vc3); - vpGHIJ = wasm_f32x4_relaxed_madd(vpGHIJ, vtGHIJ, vc3); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc2); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc2); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc2); - vpCDEF = wasm_f32x4_relaxed_madd(vpCDEF, vtCDEF, vc2); - vpGHIJ = wasm_f32x4_relaxed_madd(vpGHIJ, vtGHIJ, vc2); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc1); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc1); - vp89AB = wasm_f32x4_relaxed_madd(vp89AB, vt89AB, vc1); - vpCDEF = wasm_f32x4_relaxed_madd(vpCDEF, vtCDEF, vc1); - vpGHIJ = wasm_f32x4_relaxed_madd(vpGHIJ, vtGHIJ, vc1); - - vt0123 = wasm_f32x4_mul(vt0123, vs0123); - vt4567 = wasm_f32x4_mul(vt4567, vs4567); - vt89AB = wasm_f32x4_mul(vt89AB, vs89AB); - vtCDEF = wasm_f32x4_mul(vtCDEF, vsCDEF); - vtGHIJ = wasm_f32x4_mul(vtGHIJ, vsGHIJ); - - v128_t vf0123 = wasm_f32x4_relaxed_madd(vt0123, vp0123, vs0123); - v128_t vf4567 = wasm_f32x4_relaxed_madd(vt4567, vp4567, vs4567); - v128_t vf89AB = wasm_f32x4_relaxed_madd(vt89AB, vp89AB, vs89AB); - v128_t vfCDEF = wasm_f32x4_relaxed_madd(vtCDEF, vpCDEF, vsCDEF); - v128_t vfGHIJ = wasm_f32x4_relaxed_madd(vtGHIJ, vpGHIJ, vsGHIJ); - - vf0123 = wasm_v128_andnot(vf0123, wasm_f32x4_lt(vx0123, vdenorm_cutoff)); - vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_lt(vx4567, 
vdenorm_cutoff)); - vf89AB = wasm_v128_andnot(vf89AB, wasm_f32x4_lt(vx89AB, vdenorm_cutoff)); - vfCDEF = wasm_v128_andnot(vfCDEF, wasm_f32x4_lt(vxCDEF, vdenorm_cutoff)); - vfGHIJ = wasm_v128_andnot(vfGHIJ, wasm_f32x4_lt(vxGHIJ, vdenorm_cutoff)); - - wasm_v128_store(output, vf0123); - wasm_v128_store(output + 4, vf4567); - wasm_v128_store(output + 8, vf89AB); - wasm_v128_store(output + 12, vfCDEF); - wasm_v128_store(output + 16, vfGHIJ); - output += 20; - - vacc0 = wasm_f32x4_add(vacc0, vf0123); - vacc0 = wasm_f32x4_add(vacc0, vf4567); - vacc0 = wasm_f32x4_add(vacc0, vf89AB); - vacc0 = wasm_f32x4_add(vacc0, vfCDEF); - vacc0 = wasm_f32x4_add(vacc0, vfGHIJ); - } - - v128_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t vi = wasm_v128_load(input); - input += 4; - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_relaxed_madd(vx, vlog2e, vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_hi, vx); - vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_lo, vt); - - v128_t vp = wasm_f32x4_relaxed_madd(vc5, vt, vc4); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc3); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc2); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_relaxed_madd(vt, vp, vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - wasm_v128_store(output, vf); - output += 4; - - vacc = wasm_f32x4_add(vacc, vf); - } - vacc = wasm_f32x4_add(vacc, wasm_v64x2_shuffle(vacc, vacc, 1, 1)); - float vsum = wasm_f32x4_extract_lane(vacc, 0) + wasm_f32x4_extract_lane(vacc, 1); - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - - const v128_t vi = wasm_v128_load(input); - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_relaxed_madd(vx, vlog2e, vmagic_bias); - - const v128_t vs 
= wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_hi, vx); - vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_lo, vt); - - v128_t vp = wasm_f32x4_relaxed_madd(vc5, vt, vc4); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc3); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc2); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_relaxed_madd(vt, vp, vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vf, 0); - output += 2; - - vsum += wasm_f32x4_extract_lane(vf, 0) + wasm_f32x4_extract_lane(vf, 1); - vf = wasm_v64x2_shuffle(vf, vf, 1, 1); - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vf, 0); - vsum += wasm_f32x4_extract_lane(vf, 0); - } - } - *sum = vsum; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u8.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u8.c deleted file mode 100644 index 4a56bb7ab74..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u8.c +++ /dev/null @@ -1,184 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u8( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const v128_t vlog2e = wasm_f32x4_const_splat(0x1.715476p+0f); - const v128_t vmagic_bias = wasm_f32x4_const_splat(0x1.8000FEp23f); - const v128_t vminus_ln2_hi = wasm_f32x4_const_splat(-0x1.62E400p-1f); - const v128_t vminus_ln2_lo = wasm_f32x4_const_splat(-0x1.7F7D1Cp-20f); - const v128_t vc5 = wasm_f32x4_const_splat(0x1.0F9F9Cp-7f); - const v128_t vc4 = wasm_f32x4_const_splat(0x1.573A1Ap-5f); - const v128_t vc3 = wasm_f32x4_const_splat(0x1.555A80p-3f); - const v128_t vc2 = wasm_f32x4_const_splat(0x1.FFFDC6p-2f); - const v128_t vc1 = wasm_f32x4_const_splat(0x1.FFFFF6p-1f); - const v128_t vdenorm_cutoff = wasm_f32x4_const_splat(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const v128_t vi_max = wasm_v128_load32_splat(max); - - v128_t vacc0 = wasm_f32x4_const_splat(0.0f); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - // Load 8 (2x4) inputs at a time. 
- const v128_t vi0123 = wasm_v128_load(input); - const v128_t vi4567 = wasm_v128_load(input + 4); - input += 8; - - const v128_t vx0123 = wasm_f32x4_sub(vi0123, vi_max); - const v128_t vx4567 = wasm_f32x4_sub(vi4567, vi_max); - - v128_t vn0123 = wasm_f32x4_relaxed_madd(vx0123, vlog2e, vmagic_bias); - v128_t vn4567 = wasm_f32x4_relaxed_madd(vx4567, vlog2e, vmagic_bias); - - const v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); - const v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); - - vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); - vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); - - v128_t vt0123 = wasm_f32x4_relaxed_madd(vn0123, vminus_ln2_hi, vx0123); - v128_t vt4567 = wasm_f32x4_relaxed_madd(vn4567, vminus_ln2_hi, vx4567); - - vt0123 = wasm_f32x4_relaxed_madd(vn0123, vminus_ln2_lo, vt0123); - vt4567 = wasm_f32x4_relaxed_madd(vn4567, vminus_ln2_lo, vt4567); - - v128_t vp0123 = wasm_f32x4_relaxed_madd(vc5, vt0123, vc4); - v128_t vp4567 = wasm_f32x4_relaxed_madd(vc5, vt4567, vc4); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc3); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc3); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc2); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc2); - - vp0123 = wasm_f32x4_relaxed_madd(vp0123, vt0123, vc1); - vp4567 = wasm_f32x4_relaxed_madd(vp4567, vt4567, vc1); - - vt0123 = wasm_f32x4_mul(vt0123, vs0123); - vt4567 = wasm_f32x4_mul(vt4567, vs4567); - - v128_t vf0123 = wasm_f32x4_relaxed_madd(vt0123, vp0123, vs0123); - v128_t vf4567 = wasm_f32x4_relaxed_madd(vt4567, vp4567, vs4567); - - vf0123 = wasm_v128_andnot(vf0123, wasm_f32x4_lt(vx0123, vdenorm_cutoff)); - vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_lt(vx4567, vdenorm_cutoff)); - - wasm_v128_store(output, vf0123); - wasm_v128_store(output + 4, vf4567); - output += 8; - - vacc0 = wasm_f32x4_add(vacc0, vf0123); - vacc0 = wasm_f32x4_add(vacc0, vf4567); - } - - v128_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t 
vi = wasm_v128_load(input); - input += 4; - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_relaxed_madd(vx, vlog2e, vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_hi, vx); - vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_lo, vt); - - v128_t vp = wasm_f32x4_relaxed_madd(vc5, vt, vc4); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc3); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc2); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_relaxed_madd(vt, vp, vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - wasm_v128_store(output, vf); - output += 4; - - vacc = wasm_f32x4_add(vacc, vf); - } - vacc = wasm_f32x4_add(vacc, wasm_v64x2_shuffle(vacc, vacc, 1, 1)); - float vsum = wasm_f32x4_extract_lane(vacc, 0) + wasm_f32x4_extract_lane(vacc, 1); - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - - const v128_t vi = wasm_v128_load(input); - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_relaxed_madd(vx, vlog2e, vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_hi, vx); - vt = wasm_f32x4_relaxed_madd(vn, vminus_ln2_lo, vt); - - v128_t vp = wasm_f32x4_relaxed_madd(vc5, vt, vc4); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc3); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc2); - vp = wasm_f32x4_relaxed_madd(vp, vt, vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_relaxed_madd(vt, vp, vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vf, 0); - output += 2; - - vsum += wasm_f32x4_extract_lane(vf, 0) + wasm_f32x4_extract_lane(vf, 1); - vf = wasm_v64x2_shuffle(vf, vf, 1, 1); - } - if (batch & (1 * 
sizeof(float))) { - wasm_v128_store32_lane(output, vf, 0); - vsum += wasm_f32x4_extract_lane(vf, 0); - } - } - *sum = vsum; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12-acc2.c deleted file mode 100644 index 8da5d9d5646..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12-acc2.c +++ /dev/null @@ -1,203 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const v128_t vlog2e = wasm_f32x4_const_splat(0x1.715476p+0f); - const v128_t vmagic_bias = wasm_f32x4_const_splat(0x1.8000FEp23f); - const v128_t vminus_ln2_hi = wasm_f32x4_const_splat(-0x1.62E400p-1f); - const v128_t vminus_ln2_lo = wasm_f32x4_const_splat(-0x1.7F7D1Cp-20f); - const v128_t vc5 = wasm_f32x4_const_splat(0x1.0F9F9Cp-7f); - const v128_t vc4 = wasm_f32x4_const_splat(0x1.573A1Ap-5f); - const v128_t vc3 = wasm_f32x4_const_splat(0x1.555A80p-3f); - const v128_t vc2 = wasm_f32x4_const_splat(0x1.FFFDC6p-2f); - const v128_t vc1 = wasm_f32x4_const_splat(0x1.FFFFF6p-1f); - const v128_t vdenorm_cutoff = wasm_f32x4_const_splat(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - 
XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const v128_t vi_max = wasm_v128_load32_splat(max); - - v128_t vacc0 = wasm_f32x4_const_splat(0.0f); - v128_t vacc1 = vacc0; - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - // Load 12 (3x4) inputs at a time. - const v128_t vi0123 = wasm_v128_load(input); - const v128_t vi4567 = wasm_v128_load(input + 4); - const v128_t vi89AB = wasm_v128_load(input + 8); - input += 12; - - const v128_t vx0123 = wasm_f32x4_sub(vi0123, vi_max); - const v128_t vx4567 = wasm_f32x4_sub(vi4567, vi_max); - const v128_t vx89AB = wasm_f32x4_sub(vi89AB, vi_max); - - v128_t vn0123 = wasm_f32x4_add(wasm_f32x4_mul(vx0123, vlog2e), vmagic_bias); - v128_t vn4567 = wasm_f32x4_add(wasm_f32x4_mul(vx4567, vlog2e), vmagic_bias); - v128_t vn89AB = wasm_f32x4_add(wasm_f32x4_mul(vx89AB, vlog2e), vmagic_bias); - - const v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); - const v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); - const v128_t vs89AB = wasm_i32x4_shl(vn89AB, 23); - - vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); - vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); - vn89AB = wasm_f32x4_sub(vn89AB, vmagic_bias); - - v128_t vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_hi), vx0123); - v128_t vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_hi), vx4567); - v128_t vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2_hi), vx89AB); - - vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_lo), vt0123); - vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_lo), vt4567); - vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2_lo), vt89AB); - - v128_t vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt0123), vc4); - v128_t vp4567 = 
wasm_f32x4_add(wasm_f32x4_mul(vc5, vt4567), vc4); - v128_t vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt89AB), vc4); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc3); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc3); - vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc3); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc2); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc2); - vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc2); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc1); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc1); - vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc1); - - vt0123 = wasm_f32x4_mul(vt0123, vs0123); - vt4567 = wasm_f32x4_mul(vt4567, vs4567); - vt89AB = wasm_f32x4_mul(vt89AB, vs89AB); - - v128_t vf0123 = wasm_f32x4_add(wasm_f32x4_mul(vt0123, vp0123), vs0123); - v128_t vf4567 = wasm_f32x4_add(wasm_f32x4_mul(vt4567, vp4567), vs4567); - v128_t vf89AB = wasm_f32x4_add(wasm_f32x4_mul(vt89AB, vp89AB), vs89AB); - - vf0123 = wasm_v128_andnot(vf0123, wasm_f32x4_lt(vx0123, vdenorm_cutoff)); - vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_lt(vx4567, vdenorm_cutoff)); - vf89AB = wasm_v128_andnot(vf89AB, wasm_f32x4_lt(vx89AB, vdenorm_cutoff)); - - wasm_v128_store(output, vf0123); - wasm_v128_store(output + 4, vf4567); - wasm_v128_store(output + 8, vf89AB); - output += 12; - - vacc0 = wasm_f32x4_add(vacc0, vf0123); - vacc0 = wasm_f32x4_add(vacc0, vf4567); - vacc0 = wasm_f32x4_add(vacc0, vf89AB); - } - // Add up all accumulators to vacc0 - vacc0 = wasm_f32x4_add(vacc0, vacc1); - - v128_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t vi = wasm_v128_load(input); - input += 4; - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vx, vlog2e), vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = 
wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vx); - vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); - - v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt), vc4); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - wasm_v128_store(output, vf); - output += 4; - - vacc = wasm_f32x4_add(vacc, vf); - } - vacc = wasm_f32x4_add(vacc, wasm_v64x2_shuffle(vacc, vacc, 1, 1)); - float vsum = wasm_f32x4_extract_lane(vacc, 0) + wasm_f32x4_extract_lane(vacc, 1); - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - - const v128_t vi = wasm_v128_load(input); - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vx, vlog2e), vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vx); - vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); - - v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt), vc4); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vf, 0); - output += 2; - - vsum += wasm_f32x4_extract_lane(vf, 0) + wasm_f32x4_extract_lane(vf, 1); - vf = wasm_v64x2_shuffle(vf, vf, 1, 1); - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vf, 0); - vsum += wasm_f32x4_extract_lane(vf, 0); - } - } - *sum = vsum; -} diff --git 
a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12-acc3.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12-acc3.c deleted file mode 100644 index 7eb9f9ee2e1..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12-acc3.c +++ /dev/null @@ -1,205 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12_acc3( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const v128_t vlog2e = wasm_f32x4_const_splat(0x1.715476p+0f); - const v128_t vmagic_bias = wasm_f32x4_const_splat(0x1.8000FEp23f); - const v128_t vminus_ln2_hi = wasm_f32x4_const_splat(-0x1.62E400p-1f); - const v128_t vminus_ln2_lo = wasm_f32x4_const_splat(-0x1.7F7D1Cp-20f); - const v128_t vc5 = wasm_f32x4_const_splat(0x1.0F9F9Cp-7f); - const v128_t vc4 = wasm_f32x4_const_splat(0x1.573A1Ap-5f); - const v128_t vc3 = wasm_f32x4_const_splat(0x1.555A80p-3f); - const v128_t vc2 = wasm_f32x4_const_splat(0x1.FFFDC6p-2f); - const v128_t vc1 = wasm_f32x4_const_splat(0x1.FFFFF6p-1f); - const v128_t vdenorm_cutoff = wasm_f32x4_const_splat(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - 
XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const v128_t vi_max = wasm_v128_load32_splat(max); - - v128_t vacc0 = wasm_f32x4_const_splat(0.0f); - v128_t vacc1 = vacc0; - v128_t vacc2 = vacc0; - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - // Load 12 (3x4) inputs at a time. - const v128_t vi0123 = wasm_v128_load(input); - const v128_t vi4567 = wasm_v128_load(input + 4); - const v128_t vi89AB = wasm_v128_load(input + 8); - input += 12; - - const v128_t vx0123 = wasm_f32x4_sub(vi0123, vi_max); - const v128_t vx4567 = wasm_f32x4_sub(vi4567, vi_max); - const v128_t vx89AB = wasm_f32x4_sub(vi89AB, vi_max); - - v128_t vn0123 = wasm_f32x4_add(wasm_f32x4_mul(vx0123, vlog2e), vmagic_bias); - v128_t vn4567 = wasm_f32x4_add(wasm_f32x4_mul(vx4567, vlog2e), vmagic_bias); - v128_t vn89AB = wasm_f32x4_add(wasm_f32x4_mul(vx89AB, vlog2e), vmagic_bias); - - const v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); - const v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); - const v128_t vs89AB = wasm_i32x4_shl(vn89AB, 23); - - vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); - vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); - vn89AB = wasm_f32x4_sub(vn89AB, vmagic_bias); - - v128_t vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_hi), vx0123); - v128_t vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_hi), vx4567); - v128_t vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2_hi), vx89AB); - - vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_lo), vt0123); - vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_lo), vt4567); - vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2_lo), vt89AB); - - v128_t vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt0123), vc4); - v128_t vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt4567), vc4); - v128_t vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt89AB), vc4); 
- - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc3); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc3); - vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc3); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc2); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc2); - vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc2); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc1); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc1); - vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc1); - - vt0123 = wasm_f32x4_mul(vt0123, vs0123); - vt4567 = wasm_f32x4_mul(vt4567, vs4567); - vt89AB = wasm_f32x4_mul(vt89AB, vs89AB); - - v128_t vf0123 = wasm_f32x4_add(wasm_f32x4_mul(vt0123, vp0123), vs0123); - v128_t vf4567 = wasm_f32x4_add(wasm_f32x4_mul(vt4567, vp4567), vs4567); - v128_t vf89AB = wasm_f32x4_add(wasm_f32x4_mul(vt89AB, vp89AB), vs89AB); - - vf0123 = wasm_v128_andnot(vf0123, wasm_f32x4_lt(vx0123, vdenorm_cutoff)); - vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_lt(vx4567, vdenorm_cutoff)); - vf89AB = wasm_v128_andnot(vf89AB, wasm_f32x4_lt(vx89AB, vdenorm_cutoff)); - - wasm_v128_store(output, vf0123); - wasm_v128_store(output + 4, vf4567); - wasm_v128_store(output + 8, vf89AB); - output += 12; - - vacc0 = wasm_f32x4_add(vacc0, vf0123); - vacc1 = wasm_f32x4_add(vacc1, vf4567); - vacc2 = wasm_f32x4_add(vacc2, vf89AB); - } - // Add up all accumulators to vacc0 - vacc0 = wasm_f32x4_add(vacc0, vacc1); - vacc0 = wasm_f32x4_add(vacc0, vacc2); - - v128_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t vi = wasm_v128_load(input); - input += 4; - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vx, vlog2e), vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vx); - vt = 
wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); - - v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt), vc4); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - wasm_v128_store(output, vf); - output += 4; - - vacc = wasm_f32x4_add(vacc, vf); - } - vacc = wasm_f32x4_add(vacc, wasm_v64x2_shuffle(vacc, vacc, 1, 1)); - float vsum = wasm_f32x4_extract_lane(vacc, 0) + wasm_f32x4_extract_lane(vacc, 1); - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - - const v128_t vi = wasm_v128_load(input); - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vx, vlog2e), vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vx); - vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); - - v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt), vc4); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vf, 0); - output += 2; - - vsum += wasm_f32x4_extract_lane(vf, 0) + wasm_f32x4_extract_lane(vf, 1); - vf = wasm_v64x2_shuffle(vf, vf, 1, 1); - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vf, 0); - vsum += wasm_f32x4_extract_lane(vf, 0); - } - } - *sum = vsum; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12.c 
b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12.c deleted file mode 100644 index 269e0d5f394..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u12.c +++ /dev/null @@ -1,200 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const v128_t vlog2e = wasm_f32x4_const_splat(0x1.715476p+0f); - const v128_t vmagic_bias = wasm_f32x4_const_splat(0x1.8000FEp23f); - const v128_t vminus_ln2_hi = wasm_f32x4_const_splat(-0x1.62E400p-1f); - const v128_t vminus_ln2_lo = wasm_f32x4_const_splat(-0x1.7F7D1Cp-20f); - const v128_t vc5 = wasm_f32x4_const_splat(0x1.0F9F9Cp-7f); - const v128_t vc4 = wasm_f32x4_const_splat(0x1.573A1Ap-5f); - const v128_t vc3 = wasm_f32x4_const_splat(0x1.555A80p-3f); - const v128_t vc2 = wasm_f32x4_const_splat(0x1.FFFDC6p-2f); - const v128_t vc1 = wasm_f32x4_const_splat(0x1.FFFFF6p-1f); - const v128_t vdenorm_cutoff = wasm_f32x4_const_splat(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - 
XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const v128_t vi_max = wasm_v128_load32_splat(max); - - v128_t vacc0 = wasm_f32x4_const_splat(0.0f); - for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { - // Load 12 (3x4) inputs at a time. - const v128_t vi0123 = wasm_v128_load(input); - const v128_t vi4567 = wasm_v128_load(input + 4); - const v128_t vi89AB = wasm_v128_load(input + 8); - input += 12; - - const v128_t vx0123 = wasm_f32x4_sub(vi0123, vi_max); - const v128_t vx4567 = wasm_f32x4_sub(vi4567, vi_max); - const v128_t vx89AB = wasm_f32x4_sub(vi89AB, vi_max); - - v128_t vn0123 = wasm_f32x4_add(wasm_f32x4_mul(vx0123, vlog2e), vmagic_bias); - v128_t vn4567 = wasm_f32x4_add(wasm_f32x4_mul(vx4567, vlog2e), vmagic_bias); - v128_t vn89AB = wasm_f32x4_add(wasm_f32x4_mul(vx89AB, vlog2e), vmagic_bias); - - const v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); - const v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); - const v128_t vs89AB = wasm_i32x4_shl(vn89AB, 23); - - vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); - vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); - vn89AB = wasm_f32x4_sub(vn89AB, vmagic_bias); - - v128_t vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_hi), vx0123); - v128_t vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_hi), vx4567); - v128_t vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2_hi), vx89AB); - - vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_lo), vt0123); - vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_lo), vt4567); - vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2_lo), vt89AB); - - v128_t vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt0123), vc4); - v128_t vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt4567), vc4); - v128_t vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt89AB), vc4); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc3); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc3); - vp89AB = 
wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc3); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc2); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc2); - vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc2); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc1); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc1); - vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc1); - - vt0123 = wasm_f32x4_mul(vt0123, vs0123); - vt4567 = wasm_f32x4_mul(vt4567, vs4567); - vt89AB = wasm_f32x4_mul(vt89AB, vs89AB); - - v128_t vf0123 = wasm_f32x4_add(wasm_f32x4_mul(vt0123, vp0123), vs0123); - v128_t vf4567 = wasm_f32x4_add(wasm_f32x4_mul(vt4567, vp4567), vs4567); - v128_t vf89AB = wasm_f32x4_add(wasm_f32x4_mul(vt89AB, vp89AB), vs89AB); - - vf0123 = wasm_v128_andnot(vf0123, wasm_f32x4_lt(vx0123, vdenorm_cutoff)); - vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_lt(vx4567, vdenorm_cutoff)); - vf89AB = wasm_v128_andnot(vf89AB, wasm_f32x4_lt(vx89AB, vdenorm_cutoff)); - - wasm_v128_store(output, vf0123); - wasm_v128_store(output + 4, vf4567); - wasm_v128_store(output + 8, vf89AB); - output += 12; - - vacc0 = wasm_f32x4_add(vacc0, vf0123); - vacc0 = wasm_f32x4_add(vacc0, vf4567); - vacc0 = wasm_f32x4_add(vacc0, vf89AB); - } - - v128_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t vi = wasm_v128_load(input); - input += 4; - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vx, vlog2e), vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vx); - vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); - - v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt), vc4); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc1); - 
- vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - wasm_v128_store(output, vf); - output += 4; - - vacc = wasm_f32x4_add(vacc, vf); - } - vacc = wasm_f32x4_add(vacc, wasm_v64x2_shuffle(vacc, vacc, 1, 1)); - float vsum = wasm_f32x4_extract_lane(vacc, 0) + wasm_f32x4_extract_lane(vacc, 1); - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - - const v128_t vi = wasm_v128_load(input); - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vx, vlog2e), vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vx); - vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); - - v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt), vc4); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vf, 0); - output += 2; - - vsum += wasm_f32x4_extract_lane(vf, 0) + wasm_f32x4_extract_lane(vf, 1); - vf = wasm_v64x2_shuffle(vf, vf, 1, 1); - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vf, 0); - vsum += wasm_f32x4_extract_lane(vf, 0); - } - } - *sum = vsum; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u16.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u16.c deleted file mode 100644 index d46e729c718..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u16.c +++ /dev/null @@ -1,216 +0,0 @@ -// Auto-generated file. 
Do not edit! -// Template: src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const v128_t vlog2e = wasm_f32x4_const_splat(0x1.715476p+0f); - const v128_t vmagic_bias = wasm_f32x4_const_splat(0x1.8000FEp23f); - const v128_t vminus_ln2_hi = wasm_f32x4_const_splat(-0x1.62E400p-1f); - const v128_t vminus_ln2_lo = wasm_f32x4_const_splat(-0x1.7F7D1Cp-20f); - const v128_t vc5 = wasm_f32x4_const_splat(0x1.0F9F9Cp-7f); - const v128_t vc4 = wasm_f32x4_const_splat(0x1.573A1Ap-5f); - const v128_t vc3 = wasm_f32x4_const_splat(0x1.555A80p-3f); - const v128_t vc2 = wasm_f32x4_const_splat(0x1.FFFDC6p-2f); - const v128_t vc1 = wasm_f32x4_const_splat(0x1.FFFFF6p-1f); - const v128_t vdenorm_cutoff = wasm_f32x4_const_splat(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const v128_t vi_max = wasm_v128_load32_splat(max); - - v128_t vacc0 = wasm_f32x4_const_splat(0.0f); - for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { - // Load 16 (4x4) inputs at a time. 
- const v128_t vi0123 = wasm_v128_load(input); - const v128_t vi4567 = wasm_v128_load(input + 4); - const v128_t vi89AB = wasm_v128_load(input + 8); - const v128_t viCDEF = wasm_v128_load(input + 12); - input += 16; - - const v128_t vx0123 = wasm_f32x4_sub(vi0123, vi_max); - const v128_t vx4567 = wasm_f32x4_sub(vi4567, vi_max); - const v128_t vx89AB = wasm_f32x4_sub(vi89AB, vi_max); - const v128_t vxCDEF = wasm_f32x4_sub(viCDEF, vi_max); - - v128_t vn0123 = wasm_f32x4_add(wasm_f32x4_mul(vx0123, vlog2e), vmagic_bias); - v128_t vn4567 = wasm_f32x4_add(wasm_f32x4_mul(vx4567, vlog2e), vmagic_bias); - v128_t vn89AB = wasm_f32x4_add(wasm_f32x4_mul(vx89AB, vlog2e), vmagic_bias); - v128_t vnCDEF = wasm_f32x4_add(wasm_f32x4_mul(vxCDEF, vlog2e), vmagic_bias); - - const v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); - const v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); - const v128_t vs89AB = wasm_i32x4_shl(vn89AB, 23); - const v128_t vsCDEF = wasm_i32x4_shl(vnCDEF, 23); - - vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); - vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); - vn89AB = wasm_f32x4_sub(vn89AB, vmagic_bias); - vnCDEF = wasm_f32x4_sub(vnCDEF, vmagic_bias); - - v128_t vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_hi), vx0123); - v128_t vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_hi), vx4567); - v128_t vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2_hi), vx89AB); - v128_t vtCDEF = wasm_f32x4_add(wasm_f32x4_mul(vnCDEF, vminus_ln2_hi), vxCDEF); - - vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_lo), vt0123); - vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_lo), vt4567); - vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2_lo), vt89AB); - vtCDEF = wasm_f32x4_add(wasm_f32x4_mul(vnCDEF, vminus_ln2_lo), vtCDEF); - - v128_t vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt0123), vc4); - v128_t vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt4567), vc4); - v128_t vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt89AB), vc4); - 
v128_t vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vc5, vtCDEF), vc4); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc3); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc3); - vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc3); - vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc3); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc2); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc2); - vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc2); - vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc2); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc1); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc1); - vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc1); - vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc1); - - vt0123 = wasm_f32x4_mul(vt0123, vs0123); - vt4567 = wasm_f32x4_mul(vt4567, vs4567); - vt89AB = wasm_f32x4_mul(vt89AB, vs89AB); - vtCDEF = wasm_f32x4_mul(vtCDEF, vsCDEF); - - v128_t vf0123 = wasm_f32x4_add(wasm_f32x4_mul(vt0123, vp0123), vs0123); - v128_t vf4567 = wasm_f32x4_add(wasm_f32x4_mul(vt4567, vp4567), vs4567); - v128_t vf89AB = wasm_f32x4_add(wasm_f32x4_mul(vt89AB, vp89AB), vs89AB); - v128_t vfCDEF = wasm_f32x4_add(wasm_f32x4_mul(vtCDEF, vpCDEF), vsCDEF); - - vf0123 = wasm_v128_andnot(vf0123, wasm_f32x4_lt(vx0123, vdenorm_cutoff)); - vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_lt(vx4567, vdenorm_cutoff)); - vf89AB = wasm_v128_andnot(vf89AB, wasm_f32x4_lt(vx89AB, vdenorm_cutoff)); - vfCDEF = wasm_v128_andnot(vfCDEF, wasm_f32x4_lt(vxCDEF, vdenorm_cutoff)); - - wasm_v128_store(output, vf0123); - wasm_v128_store(output + 4, vf4567); - wasm_v128_store(output + 8, vf89AB); - wasm_v128_store(output + 12, vfCDEF); - output += 16; - - vacc0 = wasm_f32x4_add(vacc0, vf0123); - vacc0 = wasm_f32x4_add(vacc0, vf4567); - vacc0 = wasm_f32x4_add(vacc0, vf89AB); - vacc0 = wasm_f32x4_add(vacc0, vfCDEF); - } - - v128_t vacc = vacc0; - for (; batch 
>= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t vi = wasm_v128_load(input); - input += 4; - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vx, vlog2e), vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vx); - vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); - - v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt), vc4); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - wasm_v128_store(output, vf); - output += 4; - - vacc = wasm_f32x4_add(vacc, vf); - } - vacc = wasm_f32x4_add(vacc, wasm_v64x2_shuffle(vacc, vacc, 1, 1)); - float vsum = wasm_f32x4_extract_lane(vacc, 0) + wasm_f32x4_extract_lane(vacc, 1); - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - - const v128_t vi = wasm_v128_load(input); - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vx, vlog2e), vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vx); - vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); - - v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt), vc4); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - if (batch & (2 * sizeof(float))) { - 
wasm_v128_store64_lane(output, vf, 0); - output += 2; - - vsum += wasm_f32x4_extract_lane(vf, 0) + wasm_f32x4_extract_lane(vf, 1); - vf = wasm_v64x2_shuffle(vf, vf, 1, 1); - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vf, 0); - vsum += wasm_f32x4_extract_lane(vf, 0); - } - } - *sum = vsum; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20-acc2.c deleted file mode 100644 index 5a8b5ca0db9..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20-acc2.c +++ /dev/null @@ -1,235 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20_acc2( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const v128_t vlog2e = wasm_f32x4_const_splat(0x1.715476p+0f); - const v128_t vmagic_bias = wasm_f32x4_const_splat(0x1.8000FEp23f); - const v128_t vminus_ln2_hi = wasm_f32x4_const_splat(-0x1.62E400p-1f); - const v128_t vminus_ln2_lo = wasm_f32x4_const_splat(-0x1.7F7D1Cp-20f); - const v128_t vc5 = wasm_f32x4_const_splat(0x1.0F9F9Cp-7f); - const v128_t vc4 = wasm_f32x4_const_splat(0x1.573A1Ap-5f); - const v128_t vc3 = wasm_f32x4_const_splat(0x1.555A80p-3f); - const v128_t vc2 = 
wasm_f32x4_const_splat(0x1.FFFDC6p-2f); - const v128_t vc1 = wasm_f32x4_const_splat(0x1.FFFFF6p-1f); - const v128_t vdenorm_cutoff = wasm_f32x4_const_splat(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const v128_t vi_max = wasm_v128_load32_splat(max); - - v128_t vacc0 = wasm_f32x4_const_splat(0.0f); - v128_t vacc1 = vacc0; - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - // Load 20 (5x4) inputs at a time. - const v128_t vi0123 = wasm_v128_load(input); - const v128_t vi4567 = wasm_v128_load(input + 4); - const v128_t vi89AB = wasm_v128_load(input + 8); - const v128_t viCDEF = wasm_v128_load(input + 12); - const v128_t viGHIJ = wasm_v128_load(input + 16); - input += 20; - - const v128_t vx0123 = wasm_f32x4_sub(vi0123, vi_max); - const v128_t vx4567 = wasm_f32x4_sub(vi4567, vi_max); - const v128_t vx89AB = wasm_f32x4_sub(vi89AB, vi_max); - const v128_t vxCDEF = wasm_f32x4_sub(viCDEF, vi_max); - const v128_t vxGHIJ = wasm_f32x4_sub(viGHIJ, vi_max); - - v128_t vn0123 = wasm_f32x4_add(wasm_f32x4_mul(vx0123, vlog2e), vmagic_bias); - v128_t vn4567 = wasm_f32x4_add(wasm_f32x4_mul(vx4567, vlog2e), vmagic_bias); - v128_t vn89AB = wasm_f32x4_add(wasm_f32x4_mul(vx89AB, vlog2e), vmagic_bias); - v128_t vnCDEF = wasm_f32x4_add(wasm_f32x4_mul(vxCDEF, vlog2e), vmagic_bias); - v128_t vnGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vxGHIJ, vlog2e), vmagic_bias); - - const v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); - const v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); - const v128_t vs89AB = wasm_i32x4_shl(vn89AB, 23); - const v128_t vsCDEF = wasm_i32x4_shl(vnCDEF, 23); - const v128_t vsGHIJ = wasm_i32x4_shl(vnGHIJ, 23); - - vn0123 = wasm_f32x4_sub(vn0123, 
vmagic_bias); - vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); - vn89AB = wasm_f32x4_sub(vn89AB, vmagic_bias); - vnCDEF = wasm_f32x4_sub(vnCDEF, vmagic_bias); - vnGHIJ = wasm_f32x4_sub(vnGHIJ, vmagic_bias); - - v128_t vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_hi), vx0123); - v128_t vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_hi), vx4567); - v128_t vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2_hi), vx89AB); - v128_t vtCDEF = wasm_f32x4_add(wasm_f32x4_mul(vnCDEF, vminus_ln2_hi), vxCDEF); - v128_t vtGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vnGHIJ, vminus_ln2_hi), vxGHIJ); - - vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_lo), vt0123); - vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_lo), vt4567); - vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2_lo), vt89AB); - vtCDEF = wasm_f32x4_add(wasm_f32x4_mul(vnCDEF, vminus_ln2_lo), vtCDEF); - vtGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vnGHIJ, vminus_ln2_lo), vtGHIJ); - - v128_t vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt0123), vc4); - v128_t vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt4567), vc4); - v128_t vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt89AB), vc4); - v128_t vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vc5, vtCDEF), vc4); - v128_t vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vc5, vtGHIJ), vc4); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc3); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc3); - vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc3); - vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc3); - vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vpGHIJ, vtGHIJ), vc3); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc2); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc2); - vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc2); - vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc2); - vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vpGHIJ, vtGHIJ), vc2); - - vp0123 = 
wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc1); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc1); - vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc1); - vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc1); - vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vpGHIJ, vtGHIJ), vc1); - - vt0123 = wasm_f32x4_mul(vt0123, vs0123); - vt4567 = wasm_f32x4_mul(vt4567, vs4567); - vt89AB = wasm_f32x4_mul(vt89AB, vs89AB); - vtCDEF = wasm_f32x4_mul(vtCDEF, vsCDEF); - vtGHIJ = wasm_f32x4_mul(vtGHIJ, vsGHIJ); - - v128_t vf0123 = wasm_f32x4_add(wasm_f32x4_mul(vt0123, vp0123), vs0123); - v128_t vf4567 = wasm_f32x4_add(wasm_f32x4_mul(vt4567, vp4567), vs4567); - v128_t vf89AB = wasm_f32x4_add(wasm_f32x4_mul(vt89AB, vp89AB), vs89AB); - v128_t vfCDEF = wasm_f32x4_add(wasm_f32x4_mul(vtCDEF, vpCDEF), vsCDEF); - v128_t vfGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vtGHIJ, vpGHIJ), vsGHIJ); - - vf0123 = wasm_v128_andnot(vf0123, wasm_f32x4_lt(vx0123, vdenorm_cutoff)); - vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_lt(vx4567, vdenorm_cutoff)); - vf89AB = wasm_v128_andnot(vf89AB, wasm_f32x4_lt(vx89AB, vdenorm_cutoff)); - vfCDEF = wasm_v128_andnot(vfCDEF, wasm_f32x4_lt(vxCDEF, vdenorm_cutoff)); - vfGHIJ = wasm_v128_andnot(vfGHIJ, wasm_f32x4_lt(vxGHIJ, vdenorm_cutoff)); - - wasm_v128_store(output, vf0123); - wasm_v128_store(output + 4, vf4567); - wasm_v128_store(output + 8, vf89AB); - wasm_v128_store(output + 12, vfCDEF); - wasm_v128_store(output + 16, vfGHIJ); - output += 20; - - vacc0 = wasm_f32x4_add(vacc0, vf0123); - vacc0 = wasm_f32x4_add(vacc0, vf4567); - vacc0 = wasm_f32x4_add(vacc0, vf89AB); - vacc0 = wasm_f32x4_add(vacc0, vfCDEF); - vacc0 = wasm_f32x4_add(vacc0, vfGHIJ); - } - // Add up all accumulators to vacc0 - vacc0 = wasm_f32x4_add(vacc0, vacc1); - - v128_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t vi = wasm_v128_load(input); - input += 4; - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = 
wasm_f32x4_add(wasm_f32x4_mul(vx, vlog2e), vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vx); - vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); - - v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt), vc4); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - wasm_v128_store(output, vf); - output += 4; - - vacc = wasm_f32x4_add(vacc, vf); - } - vacc = wasm_f32x4_add(vacc, wasm_v64x2_shuffle(vacc, vacc, 1, 1)); - float vsum = wasm_f32x4_extract_lane(vacc, 0) + wasm_f32x4_extract_lane(vacc, 1); - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - - const v128_t vi = wasm_v128_load(input); - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vx, vlog2e), vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vx); - vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); - - v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt), vc4); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vf, 0); - output += 2; - - vsum += wasm_f32x4_extract_lane(vf, 0) + wasm_f32x4_extract_lane(vf, 1); - vf = wasm_v64x2_shuffle(vf, vf, 1, 1); - } - if (batch & (1 * 
sizeof(float))) { - wasm_v128_store32_lane(output, vf, 0); - vsum += wasm_f32x4_extract_lane(vf, 0); - } - } - *sum = vsum; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20-acc5.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20-acc5.c deleted file mode 100644 index 4063d567e56..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20-acc5.c +++ /dev/null @@ -1,241 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20_acc5( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const v128_t vlog2e = wasm_f32x4_const_splat(0x1.715476p+0f); - const v128_t vmagic_bias = wasm_f32x4_const_splat(0x1.8000FEp23f); - const v128_t vminus_ln2_hi = wasm_f32x4_const_splat(-0x1.62E400p-1f); - const v128_t vminus_ln2_lo = wasm_f32x4_const_splat(-0x1.7F7D1Cp-20f); - const v128_t vc5 = wasm_f32x4_const_splat(0x1.0F9F9Cp-7f); - const v128_t vc4 = wasm_f32x4_const_splat(0x1.573A1Ap-5f); - const v128_t vc3 = wasm_f32x4_const_splat(0x1.555A80p-3f); - const v128_t vc2 = wasm_f32x4_const_splat(0x1.FFFDC6p-2f); - const v128_t vc1 = wasm_f32x4_const_splat(0x1.FFFFF6p-1f); - const v128_t vdenorm_cutoff = wasm_f32x4_const_splat(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - 
XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const v128_t vi_max = wasm_v128_load32_splat(max); - - v128_t vacc0 = wasm_f32x4_const_splat(0.0f); - v128_t vacc1 = vacc0; - v128_t vacc2 = vacc0; - v128_t vacc3 = vacc0; - v128_t vacc4 = vacc0; - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - // Load 20 (5x4) inputs at a time. - const v128_t vi0123 = wasm_v128_load(input); - const v128_t vi4567 = wasm_v128_load(input + 4); - const v128_t vi89AB = wasm_v128_load(input + 8); - const v128_t viCDEF = wasm_v128_load(input + 12); - const v128_t viGHIJ = wasm_v128_load(input + 16); - input += 20; - - const v128_t vx0123 = wasm_f32x4_sub(vi0123, vi_max); - const v128_t vx4567 = wasm_f32x4_sub(vi4567, vi_max); - const v128_t vx89AB = wasm_f32x4_sub(vi89AB, vi_max); - const v128_t vxCDEF = wasm_f32x4_sub(viCDEF, vi_max); - const v128_t vxGHIJ = wasm_f32x4_sub(viGHIJ, vi_max); - - v128_t vn0123 = wasm_f32x4_add(wasm_f32x4_mul(vx0123, vlog2e), vmagic_bias); - v128_t vn4567 = wasm_f32x4_add(wasm_f32x4_mul(vx4567, vlog2e), vmagic_bias); - v128_t vn89AB = wasm_f32x4_add(wasm_f32x4_mul(vx89AB, vlog2e), vmagic_bias); - v128_t vnCDEF = wasm_f32x4_add(wasm_f32x4_mul(vxCDEF, vlog2e), vmagic_bias); - v128_t vnGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vxGHIJ, vlog2e), vmagic_bias); - - const v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); - const v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); - const v128_t vs89AB = wasm_i32x4_shl(vn89AB, 23); - const v128_t vsCDEF = wasm_i32x4_shl(vnCDEF, 23); - const v128_t vsGHIJ = wasm_i32x4_shl(vnGHIJ, 23); - - vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); - vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); - vn89AB = wasm_f32x4_sub(vn89AB, vmagic_bias); - vnCDEF = 
wasm_f32x4_sub(vnCDEF, vmagic_bias); - vnGHIJ = wasm_f32x4_sub(vnGHIJ, vmagic_bias); - - v128_t vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_hi), vx0123); - v128_t vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_hi), vx4567); - v128_t vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2_hi), vx89AB); - v128_t vtCDEF = wasm_f32x4_add(wasm_f32x4_mul(vnCDEF, vminus_ln2_hi), vxCDEF); - v128_t vtGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vnGHIJ, vminus_ln2_hi), vxGHIJ); - - vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_lo), vt0123); - vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_lo), vt4567); - vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2_lo), vt89AB); - vtCDEF = wasm_f32x4_add(wasm_f32x4_mul(vnCDEF, vminus_ln2_lo), vtCDEF); - vtGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vnGHIJ, vminus_ln2_lo), vtGHIJ); - - v128_t vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt0123), vc4); - v128_t vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt4567), vc4); - v128_t vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt89AB), vc4); - v128_t vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vc5, vtCDEF), vc4); - v128_t vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vc5, vtGHIJ), vc4); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc3); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc3); - vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc3); - vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc3); - vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vpGHIJ, vtGHIJ), vc3); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc2); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc2); - vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc2); - vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc2); - vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vpGHIJ, vtGHIJ), vc2); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc1); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc1); - vp89AB = 
wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc1); - vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc1); - vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vpGHIJ, vtGHIJ), vc1); - - vt0123 = wasm_f32x4_mul(vt0123, vs0123); - vt4567 = wasm_f32x4_mul(vt4567, vs4567); - vt89AB = wasm_f32x4_mul(vt89AB, vs89AB); - vtCDEF = wasm_f32x4_mul(vtCDEF, vsCDEF); - vtGHIJ = wasm_f32x4_mul(vtGHIJ, vsGHIJ); - - v128_t vf0123 = wasm_f32x4_add(wasm_f32x4_mul(vt0123, vp0123), vs0123); - v128_t vf4567 = wasm_f32x4_add(wasm_f32x4_mul(vt4567, vp4567), vs4567); - v128_t vf89AB = wasm_f32x4_add(wasm_f32x4_mul(vt89AB, vp89AB), vs89AB); - v128_t vfCDEF = wasm_f32x4_add(wasm_f32x4_mul(vtCDEF, vpCDEF), vsCDEF); - v128_t vfGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vtGHIJ, vpGHIJ), vsGHIJ); - - vf0123 = wasm_v128_andnot(vf0123, wasm_f32x4_lt(vx0123, vdenorm_cutoff)); - vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_lt(vx4567, vdenorm_cutoff)); - vf89AB = wasm_v128_andnot(vf89AB, wasm_f32x4_lt(vx89AB, vdenorm_cutoff)); - vfCDEF = wasm_v128_andnot(vfCDEF, wasm_f32x4_lt(vxCDEF, vdenorm_cutoff)); - vfGHIJ = wasm_v128_andnot(vfGHIJ, wasm_f32x4_lt(vxGHIJ, vdenorm_cutoff)); - - wasm_v128_store(output, vf0123); - wasm_v128_store(output + 4, vf4567); - wasm_v128_store(output + 8, vf89AB); - wasm_v128_store(output + 12, vfCDEF); - wasm_v128_store(output + 16, vfGHIJ); - output += 20; - - vacc0 = wasm_f32x4_add(vacc0, vf0123); - vacc4 = wasm_f32x4_add(vacc4, vf4567); - vacc3 = wasm_f32x4_add(vacc3, vf89AB); - vacc2 = wasm_f32x4_add(vacc2, vfCDEF); - vacc1 = wasm_f32x4_add(vacc1, vfGHIJ); - } - // Add up all accumulators to vacc0 - vacc0 = wasm_f32x4_add(vacc0, vacc1); - vacc2 = wasm_f32x4_add(vacc2, vacc3); - vacc0 = wasm_f32x4_add(vacc0, vacc2); - vacc0 = wasm_f32x4_add(vacc0, vacc4); - - v128_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t vi = wasm_v128_load(input); - input += 4; - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = 
wasm_f32x4_add(wasm_f32x4_mul(vx, vlog2e), vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vx); - vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); - - v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt), vc4); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - wasm_v128_store(output, vf); - output += 4; - - vacc = wasm_f32x4_add(vacc, vf); - } - vacc = wasm_f32x4_add(vacc, wasm_v64x2_shuffle(vacc, vacc, 1, 1)); - float vsum = wasm_f32x4_extract_lane(vacc, 0) + wasm_f32x4_extract_lane(vacc, 1); - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - - const v128_t vi = wasm_v128_load(input); - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vx, vlog2e), vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vx); - vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); - - v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt), vc4); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vf, 0); - output += 2; - - vsum += wasm_f32x4_extract_lane(vf, 0) + wasm_f32x4_extract_lane(vf, 1); - vf = wasm_v64x2_shuffle(vf, vf, 1, 1); - } - if (batch & (1 * 
sizeof(float))) { - wasm_v128_store32_lane(output, vf, 0); - vsum += wasm_f32x4_extract_lane(vf, 0); - } - } - *sum = vsum; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20.c deleted file mode 100644 index 4e420208740..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u20.c +++ /dev/null @@ -1,232 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const v128_t vlog2e = wasm_f32x4_const_splat(0x1.715476p+0f); - const v128_t vmagic_bias = wasm_f32x4_const_splat(0x1.8000FEp23f); - const v128_t vminus_ln2_hi = wasm_f32x4_const_splat(-0x1.62E400p-1f); - const v128_t vminus_ln2_lo = wasm_f32x4_const_splat(-0x1.7F7D1Cp-20f); - const v128_t vc5 = wasm_f32x4_const_splat(0x1.0F9F9Cp-7f); - const v128_t vc4 = wasm_f32x4_const_splat(0x1.573A1Ap-5f); - const v128_t vc3 = wasm_f32x4_const_splat(0x1.555A80p-3f); - const v128_t vc2 = wasm_f32x4_const_splat(0x1.FFFDC6p-2f); - const v128_t vc1 = wasm_f32x4_const_splat(0x1.FFFFF6p-1f); - const v128_t vdenorm_cutoff = wasm_f32x4_const_splat(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - 
XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const v128_t vi_max = wasm_v128_load32_splat(max); - - v128_t vacc0 = wasm_f32x4_const_splat(0.0f); - for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { - // Load 20 (5x4) inputs at a time. - const v128_t vi0123 = wasm_v128_load(input); - const v128_t vi4567 = wasm_v128_load(input + 4); - const v128_t vi89AB = wasm_v128_load(input + 8); - const v128_t viCDEF = wasm_v128_load(input + 12); - const v128_t viGHIJ = wasm_v128_load(input + 16); - input += 20; - - const v128_t vx0123 = wasm_f32x4_sub(vi0123, vi_max); - const v128_t vx4567 = wasm_f32x4_sub(vi4567, vi_max); - const v128_t vx89AB = wasm_f32x4_sub(vi89AB, vi_max); - const v128_t vxCDEF = wasm_f32x4_sub(viCDEF, vi_max); - const v128_t vxGHIJ = wasm_f32x4_sub(viGHIJ, vi_max); - - v128_t vn0123 = wasm_f32x4_add(wasm_f32x4_mul(vx0123, vlog2e), vmagic_bias); - v128_t vn4567 = wasm_f32x4_add(wasm_f32x4_mul(vx4567, vlog2e), vmagic_bias); - v128_t vn89AB = wasm_f32x4_add(wasm_f32x4_mul(vx89AB, vlog2e), vmagic_bias); - v128_t vnCDEF = wasm_f32x4_add(wasm_f32x4_mul(vxCDEF, vlog2e), vmagic_bias); - v128_t vnGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vxGHIJ, vlog2e), vmagic_bias); - - const v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); - const v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); - const v128_t vs89AB = wasm_i32x4_shl(vn89AB, 23); - const v128_t vsCDEF = wasm_i32x4_shl(vnCDEF, 23); - const v128_t vsGHIJ = wasm_i32x4_shl(vnGHIJ, 23); - - vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); - vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); - vn89AB = wasm_f32x4_sub(vn89AB, vmagic_bias); - vnCDEF = wasm_f32x4_sub(vnCDEF, vmagic_bias); - vnGHIJ = wasm_f32x4_sub(vnGHIJ, vmagic_bias); - - v128_t vt0123 = 
wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_hi), vx0123); - v128_t vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_hi), vx4567); - v128_t vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2_hi), vx89AB); - v128_t vtCDEF = wasm_f32x4_add(wasm_f32x4_mul(vnCDEF, vminus_ln2_hi), vxCDEF); - v128_t vtGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vnGHIJ, vminus_ln2_hi), vxGHIJ); - - vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_lo), vt0123); - vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_lo), vt4567); - vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2_lo), vt89AB); - vtCDEF = wasm_f32x4_add(wasm_f32x4_mul(vnCDEF, vminus_ln2_lo), vtCDEF); - vtGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vnGHIJ, vminus_ln2_lo), vtGHIJ); - - v128_t vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt0123), vc4); - v128_t vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt4567), vc4); - v128_t vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt89AB), vc4); - v128_t vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vc5, vtCDEF), vc4); - v128_t vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vc5, vtGHIJ), vc4); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc3); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc3); - vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc3); - vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc3); - vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vpGHIJ, vtGHIJ), vc3); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc2); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc2); - vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc2); - vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc2); - vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vpGHIJ, vtGHIJ), vc2); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc1); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc1); - vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc1); - vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), 
vc1); - vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vpGHIJ, vtGHIJ), vc1); - - vt0123 = wasm_f32x4_mul(vt0123, vs0123); - vt4567 = wasm_f32x4_mul(vt4567, vs4567); - vt89AB = wasm_f32x4_mul(vt89AB, vs89AB); - vtCDEF = wasm_f32x4_mul(vtCDEF, vsCDEF); - vtGHIJ = wasm_f32x4_mul(vtGHIJ, vsGHIJ); - - v128_t vf0123 = wasm_f32x4_add(wasm_f32x4_mul(vt0123, vp0123), vs0123); - v128_t vf4567 = wasm_f32x4_add(wasm_f32x4_mul(vt4567, vp4567), vs4567); - v128_t vf89AB = wasm_f32x4_add(wasm_f32x4_mul(vt89AB, vp89AB), vs89AB); - v128_t vfCDEF = wasm_f32x4_add(wasm_f32x4_mul(vtCDEF, vpCDEF), vsCDEF); - v128_t vfGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vtGHIJ, vpGHIJ), vsGHIJ); - - vf0123 = wasm_v128_andnot(vf0123, wasm_f32x4_lt(vx0123, vdenorm_cutoff)); - vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_lt(vx4567, vdenorm_cutoff)); - vf89AB = wasm_v128_andnot(vf89AB, wasm_f32x4_lt(vx89AB, vdenorm_cutoff)); - vfCDEF = wasm_v128_andnot(vfCDEF, wasm_f32x4_lt(vxCDEF, vdenorm_cutoff)); - vfGHIJ = wasm_v128_andnot(vfGHIJ, wasm_f32x4_lt(vxGHIJ, vdenorm_cutoff)); - - wasm_v128_store(output, vf0123); - wasm_v128_store(output + 4, vf4567); - wasm_v128_store(output + 8, vf89AB); - wasm_v128_store(output + 12, vfCDEF); - wasm_v128_store(output + 16, vfGHIJ); - output += 20; - - vacc0 = wasm_f32x4_add(vacc0, vf0123); - vacc0 = wasm_f32x4_add(vacc0, vf4567); - vacc0 = wasm_f32x4_add(vacc0, vf89AB); - vacc0 = wasm_f32x4_add(vacc0, vfCDEF); - vacc0 = wasm_f32x4_add(vacc0, vfGHIJ); - } - - v128_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t vi = wasm_v128_load(input); - input += 4; - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vx, vlog2e), vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vx); - vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); - - v128_t vp = 
wasm_f32x4_add(wasm_f32x4_mul(vc5, vt), vc4); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - wasm_v128_store(output, vf); - output += 4; - - vacc = wasm_f32x4_add(vacc, vf); - } - vacc = wasm_f32x4_add(vacc, wasm_v64x2_shuffle(vacc, vacc, 1, 1)); - float vsum = wasm_f32x4_extract_lane(vacc, 0) + wasm_f32x4_extract_lane(vacc, 1); - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - - const v128_t vi = wasm_v128_load(input); - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vx, vlog2e), vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vx); - vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); - - v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt), vc4); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vf, 0); - output += 2; - - vsum += wasm_f32x4_extract_lane(vf, 0) + wasm_f32x4_extract_lane(vf, 1); - vf = wasm_v64x2_shuffle(vf, vf, 1, 1); - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vf, 0); - vsum += wasm_f32x4_extract_lane(vf, 0); - } - } - *sum = vsum; -} diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u8.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u8.c 
deleted file mode 100644 index 84b97320d72..00000000000 --- a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u8.c +++ /dev/null @@ -1,184 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/raddstoreexpminusmax.h" - - -void xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8( - size_t batch, - const float* input, - const float* max, - float* output, - float* sum, - const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(batch != 0); - assert(batch % sizeof(float) == 0); - assert(input != NULL); - assert(max != NULL); - assert(output != NULL); - assert(sum != NULL); - - const v128_t vlog2e = wasm_f32x4_const_splat(0x1.715476p+0f); - const v128_t vmagic_bias = wasm_f32x4_const_splat(0x1.8000FEp23f); - const v128_t vminus_ln2_hi = wasm_f32x4_const_splat(-0x1.62E400p-1f); - const v128_t vminus_ln2_lo = wasm_f32x4_const_splat(-0x1.7F7D1Cp-20f); - const v128_t vc5 = wasm_f32x4_const_splat(0x1.0F9F9Cp-7f); - const v128_t vc4 = wasm_f32x4_const_splat(0x1.573A1Ap-5f); - const v128_t vc3 = wasm_f32x4_const_splat(0x1.555A80p-3f); - const v128_t vc2 = wasm_f32x4_const_splat(0x1.FFFDC6p-2f); - const v128_t vc1 = wasm_f32x4_const_splat(0x1.FFFFF6p-1f); - const v128_t vdenorm_cutoff = wasm_f32x4_const_splat(-0x1.5D589Ep6f); - - XNN_FORCE_REALIZATION(vlog2e); - XNN_FORCE_REALIZATION(vmagic_bias); - XNN_FORCE_REALIZATION(vminus_ln2_hi); - XNN_FORCE_REALIZATION(vminus_ln2_lo); - XNN_FORCE_REALIZATION(vc5); - XNN_FORCE_REALIZATION(vc4); - XNN_FORCE_REALIZATION(vc3); - XNN_FORCE_REALIZATION(vc2); - XNN_FORCE_REALIZATION(vc1); - XNN_FORCE_REALIZATION(vdenorm_cutoff); - - const v128_t 
vi_max = wasm_v128_load32_splat(max); - - v128_t vacc0 = wasm_f32x4_const_splat(0.0f); - for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { - // Load 8 (2x4) inputs at a time. - const v128_t vi0123 = wasm_v128_load(input); - const v128_t vi4567 = wasm_v128_load(input + 4); - input += 8; - - const v128_t vx0123 = wasm_f32x4_sub(vi0123, vi_max); - const v128_t vx4567 = wasm_f32x4_sub(vi4567, vi_max); - - v128_t vn0123 = wasm_f32x4_add(wasm_f32x4_mul(vx0123, vlog2e), vmagic_bias); - v128_t vn4567 = wasm_f32x4_add(wasm_f32x4_mul(vx4567, vlog2e), vmagic_bias); - - const v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); - const v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); - - vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); - vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); - - v128_t vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_hi), vx0123); - v128_t vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_hi), vx4567); - - vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_lo), vt0123); - vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_lo), vt4567); - - v128_t vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt0123), vc4); - v128_t vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt4567), vc4); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc3); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc3); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc2); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc2); - - vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc1); - vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc1); - - vt0123 = wasm_f32x4_mul(vt0123, vs0123); - vt4567 = wasm_f32x4_mul(vt4567, vs4567); - - v128_t vf0123 = wasm_f32x4_add(wasm_f32x4_mul(vt0123, vp0123), vs0123); - v128_t vf4567 = wasm_f32x4_add(wasm_f32x4_mul(vt4567, vp4567), vs4567); - - vf0123 = wasm_v128_andnot(vf0123, wasm_f32x4_lt(vx0123, vdenorm_cutoff)); - vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_lt(vx4567, 
vdenorm_cutoff)); - - wasm_v128_store(output, vf0123); - wasm_v128_store(output + 4, vf4567); - output += 8; - - vacc0 = wasm_f32x4_add(vacc0, vf0123); - vacc0 = wasm_f32x4_add(vacc0, vf4567); - } - - v128_t vacc = vacc0; - for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { - const v128_t vi = wasm_v128_load(input); - input += 4; - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vx, vlog2e), vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vx); - vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); - - v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt), vc4); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - wasm_v128_store(output, vf); - output += 4; - - vacc = wasm_f32x4_add(vacc, vf); - } - vacc = wasm_f32x4_add(vacc, wasm_v64x2_shuffle(vacc, vacc, 1, 1)); - float vsum = wasm_f32x4_extract_lane(vacc, 0) + wasm_f32x4_extract_lane(vacc, 1); - if (batch != 0) { - assert(batch >= 1 * sizeof(float)); - assert(batch <= 3 * sizeof(float)); - - const v128_t vi = wasm_v128_load(input); - - const v128_t vx = wasm_f32x4_sub(vi, vi_max); - - v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vx, vlog2e), vmagic_bias); - - const v128_t vs = wasm_i32x4_shl(vn, 23); - - vn = wasm_f32x4_sub(vn, vmagic_bias); - - v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vx); - vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); - - v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt), vc4); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); - vp = wasm_f32x4_add(wasm_f32x4_mul(vp, 
vt), vc1); - - vt = wasm_f32x4_mul(vt, vs); - v128_t vf = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vs); - - vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); - - if (batch & (2 * sizeof(float))) { - wasm_v128_store64_lane(output, vf, 0); - output += 2; - - vsum += wasm_f32x4_extract_lane(vf, 0) + wasm_f32x4_extract_lane(vf, 1); - vf = wasm_v64x2_shuffle(vf, vf, 1, 1); - } - if (batch & (1 * sizeof(float))) { - wasm_v128_store32_lane(output, vf, 0); - vsum += wasm_f32x4_extract_lane(vf, 0); - } - } - *sum = vsum; -} diff --git a/src/xnnpack/raddstoreexpminusmax.h b/src/xnnpack/raddstoreexpminusmax.h index ca1f5c90a75..9b49f97fa1d 100644 --- a/src/xnnpack/raddstoreexpminusmax.h +++ b/src/xnnpack/raddstoreexpminusmax.h @@ -81,182 +81,70 @@ DECLARE_F16_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f16_raddstoreexpminusmax_u const struct xnn_f32_expminus_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u8) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u8_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12_acc3) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc2) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20) 
-DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20_acc5) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u8) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u8_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12_acc3) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc2) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20_acc5) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u8) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u8_acc2) 
-DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12_acc3) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc2) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20_acc5) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u8) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u8_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12_acc3) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc2) 
DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20_acc5) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u2v) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u4v) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc3) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc2) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc5) 
-DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc3) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc5) - -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u8) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u16_acc2) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc2) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64) 
-DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u72) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u72_acc3) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80_acc5) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc3) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc6) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u8) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u16_acc2) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc2) 
-DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72_acc3) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc5) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc3) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc6) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u16) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u32_acc2) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128_acc2) 
-DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128_acc4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u144) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u144_acc3) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160_acc5) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc3) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc6) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12_acc3) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16) 
DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc2) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20_acc5) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u8) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u8_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12_acc3) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc2) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc4) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20_acc5) 
DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u32) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u64) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u64_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96_acc3) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128_acc3) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128_acc4) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u1) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u2) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u2_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u4) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u4_acc2) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u4_acc4) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u1) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u2) 
DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u2_acc2) -DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u4) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u4_acc2) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u4_acc4) diff --git a/test/f32-raddstoreexpminusmax.cc b/test/f32-raddstoreexpminusmax.cc index 94cc2261f1c..305fa348018 100644 --- a/test/f32-raddstoreexpminusmax.cc +++ b/test/f32-raddstoreexpminusmax.cc @@ -54,5503 +54,1608 @@ #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U8, elements_eq_8) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U8_ACC2, elements_eq_8) { TEST_REQUIRES_ARM_NEON; RAddStoreExpMinusMaxMicrokernelTester() .elements(8) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u8, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u8_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U8, elements_div_8) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U8_ACC2, elements_div_8) { TEST_REQUIRES_ARM_NEON; for (size_t elements = 16; elements < 80; elements += 8) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u8, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u8_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U8, elements_lt_8) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U8_ACC2, elements_lt_8) { TEST_REQUIRES_ARM_NEON; for (size_t elements = 1; elements < 8; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u8, nullptr); + 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u8_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U8, elements_gt_8) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U8_ACC2, elements_gt_8) { TEST_REQUIRES_ARM_NEON; for (size_t elements = 9; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u8, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u8_acc2, nullptr); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U8_ACC2, elements_eq_8) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U16_ACC2, elements_eq_16) { TEST_REQUIRES_ARM_NEON; RAddStoreExpMinusMaxMicrokernelTester() - .elements(8) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u8_acc2, nullptr); + .elements(16) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U8_ACC2, elements_div_8) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U16_ACC2, elements_div_16) { TEST_REQUIRES_ARM_NEON; - for (size_t elements = 16; elements < 80; elements += 8) { + for (size_t elements = 32; elements < 160; elements += 16) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u8_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U8_ACC2, elements_lt_8) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U16_ACC2, elements_lt_16) { TEST_REQUIRES_ARM_NEON; - for (size_t elements = 1; elements < 8; elements++) { + for (size_t elements = 1; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u8_acc2, 
nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U8_ACC2, elements_gt_8) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U16_ACC2, elements_gt_16) { TEST_REQUIRES_ARM_NEON; - for (size_t elements = 9; elements < 16; elements++) { + for (size_t elements = 17; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u8_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc2, nullptr); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U12, elements_eq_12) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U16_ACC4, elements_eq_16) { TEST_REQUIRES_ARM_NEON; RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12, nullptr); + .elements(16) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc4, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U12, elements_div_12) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U16_ACC4, elements_div_16) { TEST_REQUIRES_ARM_NEON; - for (size_t elements = 24; elements < 120; elements += 12) { + for (size_t elements = 32; elements < 160; elements += 16) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U12, elements_lt_12) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U16_ACC4, elements_lt_16) { TEST_REQUIRES_ARM_NEON; - for (size_t elements = 1; elements < 12; elements++) { + for (size_t elements = 1; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U12, elements_gt_12) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U16_ACC4, elements_gt_16) { TEST_REQUIRES_ARM_NEON; - for (size_t elements = 13; elements < 24; elements++) { + for (size_t elements = 17; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc4, nullptr); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U12_ACC2, elements_eq_12) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U4, elements_eq_4) { TEST_REQUIRES_ARM_NEON; RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12_acc2, nullptr); + .elements(4) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u4, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U12_ACC2, elements_div_12) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U4, elements_div_4) { TEST_REQUIRES_ARM_NEON; - for (size_t elements = 24; elements < 120; elements += 12) { + for (size_t elements = 8; elements < 40; elements += 4) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U12_ACC2, elements_lt_12) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U4, elements_lt_4) { TEST_REQUIRES_ARM_NEON; - for (size_t elements = 1; elements < 12; elements++) { + for (size_t elements = 1; elements < 4; elements++) { RAddStoreExpMinusMaxMicrokernelTester() 
.elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U12_ACC2, elements_gt_12) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U4, elements_gt_4) { TEST_REQUIRES_ARM_NEON; - for (size_t elements = 13; elements < 24; elements++) { + for (size_t elements = 5; elements < 8; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u4, nullptr); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U12_ACC3, elements_eq_12) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U8_ACC2, elements_eq_8) { TEST_REQUIRES_ARM_NEON; RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12_acc3, nullptr); + .elements(8) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u8_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U12_ACC3, elements_div_12) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U8_ACC2, elements_div_8) { TEST_REQUIRES_ARM_NEON; - for (size_t elements = 24; elements < 120; elements += 12) { + for (size_t elements = 16; elements < 80; elements += 8) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12_acc3, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u8_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U12_ACC3, elements_lt_12) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U8_ACC2, elements_lt_8) { TEST_REQUIRES_ARM_NEON; - for (size_t elements = 1; elements < 12; elements++) { + for (size_t elements = 1; elements < 8; elements++) { 
RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12_acc3, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u8_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U12_ACC3, elements_gt_12) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U8_ACC2, elements_gt_8) { TEST_REQUIRES_ARM_NEON; - for (size_t elements = 13; elements < 24; elements++) { + for (size_t elements = 9; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12_acc3, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u8_acc2, nullptr); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U16, elements_eq_16) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U16_ACC2, elements_eq_16) { TEST_REQUIRES_ARM_NEON; RAddStoreExpMinusMaxMicrokernelTester() .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U16, elements_div_16) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U16_ACC2, elements_div_16) { TEST_REQUIRES_ARM_NEON; for (size_t elements = 32; elements < 160; elements += 16) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U16, elements_lt_16) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U16_ACC2, elements_lt_16) { TEST_REQUIRES_ARM_NEON; for (size_t elements = 1; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U16, elements_gt_16) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U16_ACC2, elements_gt_16) { TEST_REQUIRES_ARM_NEON; for (size_t elements = 17; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc2, nullptr); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U16_ACC2, elements_eq_16) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U16_ACC4, elements_eq_16) { TEST_REQUIRES_ARM_NEON; RAddStoreExpMinusMaxMicrokernelTester() .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc4, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U16_ACC2, elements_div_16) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U16_ACC4, elements_div_16) { TEST_REQUIRES_ARM_NEON; for (size_t elements = 32; elements < 160; elements += 16) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U16_ACC2, elements_lt_16) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U16_ACC4, elements_lt_16) { TEST_REQUIRES_ARM_NEON; for (size_t elements = 1; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc4, nullptr); } } 
- TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U16_ACC2, elements_gt_16) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U16_ACC4, elements_gt_16) { TEST_REQUIRES_ARM_NEON; for (size_t elements = 17; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc4, nullptr); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U16_ACC4, elements_eq_16) { - TEST_REQUIRES_ARM_NEON; + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U4, elements_eq_4) { + TEST_REQUIRES_ARM_NEON_FMA; RAddStoreExpMinusMaxMicrokernelTester() - .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc4, nullptr); + .elements(4) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u4, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U16_ACC4, elements_div_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 32; elements < 160; elements += 16) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U4, elements_div_4) { + TEST_REQUIRES_ARM_NEON_FMA; + for (size_t elements = 8; elements < 40; elements += 4) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc4, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U16_ACC4, elements_lt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 1; elements < 16; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U4, elements_lt_4) { + TEST_REQUIRES_ARM_NEON_FMA; + for (size_t elements = 1; elements < 4; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc4, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U16_ACC4, elements_gt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 17; elements < 32; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U4, elements_gt_4) { + TEST_REQUIRES_ARM_NEON_FMA; + for (size_t elements = 5; elements < 8; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc4, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u4, nullptr); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U20, elements_eq_20) { - TEST_REQUIRES_ARM_NEON; + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U8_ACC2, elements_eq_8) { + TEST_REQUIRES_ARM_NEON_FMA; RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20, nullptr); + .elements(8) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u8_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U20, elements_div_20) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 40; elements < 200; elements += 20) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U8_ACC2, elements_div_8) { + TEST_REQUIRES_ARM_NEON_FMA; + for (size_t elements = 16; elements < 80; elements += 8) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u8_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U20, elements_lt_20) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 1; elements < 20; elements++) { + 
TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U8_ACC2, elements_lt_8) { + TEST_REQUIRES_ARM_NEON_FMA; + for (size_t elements = 1; elements < 8; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u8_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U20, elements_gt_20) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 21; elements < 40; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U8_ACC2, elements_gt_8) { + TEST_REQUIRES_ARM_NEON_FMA; + for (size_t elements = 9; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u8_acc2, nullptr); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U20_ACC2, elements_eq_20) { - TEST_REQUIRES_ARM_NEON; + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U16_ACC2, elements_eq_16) { + TEST_REQUIRES_ARM_NEON_FMA; RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20_acc2, nullptr); + .elements(16) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U20_ACC2, elements_div_20) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 40; elements < 200; elements += 20) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U16_ACC2, elements_div_16) { + TEST_REQUIRES_ARM_NEON_FMA; + for (size_t elements = 32; elements < 160; elements += 16) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20_acc2, nullptr); + 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U20_ACC2, elements_lt_20) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 1; elements < 20; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U16_ACC2, elements_lt_16) { + TEST_REQUIRES_ARM_NEON_FMA; + for (size_t elements = 1; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U20_ACC2, elements_gt_20) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 21; elements < 40; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U16_ACC2, elements_gt_16) { + TEST_REQUIRES_ARM_NEON_FMA; + for (size_t elements = 17; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc2, nullptr); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U20_ACC5, elements_eq_20) { - TEST_REQUIRES_ARM_NEON; + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U16_ACC4, elements_eq_16) { + TEST_REQUIRES_ARM_NEON_FMA; RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20_acc5, nullptr); + .elements(16) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc4, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U20_ACC5, elements_div_20) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 40; elements < 200; elements += 20) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U16_ACC4, 
elements_div_16) { + TEST_REQUIRES_ARM_NEON_FMA; + for (size_t elements = 32; elements < 160; elements += 16) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20_acc5, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U20_ACC5, elements_lt_20) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 1; elements < 20; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U16_ACC4, elements_lt_16) { + TEST_REQUIRES_ARM_NEON_FMA; + for (size_t elements = 1; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20_acc5, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_LUT64_P2_U20_ACC5, elements_gt_20) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 21; elements < 40; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U16_ACC4, elements_gt_16) { + TEST_REQUIRES_ARM_NEON_FMA; + for (size_t elements = 17; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20_acc5, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc4, nullptr); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U4, elements_eq_4) { - TEST_REQUIRES_ARM_NEON; + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U4, elements_eq_4) { + TEST_REQUIRES_ARM_NEON_FMA; RAddStoreExpMinusMaxMicrokernelTester() .elements(4) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u4, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u4, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U4, 
elements_div_4) { - TEST_REQUIRES_ARM_NEON; + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U4, elements_div_4) { + TEST_REQUIRES_ARM_NEON_FMA; for (size_t elements = 8; elements < 40; elements += 4) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u4, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U4, elements_lt_4) { - TEST_REQUIRES_ARM_NEON; + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U4, elements_lt_4) { + TEST_REQUIRES_ARM_NEON_FMA; for (size_t elements = 1; elements < 4; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u4, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U4, elements_gt_4) { - TEST_REQUIRES_ARM_NEON; + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U4, elements_gt_4) { + TEST_REQUIRES_ARM_NEON_FMA; for (size_t elements = 5; elements < 8; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u4, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u4, nullptr); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U8, elements_eq_8) { - TEST_REQUIRES_ARM_NEON; + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U8_ACC2, elements_eq_8) { + TEST_REQUIRES_ARM_NEON_FMA; RAddStoreExpMinusMaxMicrokernelTester() .elements(8) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u8, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u8_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U8, elements_div_8) { - TEST_REQUIRES_ARM_NEON; + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U8_ACC2, elements_div_8) { + 
TEST_REQUIRES_ARM_NEON_FMA; for (size_t elements = 16; elements < 80; elements += 8) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u8, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u8_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U8, elements_lt_8) { - TEST_REQUIRES_ARM_NEON; + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U8_ACC2, elements_lt_8) { + TEST_REQUIRES_ARM_NEON_FMA; for (size_t elements = 1; elements < 8; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u8, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u8_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U8, elements_gt_8) { - TEST_REQUIRES_ARM_NEON; + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U8_ACC2, elements_gt_8) { + TEST_REQUIRES_ARM_NEON_FMA; for (size_t elements = 9; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u8, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u8_acc2, nullptr); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U8_ACC2, elements_eq_8) { - TEST_REQUIRES_ARM_NEON; + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U16_ACC2, elements_eq_16) { + TEST_REQUIRES_ARM_NEON_FMA; RAddStoreExpMinusMaxMicrokernelTester() - .elements(8) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u8_acc2, nullptr); + .elements(16) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U8_ACC2, elements_div_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 16; elements < 80; elements += 8) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U16_ACC2, elements_div_16) { + 
TEST_REQUIRES_ARM_NEON_FMA; + for (size_t elements = 32; elements < 160; elements += 16) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u8_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U8_ACC2, elements_lt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 1; elements < 8; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U16_ACC2, elements_lt_16) { + TEST_REQUIRES_ARM_NEON_FMA; + for (size_t elements = 1; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u8_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U8_ACC2, elements_gt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 9; elements < 16; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U16_ACC2, elements_gt_16) { + TEST_REQUIRES_ARM_NEON_FMA; + for (size_t elements = 17; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u8_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc2, nullptr); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U12, elements_eq_12) { - TEST_REQUIRES_ARM_NEON; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U12, elements_div_12) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 24; elements < 120; elements += 12) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12, nullptr); - } - 
} - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U12, elements_lt_12) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 1; elements < 12; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U12, elements_gt_12) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 13; elements < 24; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U12_ACC2, elements_eq_12) { - TEST_REQUIRES_ARM_NEON; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U12_ACC2, elements_div_12) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 24; elements < 120; elements += 12) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U12_ACC2, elements_lt_12) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 1; elements < 12; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U12_ACC2, elements_gt_12) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 13; elements < 24; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12_acc2, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U12_ACC3, 
elements_eq_12) { - TEST_REQUIRES_ARM_NEON; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12_acc3, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U12_ACC3, elements_div_12) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 24; elements < 120; elements += 12) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12_acc3, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U12_ACC3, elements_lt_12) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 1; elements < 12; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12_acc3, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U12_ACC3, elements_gt_12) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 13; elements < 24; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12_acc3, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U16, elements_eq_16) { - TEST_REQUIRES_ARM_NEON; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U16, elements_div_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 32; elements < 160; elements += 16) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U16, elements_lt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 1; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U16, elements_gt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 17; elements < 32; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U16_ACC2, elements_eq_16) { - TEST_REQUIRES_ARM_NEON; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U16_ACC2, elements_div_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 32; elements < 160; elements += 16) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U16_ACC2, elements_lt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 1; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U16_ACC2, elements_gt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 17; elements < 32; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc2, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U16_ACC4, elements_eq_16) { - TEST_REQUIRES_ARM_NEON; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc4, nullptr); - } - - 
TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U16_ACC4, elements_div_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 32; elements < 160; elements += 16) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U16_ACC4, elements_lt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 1; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U16_ACC4, elements_gt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 17; elements < 32; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc4, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U20, elements_eq_20) { - TEST_REQUIRES_ARM_NEON; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U20, elements_div_20) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 40; elements < 200; elements += 20) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U20, elements_lt_20) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 1; elements < 20; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U20, elements_gt_20) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 21; elements < 40; elements++) { - 
RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U20_ACC2, elements_eq_20) { - TEST_REQUIRES_ARM_NEON; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U20_ACC2, elements_div_20) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 40; elements < 200; elements += 20) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U20_ACC2, elements_lt_20) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 1; elements < 20; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U20_ACC2, elements_gt_20) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 21; elements < 40; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20_acc2, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U20_ACC5, elements_eq_20) { - TEST_REQUIRES_ARM_NEON; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20_acc5, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U20_ACC5, elements_div_20) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 40; elements < 200; elements += 20) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20_acc5, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U20_ACC5, elements_lt_20) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 1; elements < 20; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20_acc5, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEON_RR2_P5_U20_ACC5, elements_gt_20) { - TEST_REQUIRES_ARM_NEON; - for (size_t elements = 21; elements < 40; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20_acc5, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U4, elements_eq_4) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(4) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u4, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U4, elements_div_4) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 8; elements < 40; elements += 4) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U4, elements_lt_4) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 4; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U4, elements_gt_4) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 5; elements < 8; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u4, nullptr); - } - } -#endif // 
XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U8, elements_eq_8) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(8) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u8, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U8, elements_div_8) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 16; elements < 80; elements += 8) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u8, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U8, elements_lt_8) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 8; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u8, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U8, elements_gt_8) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 9; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u8, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U8_ACC2, elements_eq_8) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(8) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u8_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U8_ACC2, elements_div_8) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 16; elements < 80; elements += 8) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u8_acc2, nullptr); - } - } - - 
TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U8_ACC2, elements_lt_8) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 8; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u8_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U8_ACC2, elements_gt_8) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 9; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u8_acc2, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U12, elements_eq_12) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U12, elements_div_12) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 24; elements < 120; elements += 12) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U12, elements_lt_12) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 12; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U12, elements_gt_12) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 13; elements < 24; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U12_ACC2, elements_eq_12) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U12_ACC2, elements_div_12) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 24; elements < 120; elements += 12) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U12_ACC2, elements_lt_12) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 12; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U12_ACC2, elements_gt_12) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 13; elements < 24; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12_acc2, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U12_ACC3, elements_eq_12) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12_acc3, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U12_ACC3, elements_div_12) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 24; elements < 120; elements += 12) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12_acc3, nullptr); - } - } - - 
TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U12_ACC3, elements_lt_12) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 12; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12_acc3, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U12_ACC3, elements_gt_12) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 13; elements < 24; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12_acc3, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U16, elements_eq_16) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U16, elements_div_16) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 32; elements < 160; elements += 16) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U16, elements_lt_16) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U16, elements_gt_16) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 17; elements < 32; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 
- - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U16_ACC2, elements_eq_16) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U16_ACC2, elements_div_16) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 32; elements < 160; elements += 16) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U16_ACC2, elements_lt_16) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U16_ACC2, elements_gt_16) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 17; elements < 32; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc2, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U16_ACC4, elements_eq_16) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc4, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U16_ACC4, elements_div_16) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 32; elements < 160; elements += 16) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc4, nullptr); - } - } - - 
TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U16_ACC4, elements_lt_16) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U16_ACC4, elements_gt_16) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 17; elements < 32; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc4, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U20, elements_eq_20) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U20, elements_div_20) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 40; elements < 200; elements += 20) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U20, elements_lt_20) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 20; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U20, elements_gt_20) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 21; elements < 40; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 
- - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U20_ACC2, elements_eq_20) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U20_ACC2, elements_div_20) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 40; elements < 200; elements += 20) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U20_ACC2, elements_lt_20) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 20; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U20_ACC2, elements_gt_20) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 21; elements < 40; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20_acc2, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U20_ACC5, elements_eq_20) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20_acc5, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U20_ACC5, elements_div_20) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 40; elements < 200; elements += 20) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20_acc5, nullptr); - } - } - - 
TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U20_ACC5, elements_lt_20) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 20; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20_acc5, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_LUT64_P2_U20_ACC5, elements_gt_20) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 21; elements < 40; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20_acc5, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U4, elements_eq_4) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(4) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u4, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U4, elements_div_4) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 8; elements < 40; elements += 4) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U4, elements_lt_4) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 4; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U4, elements_gt_4) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 5; elements < 8; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u4, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - 
TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U8, elements_eq_8) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(8) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u8, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U8, elements_div_8) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 16; elements < 80; elements += 8) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u8, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U8, elements_lt_8) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 8; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u8, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U8, elements_gt_8) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 9; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u8, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U8_ACC2, elements_eq_8) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(8) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u8_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U8_ACC2, elements_div_8) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 16; elements < 80; elements += 8) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u8_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U8_ACC2, elements_lt_8) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 8; elements++) { - 
RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u8_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U8_ACC2, elements_gt_8) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 9; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u8_acc2, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U12, elements_eq_12) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U12, elements_div_12) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 24; elements < 120; elements += 12) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U12, elements_lt_12) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 12; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U12, elements_gt_12) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 13; elements < 24; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U12_ACC2, elements_eq_12) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U12_ACC2, elements_div_12) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 24; elements < 120; elements += 12) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U12_ACC2, elements_lt_12) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 12; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U12_ACC2, elements_gt_12) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 13; elements < 24; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12_acc2, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U12_ACC3, elements_eq_12) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12_acc3, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U12_ACC3, elements_div_12) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 24; elements < 120; elements += 12) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12_acc3, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U12_ACC3, elements_lt_12) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 12; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12_acc3, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U12_ACC3, elements_gt_12) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 13; elements < 24; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12_acc3, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U16, elements_eq_16) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U16, elements_div_16) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 32; elements < 160; elements += 16) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U16, elements_lt_16) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U16, elements_gt_16) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 17; elements < 32; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U16_ACC2, elements_eq_16) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc2, nullptr); - } - - 
TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U16_ACC2, elements_div_16) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 32; elements < 160; elements += 16) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U16_ACC2, elements_lt_16) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U16_ACC2, elements_gt_16) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 17; elements < 32; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc2, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U16_ACC4, elements_eq_16) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc4, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U16_ACC4, elements_div_16) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 32; elements < 160; elements += 16) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U16_ACC4, elements_lt_16) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U16_ACC4, 
elements_gt_16) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 17; elements < 32; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc4, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U20, elements_eq_20) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U20, elements_div_20) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 40; elements < 200; elements += 20) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U20, elements_lt_20) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 20; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U20, elements_gt_20) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 21; elements < 40; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U20_ACC2, elements_eq_20) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U20_ACC2, elements_div_20) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 40; elements < 200; 
elements += 20) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U20_ACC2, elements_lt_20) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 20; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U20_ACC2, elements_gt_20) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 21; elements < 40; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20_acc2, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U20_ACC5, elements_eq_20) { - TEST_REQUIRES_ARM_NEON_FMA; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20_acc5, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U20_ACC5, elements_div_20) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 40; elements < 200; elements += 20) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20_acc5, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U20_ACC5, elements_lt_20) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 1; elements < 20; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20_acc5, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U20_ACC5, elements_gt_20) { - TEST_REQUIRES_ARM_NEON_FMA; - for (size_t elements = 21; elements < 40; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - 
.elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20_acc5, nullptr); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV - TEST(F32_RADDSTOREEXPMINUSMAX__RVV_RR2_P6_U2V, elements_eq_2v) { - TEST_REQUIRES_RISCV_VECTOR; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(2 * xnn_init_hardware_config()->vlenb / sizeof(float)) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u2v, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__RVV_RR2_P6_U2V, elements_div_2v) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t elements = 4 * xnn_init_hardware_config()->vlenb / sizeof(float); - elements < 20 * xnn_init_hardware_config()->vlenb / sizeof(float); - elements += 2 * xnn_init_hardware_config()->vlenb / sizeof(float)) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u2v, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__RVV_RR2_P6_U2V, elements_lt_2v) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t elements = 1; - elements < 2 * xnn_init_hardware_config()->vlenb / sizeof(float); - elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u2v, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__RVV_RR2_P6_U2V, elements_gt_2v) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t elements = 2 * xnn_init_hardware_config()->vlenb / sizeof(float) + 1; - elements < 4 * xnn_init_hardware_config()->vlenb / sizeof(float); - elements += 4) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u2v, nullptr); - } - } -#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV - - -#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV - TEST(F32_RADDSTOREEXPMINUSMAX__RVV_RR2_P6_U4V, elements_eq_4v) { - TEST_REQUIRES_RISCV_VECTOR; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(4 * 
xnn_init_hardware_config()->vlenb / sizeof(float)) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u4v, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__RVV_RR2_P6_U4V, elements_div_4v) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t elements = 8 * xnn_init_hardware_config()->vlenb / sizeof(float); - elements < 40 * xnn_init_hardware_config()->vlenb / sizeof(float); - elements += 4 * xnn_init_hardware_config()->vlenb / sizeof(float)) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u4v, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__RVV_RR2_P6_U4V, elements_lt_4v) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t elements = 1; - elements < 4 * xnn_init_hardware_config()->vlenb / sizeof(float); - elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u4v, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__RVV_RR2_P6_U4V, elements_gt_4v) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t elements = 4 * xnn_init_hardware_config()->vlenb / sizeof(float) + 1; - elements < 8 * xnn_init_hardware_config()->vlenb / sizeof(float); - elements += 8) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u4v, nullptr); - } - } -#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U4, elements_eq_4) { - TEST_REQUIRES_X86_SSE2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(4) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u4, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U4, elements_div_4) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 8; elements < 40; elements += 4) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u4, nullptr); - } - } - - 
TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U4, elements_lt_4) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 1; elements < 4; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U4, elements_gt_4) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 5; elements < 8; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u4, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U8, elements_eq_8) { - TEST_REQUIRES_X86_SSE2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(8) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U8, elements_div_8) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 16; elements < 80; elements += 8) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U8, elements_lt_8) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 1; elements < 8; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U8, elements_gt_8) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 9; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U8_ACC2, elements_eq_8) { - TEST_REQUIRES_X86_SSE2; - 
RAddStoreExpMinusMaxMicrokernelTester() - .elements(8) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U8_ACC2, elements_div_8) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 16; elements < 80; elements += 8) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U8_ACC2, elements_lt_8) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 1; elements < 8; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U8_ACC2, elements_gt_8) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 9; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8_acc2, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U12, elements_eq_12) { - TEST_REQUIRES_X86_SSE2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U12, elements_div_12) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 24; elements < 120; elements += 12) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U12, elements_lt_12) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 1; elements < 12; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12, nullptr); - } - } - - 
TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U12, elements_gt_12) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 13; elements < 24; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U12_ACC2, elements_eq_12) { - TEST_REQUIRES_X86_SSE2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U12_ACC2, elements_div_12) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 24; elements < 120; elements += 12) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U12_ACC2, elements_lt_12) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 1; elements < 12; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U12_ACC2, elements_gt_12) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 13; elements < 24; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc2, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U12_ACC3, elements_eq_12) { - TEST_REQUIRES_X86_SSE2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc3, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U12_ACC3, elements_div_12) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 
24; elements < 120; elements += 12) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc3, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U12_ACC3, elements_lt_12) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 1; elements < 12; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc3, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U12_ACC3, elements_gt_12) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 13; elements < 24; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc3, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U16, elements_eq_16) { - TEST_REQUIRES_X86_SSE2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U16, elements_div_16) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 32; elements < 160; elements += 16) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U16, elements_lt_16) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 1; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U16, elements_gt_16) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 17; elements < 32; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U16_ACC2, elements_eq_16) { - TEST_REQUIRES_X86_SSE2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U16_ACC2, elements_div_16) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 32; elements < 160; elements += 16) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U16_ACC2, elements_lt_16) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 1; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U16_ACC2, elements_gt_16) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 17; elements < 32; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc2, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U16_ACC4, elements_eq_16) { - TEST_REQUIRES_X86_SSE2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc4, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U16_ACC4, elements_div_16) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 32; elements < 160; elements += 16) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc4, nullptr); - } - } - - 
TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U16_ACC4, elements_lt_16) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 1; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U16_ACC4, elements_gt_16) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 17; elements < 32; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc4, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U20, elements_eq_20) { - TEST_REQUIRES_X86_SSE2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U20, elements_div_20) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 40; elements < 200; elements += 20) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U20, elements_lt_20) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 1; elements < 20; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U20, elements_gt_20) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 21; elements < 40; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U20_ACC2, elements_eq_20) { - 
TEST_REQUIRES_X86_SSE2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U20_ACC2, elements_div_20) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 40; elements < 200; elements += 20) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U20_ACC2, elements_lt_20) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 1; elements < 20; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U20_ACC2, elements_gt_20) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 21; elements < 40; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc2, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U20_ACC5, elements_eq_20) { - TEST_REQUIRES_X86_SSE2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc5, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U20_ACC5, elements_div_20) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 40; elements < 200; elements += 20) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc5, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U20_ACC5, elements_lt_20) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 1; elements < 20; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc5, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U20_ACC5, elements_gt_20) { - TEST_REQUIRES_X86_SSE2; - for (size_t elements = 21; elements < 40; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc5, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U4, elements_eq_4) { - TEST_REQUIRES_X86_AVX; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(4) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u4, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U4, elements_div_4) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 8; elements < 40; elements += 4) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U4, elements_lt_4) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 1; elements < 4; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U4, elements_gt_4) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 5; elements < 8; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u4, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U8, elements_eq_8) { - TEST_REQUIRES_X86_AVX; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(8) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U8, elements_div_8) { - TEST_REQUIRES_X86_AVX; - for (size_t elements 
= 16; elements < 80; elements += 8) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U8, elements_lt_8) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 1; elements < 8; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U8, elements_gt_8) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 9; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U8_ACC2, elements_eq_8) { - TEST_REQUIRES_X86_AVX; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(8) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U8_ACC2, elements_div_8) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 16; elements < 80; elements += 8) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U8_ACC2, elements_lt_8) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 1; elements < 8; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U8_ACC2, elements_gt_8) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 9; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8_acc2, nullptr); - } - } -#endif 
// XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12, elements_eq_12) { - TEST_REQUIRES_X86_AVX; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12, elements_div_12) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 24; elements < 120; elements += 12) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12, elements_lt_12) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 1; elements < 12; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12, elements_gt_12) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 13; elements < 24; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12_ACC2, elements_eq_12) { - TEST_REQUIRES_X86_AVX; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12_ACC2, elements_div_12) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 24; elements < 120; elements += 12) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12_ACC2, elements_lt_12) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 1; elements < 12; elements++) { - 
RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12_ACC2, elements_gt_12) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 13; elements < 24; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc2, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12_ACC3, elements_eq_12) { - TEST_REQUIRES_X86_AVX; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc3, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12_ACC3, elements_div_12) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 24; elements < 120; elements += 12) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc3, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12_ACC3, elements_lt_12) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 1; elements < 12; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc3, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U12_ACC3, elements_gt_12) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 13; elements < 24; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc3, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16, elements_eq_16) { - TEST_REQUIRES_X86_AVX; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16, 
nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16, elements_div_16) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 32; elements < 160; elements += 16) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16, elements_lt_16) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 1; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16, elements_gt_16) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 17; elements < 32; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16_ACC2, elements_eq_16) { - TEST_REQUIRES_X86_AVX; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16_ACC2, elements_div_16) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 32; elements < 160; elements += 16) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16_ACC2, elements_lt_16) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 1; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16_ACC2, elements_gt_16) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 17; elements < 32; elements++) 
{ - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc2, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16_ACC4, elements_eq_16) { - TEST_REQUIRES_X86_AVX; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc4, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16_ACC4, elements_div_16) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 32; elements < 160; elements += 16) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16_ACC4, elements_lt_16) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 1; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U16_ACC4, elements_gt_16) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 17; elements < 32; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc4, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20, elements_eq_20) { - TEST_REQUIRES_X86_AVX; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20, elements_div_20) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 40; elements < 200; elements += 20) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20, nullptr); - 
} - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20, elements_lt_20) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 1; elements < 20; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20, elements_gt_20) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 21; elements < 40; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20_ACC2, elements_eq_20) { - TEST_REQUIRES_X86_AVX; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20_ACC2, elements_div_20) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 40; elements < 200; elements += 20) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20_ACC2, elements_lt_20) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 1; elements < 20; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20_ACC2, elements_gt_20) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 21; elements < 40; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc2, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20_ACC5, elements_eq_20) { - 
TEST_REQUIRES_X86_AVX; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc5, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20_ACC5, elements_div_20) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 40; elements < 200; elements += 20) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc5, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20_ACC5, elements_lt_20) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 1; elements < 20; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc5, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX_RR2_P5_U20_ACC5, elements_gt_20) { - TEST_REQUIRES_X86_AVX; - for (size_t elements = 21; elements < 40; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc5, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32, elements_eq_32) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(32) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32, elements_div_32) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 64; elements < 320; elements += 32) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32, elements_lt_32) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 32; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32, nullptr); - } - 
} - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32, elements_gt_32) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 33; elements < 64; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32_ACC2, elements_eq_32) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(32) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32_ACC2, elements_div_32) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 64; elements < 320; elements += 32) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32_ACC2, elements_lt_32) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 32; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32_ACC2, elements_gt_32) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 33; elements < 64; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc2, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32_ACC4, elements_eq_32) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(32) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc4, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32_ACC4, elements_div_32) { - TEST_REQUIRES_X86_AVX2; - for (size_t 
elements = 64; elements < 320; elements += 32) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32_ACC4, elements_lt_32) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 32; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32_ACC4, elements_gt_32) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 33; elements < 64; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc4, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U64, elements_eq_64) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(64) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U64, elements_div_64) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 128; elements < 640; elements += 64) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U64, elements_lt_64) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 64; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U64, elements_gt_64) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 65; elements < 128; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U64_ACC2, elements_eq_64) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(64) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U64_ACC2, elements_div_64) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 128; elements < 640; elements += 64) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U64_ACC2, elements_lt_64) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 64; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U64_ACC2, elements_gt_64) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 65; elements < 128; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc2, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U64_ACC4, elements_eq_64) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(64) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc4, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U64_ACC4, elements_div_64) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 128; elements < 640; elements += 64) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc4, nullptr); - } - } - - 
TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U64_ACC4, elements_lt_64) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 64; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U64_ACC4, elements_gt_64) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 65; elements < 128; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc4, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U72, elements_eq_72) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(72) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u72, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U72, elements_div_72) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 144; elements < 720; elements += 72) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u72, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U72, elements_lt_72) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 72; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u72, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U72, elements_gt_72) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 73; elements < 144; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u72, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U72_ACC3, elements_eq_72) { - 
TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(72) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u72_acc3, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U72_ACC3, elements_div_72) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 144; elements < 720; elements += 72) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u72_acc3, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U72_ACC3, elements_lt_72) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 72; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u72_acc3, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U72_ACC3, elements_gt_72) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 73; elements < 144; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u72_acc3, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U80, elements_eq_80) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(80) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U80, elements_div_80) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 160; elements < 800; elements += 80) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U80, elements_lt_80) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 80; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80, 
nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U80, elements_gt_80) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 81; elements < 160; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U80_ACC2, elements_eq_80) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(80) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U80_ACC2, elements_div_80) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 160; elements < 800; elements += 80) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U80_ACC2, elements_lt_80) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 80; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U80_ACC2, elements_gt_80) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 81; elements < 160; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80_acc2, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U80_ACC5, elements_eq_80) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(80) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80_acc5, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U80_ACC5, elements_div_80) { - TEST_REQUIRES_X86_AVX2; 
- for (size_t elements = 160; elements < 800; elements += 80) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80_acc5, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U80_ACC5, elements_lt_80) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 80; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80_acc5, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U80_ACC5, elements_gt_80) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 81; elements < 160; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80_acc5, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U96, elements_eq_96) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(96) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U96, elements_div_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 192; elements < 960; elements += 96) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U96, elements_lt_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 96; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U96, elements_gt_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 97; elements < 192; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U96_ACC2, elements_eq_96) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(96) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U96_ACC2, elements_div_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 192; elements < 960; elements += 96) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U96_ACC2, elements_lt_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 96; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U96_ACC2, elements_gt_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 97; elements < 192; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc2, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U96_ACC3, elements_eq_96) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(96) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc3, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U96_ACC3, elements_div_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 192; elements < 960; elements += 96) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc3, nullptr); - } - } - - 
TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U96_ACC3, elements_lt_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 96; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc3, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U96_ACC3, elements_gt_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 97; elements < 192; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc3, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U96_ACC6, elements_eq_96) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(96) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc6, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U96_ACC6, elements_div_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 192; elements < 960; elements += 96) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc6, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U96_ACC6, elements_lt_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 96; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc6, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U96_ACC6, elements_gt_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 97; elements < 192; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc6, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - 
TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32, elements_eq_32) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(32) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32, elements_div_32) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 64; elements < 320; elements += 32) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32, elements_lt_32) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 32; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32, elements_gt_32) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 33; elements < 64; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC2, elements_eq_32) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(32) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC2, elements_div_32) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 64; elements < 320; elements += 32) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC2, elements_lt_32) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 32; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC2, elements_gt_32) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 33; elements < 64; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC4, elements_eq_32) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(32) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC4, elements_div_32) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 64; elements < 320; elements += 32) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC4, elements_lt_32) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 32; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC4, elements_gt_32) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 33; elements < 64; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64, elements_eq_64) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(64) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64, nullptr); - } - - 
TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64, elements_div_64) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 128; elements < 640; elements += 64) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64, elements_lt_64) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 64; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64, elements_gt_64) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 65; elements < 128; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64_ACC2, elements_eq_64) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(64) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64_ACC2, elements_div_64) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 128; elements < 640; elements += 64) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64_ACC2, elements_lt_64) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 64; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64_ACC2, elements_gt_64) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 65; elements < 128; 
elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc2, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64_ACC4, elements_eq_64) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(64) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc4, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64_ACC4, elements_div_64) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 128; elements < 640; elements += 64) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64_ACC4, elements_lt_64) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 64; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U64_ACC4, elements_gt_64) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 65; elements < 128; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc4, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U72, elements_eq_72) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(72) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U72, elements_div_72) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 144; elements < 720; elements += 72) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U72, elements_lt_72) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 72; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U72, elements_gt_72) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 73; elements < 144; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U72_ACC3, elements_eq_72) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(72) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72_acc3, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U72_ACC3, elements_div_72) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 144; elements < 720; elements += 72) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72_acc3, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U72_ACC3, elements_lt_72) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 72; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72_acc3, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U72_ACC3, elements_gt_72) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 73; elements < 144; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72_acc3, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 
|| XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80, elements_eq_80) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(80) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80, elements_div_80) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 160; elements < 800; elements += 80) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80, elements_lt_80) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 80; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80, elements_gt_80) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 81; elements < 160; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80_ACC2, elements_eq_80) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(80) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80_ACC2, elements_div_80) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 160; elements < 800; elements += 80) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80_ACC2, elements_lt_80) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 80; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - 
.elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80_ACC2, elements_gt_80) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 81; elements < 160; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc2, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80_ACC5, elements_eq_80) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(80) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc5, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80_ACC5, elements_div_80) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 160; elements < 800; elements += 80) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc5, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80_ACC5, elements_lt_80) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 80; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc5, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U80_ACC5, elements_gt_80) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 81; elements < 160; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc5, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96, elements_eq_96) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(96) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96, nullptr); - } - - 
TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96, elements_div_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 192; elements < 960; elements += 96) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96, elements_lt_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 96; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96, elements_gt_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 97; elements < 192; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC2, elements_eq_96) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(96) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC2, elements_div_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 192; elements < 960; elements += 96) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC2, elements_lt_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 96; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC2, elements_gt_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 97; elements < 192; 
elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc2, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC3, elements_eq_96) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(96) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc3, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC3, elements_div_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 192; elements < 960; elements += 96) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc3, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC3, elements_lt_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 96; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc3, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC3, elements_gt_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 97; elements < 192; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc3, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC6, elements_eq_96) { - TEST_REQUIRES_X86_AVX2; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(96) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc6, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC6, elements_div_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 192; elements < 960; elements += 96) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc6, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC6, elements_lt_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 1; elements < 96; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc6, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U96_ACC6, elements_gt_96) { - TEST_REQUIRES_X86_AVX2; - for (size_t elements = 97; elements < 192; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc6, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64, elements_eq_64) { - TEST_REQUIRES_X86_AVX512F; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(64) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64, elements_div_64) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 128; elements < 640; elements += 64) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64, elements_lt_64) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 1; elements < 64; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64, elements_gt_64) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 65; elements < 128; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64, 
nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64_ACC2, elements_eq_64) { - TEST_REQUIRES_X86_AVX512F; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(64) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64_ACC2, elements_div_64) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 128; elements < 640; elements += 64) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64_ACC2, elements_lt_64) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 1; elements < 64; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64_ACC2, elements_gt_64) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 65; elements < 128; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64_ACC4, elements_eq_64) { - TEST_REQUIRES_X86_AVX512F; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(64) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc4, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64_ACC4, elements_div_64) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 128; elements < 640; elements += 64) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64_ACC4, elements_lt_64) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 1; elements < 64; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64_ACC4, elements_gt_64) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 65; elements < 128; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc4, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U128, elements_eq_128) { - TEST_REQUIRES_X86_AVX512F; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(128) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U128, elements_div_128) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 256; elements < 1280; elements += 128) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U128, elements_lt_128) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 1; elements < 128; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U128, elements_gt_128) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 129; elements < 256; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U128_ACC2, elements_eq_128) { - TEST_REQUIRES_X86_AVX512F; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(128) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U128_ACC2, elements_div_128) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 256; elements < 1280; elements += 128) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U128_ACC2, elements_lt_128) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 1; elements < 128; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U128_ACC2, elements_gt_128) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 129; elements < 256; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128_acc2, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U128_ACC4, elements_eq_128) { - TEST_REQUIRES_X86_AVX512F; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(128) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128_acc4, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U128_ACC4, elements_div_128) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 256; elements < 1280; elements += 128) { - 
RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128_acc4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U128_ACC4, elements_lt_128) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 1; elements < 128; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128_acc4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U128_ACC4, elements_gt_128) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 129; elements < 256; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128_acc4, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U144, elements_eq_144) { - TEST_REQUIRES_X86_AVX512F; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(144) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u144, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U144, elements_div_144) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 288; elements < 1440; elements += 144) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u144, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U144, elements_lt_144) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 1; elements < 144; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u144, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U144, elements_gt_144) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 145; elements < 288; elements++) 
{ - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u144, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U144_ACC3, elements_eq_144) { - TEST_REQUIRES_X86_AVX512F; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(144) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u144_acc3, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U144_ACC3, elements_div_144) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 288; elements < 1440; elements += 144) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u144_acc3, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U144_ACC3, elements_lt_144) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 1; elements < 144; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u144_acc3, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U144_ACC3, elements_gt_144) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 145; elements < 288; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u144_acc3, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U160, elements_eq_160) { - TEST_REQUIRES_X86_AVX512F; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(160) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U160, elements_div_160) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements 
= 320; elements < 1600; elements += 160) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U160, elements_lt_160) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 1; elements < 160; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U160, elements_gt_160) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 161; elements < 320; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U160_ACC2, elements_eq_160) { - TEST_REQUIRES_X86_AVX512F; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(160) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U160_ACC2, elements_div_160) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 320; elements < 1600; elements += 160) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U160_ACC2, elements_lt_160) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 1; elements < 160; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U160_ACC2, elements_gt_160) { - TEST_REQUIRES_X86_AVX512F; - 
for (size_t elements = 161; elements < 320; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160_acc2, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U160_ACC5, elements_eq_160) { - TEST_REQUIRES_X86_AVX512F; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(160) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160_acc5, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U160_ACC5, elements_div_160) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 320; elements < 1600; elements += 160) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160_acc5, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U160_ACC5, elements_lt_160) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 1; elements < 160; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160_acc5, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U160_ACC5, elements_gt_160) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 161; elements < 320; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160_acc5, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U192, elements_eq_192) { - TEST_REQUIRES_X86_AVX512F; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(192) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U192, 
elements_div_192) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 384; elements < 1920; elements += 192) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U192, elements_lt_192) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 1; elements < 192; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U192, elements_gt_192) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 193; elements < 384; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U192_ACC2, elements_eq_192) { - TEST_REQUIRES_X86_AVX512F; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(192) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc2, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U192_ACC2, elements_div_192) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 384; elements < 1920; elements += 192) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc2, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U192_ACC2, elements_lt_192) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 1; elements < 192; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc2, nullptr); - } - } - - 
TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U192_ACC2, elements_gt_192) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 193; elements < 384; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc2, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U192_ACC3, elements_eq_192) { - TEST_REQUIRES_X86_AVX512F; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(192) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc3, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U192_ACC3, elements_div_192) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 384; elements < 1920; elements += 192) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc3, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U192_ACC3, elements_lt_192) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 1; elements < 192; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc3, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U192_ACC3, elements_gt_192) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 193; elements < 384; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc3, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U192_ACC6, elements_eq_192) { - TEST_REQUIRES_X86_AVX512F; - RAddStoreExpMinusMaxMicrokernelTester() - .elements(192) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc6, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U192_ACC6, elements_div_192) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 384; elements < 1920; elements += 192) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc6, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U192_ACC6, elements_lt_192) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 1; elements < 192; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc6, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U192_ACC6, elements_gt_192) { - TEST_REQUIRES_X86_AVX512F; - for (size_t elements = 193; elements < 384; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc6, nullptr); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U4, elements_eq_4) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(4) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u4, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U4, elements_div_4) { - for (size_t elements = 8; elements < 40; elements += 4) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u4, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U4, elements_lt_4) { - for (size_t elements = 1; elements < 4; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u4, nullptr); - } - } - - 
TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U4, elements_gt_4) { - for (size_t elements = 5; elements < 8; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u4, nullptr); - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U8, elements_eq_8) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(8) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U8, elements_div_8) { - for (size_t elements = 16; elements < 80; elements += 8) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U8, elements_lt_8) { - for (size_t elements = 1; elements < 8; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U8, elements_gt_8) { - for (size_t elements = 9; elements < 16; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8, nullptr); - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U8_ACC2, elements_eq_8) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U16_ACC4, elements_eq_16) { + TEST_REQUIRES_ARM_NEON_FMA; RAddStoreExpMinusMaxMicrokernelTester() - .elements(8) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8_acc2, nullptr); + .elements(16) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc4, nullptr); } - 
TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U8_ACC2, elements_div_8) { - for (size_t elements = 16; elements < 80; elements += 8) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U16_ACC4, elements_div_16) { + TEST_REQUIRES_ARM_NEON_FMA; + for (size_t elements = 32; elements < 160; elements += 16) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U8_ACC2, elements_lt_8) { - for (size_t elements = 1; elements < 8; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U16_ACC4, elements_lt_16) { + TEST_REQUIRES_ARM_NEON_FMA; + for (size_t elements = 1; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U8_ACC2, elements_gt_8) { - for (size_t elements = 9; elements < 16; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__NEONFMA_RR1_P5_U16_ACC4, elements_gt_16) { + TEST_REQUIRES_ARM_NEON_FMA; + for (size_t elements = 17; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc4, nullptr); } } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U12, elements_eq_12) { +#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV + TEST(F32_RADDSTOREEXPMINUSMAX__RVV_RR2_P6_U2V, elements_eq_2v) { + TEST_REQUIRES_RISCV_VECTOR; RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12, nullptr); + .elements(2 * xnn_init_hardware_config()->vlenb / sizeof(float)) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u2v, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U12, elements_div_12) { - for (size_t elements = 24; elements < 120; elements += 12) { + TEST(F32_RADDSTOREEXPMINUSMAX__RVV_RR2_P6_U2V, elements_div_2v) { + TEST_REQUIRES_RISCV_VECTOR; + for (size_t elements = 4 * xnn_init_hardware_config()->vlenb / sizeof(float); + elements < 20 * xnn_init_hardware_config()->vlenb / sizeof(float); + elements += 2 * xnn_init_hardware_config()->vlenb / sizeof(float)) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u2v, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U12, elements_lt_12) { - for (size_t elements = 1; elements < 12; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__RVV_RR2_P6_U2V, elements_lt_2v) { + TEST_REQUIRES_RISCV_VECTOR; + for (size_t elements = 1; + elements < 2 * xnn_init_hardware_config()->vlenb / sizeof(float); + elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u2v, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U12, elements_gt_12) { - for (size_t elements = 13; elements < 24; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__RVV_RR2_P6_U2V, elements_gt_2v) { + TEST_REQUIRES_RISCV_VECTOR; + for (size_t elements = 2 * xnn_init_hardware_config()->vlenb / sizeof(float) + 1; + elements < 4 * xnn_init_hardware_config()->vlenb / sizeof(float); + elements += 4) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12, nullptr); + 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u2v, nullptr); } } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U12_ACC2, elements_eq_12) { +#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV + TEST(F32_RADDSTOREEXPMINUSMAX__RVV_RR2_P6_U4V, elements_eq_4v) { + TEST_REQUIRES_RISCV_VECTOR; RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12_acc2, nullptr); + .elements(4 * xnn_init_hardware_config()->vlenb / sizeof(float)) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u4v, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U12_ACC2, elements_div_12) { - for (size_t elements = 24; elements < 120; elements += 12) { + TEST(F32_RADDSTOREEXPMINUSMAX__RVV_RR2_P6_U4V, elements_div_4v) { + TEST_REQUIRES_RISCV_VECTOR; + for (size_t elements = 8 * xnn_init_hardware_config()->vlenb / sizeof(float); + elements < 40 * xnn_init_hardware_config()->vlenb / sizeof(float); + elements += 4 * xnn_init_hardware_config()->vlenb / sizeof(float)) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u4v, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U12_ACC2, elements_lt_12) { - for (size_t elements = 1; elements < 12; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__RVV_RR2_P6_U4V, elements_lt_4v) { + TEST_REQUIRES_RISCV_VECTOR; + for (size_t elements = 1; + elements < 4 * xnn_init_hardware_config()->vlenb / sizeof(float); + elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u4v, nullptr); } } - 
TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U12_ACC2, elements_gt_12) { - for (size_t elements = 13; elements < 24; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__RVV_RR2_P6_U4V, elements_gt_4v) { + TEST_REQUIRES_RISCV_VECTOR; + for (size_t elements = 4 * xnn_init_hardware_config()->vlenb / sizeof(float) + 1; + elements < 8 * xnn_init_hardware_config()->vlenb / sizeof(float); + elements += 8) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u4v, nullptr); } } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U12_ACC3, elements_eq_12) { +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U4, elements_eq_4) { + TEST_REQUIRES_X86_SSE2; RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12_acc3, nullptr); + .elements(4) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u4, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U12_ACC3, elements_div_12) { - for (size_t elements = 24; elements < 120; elements += 12) { + TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U4, elements_div_4) { + TEST_REQUIRES_X86_SSE2; + for (size_t elements = 8; elements < 40; elements += 4) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12_acc3, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U12_ACC3, elements_lt_12) { - for (size_t elements = 1; elements < 12; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U4, elements_lt_4) { + TEST_REQUIRES_X86_SSE2; + for (size_t elements = 1; elements < 4; 
elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12_acc3, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U12_ACC3, elements_gt_12) { - for (size_t elements = 13; elements < 24; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U4, elements_gt_4) { + TEST_REQUIRES_X86_SSE2; + for (size_t elements = 5; elements < 8; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12_acc3, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u4, nullptr); } } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U16, elements_eq_16) { +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U8_ACC2, elements_eq_8) { + TEST_REQUIRES_X86_SSE2; RAddStoreExpMinusMaxMicrokernelTester() - .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16, nullptr); + .elements(8) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U16, elements_div_16) { - for (size_t elements = 32; elements < 160; elements += 16) { + TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U8_ACC2, elements_div_8) { + TEST_REQUIRES_X86_SSE2; + for (size_t elements = 16; elements < 80; elements += 8) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U16, elements_lt_16) { - for (size_t elements = 1; elements < 16; elements++) { + 
TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U8_ACC2, elements_lt_8) { + TEST_REQUIRES_X86_SSE2; + for (size_t elements = 1; elements < 8; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U16, elements_gt_16) { - for (size_t elements = 17; elements < 32; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U8_ACC2, elements_gt_8) { + TEST_REQUIRES_X86_SSE2; + for (size_t elements = 9; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8_acc2, nullptr); } } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U16_ACC2, elements_eq_16) { +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U16_ACC2, elements_eq_16) { + TEST_REQUIRES_X86_SSE2; RAddStoreExpMinusMaxMicrokernelTester() .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U16_ACC2, elements_div_16) { + TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U16_ACC2, elements_div_16) { + TEST_REQUIRES_X86_SSE2; for (size_t elements = 32; elements < 160; elements += 16) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U16_ACC2, elements_lt_16) { + 
TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U16_ACC2, elements_lt_16) { + TEST_REQUIRES_X86_SSE2; for (size_t elements = 1; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U16_ACC2, elements_gt_16) { + TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U16_ACC2, elements_gt_16) { + TEST_REQUIRES_X86_SSE2; for (size_t elements = 17; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc2, nullptr); } } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U16_ACC4, elements_eq_16) { +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U16_ACC4, elements_eq_16) { + TEST_REQUIRES_X86_SSE2; RAddStoreExpMinusMaxMicrokernelTester() .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc4, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc4, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U16_ACC4, elements_div_16) { + TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U16_ACC4, elements_div_16) { + TEST_REQUIRES_X86_SSE2; for (size_t elements = 32; elements < 160; elements += 16) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc4, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U16_ACC4, elements_lt_16) { + 
TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U16_ACC4, elements_lt_16) { + TEST_REQUIRES_X86_SSE2; for (size_t elements = 1; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc4, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U16_ACC4, elements_gt_16) { + TEST(F32_RADDSTOREEXPMINUSMAX__SSE2_RR2_P5_U16_ACC4, elements_gt_16) { + TEST_REQUIRES_X86_SSE2; for (size_t elements = 17; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc4, nullptr); - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U20, elements_eq_20) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20, nullptr); - } - - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U20, elements_div_20) { - for (size_t elements = 40; elements < 200; elements += 20) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U20, elements_lt_20) { - for (size_t elements = 1; elements < 20; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20, nullptr); - } - } - - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U20, elements_gt_20) { - for (size_t elements = 21; elements < 40; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc4, nullptr); 
} } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U20_ACC2, elements_eq_20) { +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U8, elements_eq_8) { + TEST_REQUIRES_X86_AVX2; RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20_acc2, nullptr); + .elements(8) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u8, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U20_ACC2, elements_div_20) { - for (size_t elements = 40; elements < 200; elements += 20) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U8, elements_div_8) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 16; elements < 80; elements += 8) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u8, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U20_ACC2, elements_lt_20) { - for (size_t elements = 1; elements < 20; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U8, elements_lt_8) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 8; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u8, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U20_ACC2, elements_gt_20) { - for (size_t elements = 21; elements < 40; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U8, elements_gt_8) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 9; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u8, nullptr); } } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U20_ACC5, elements_eq_20) { +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U16_ACC2, elements_eq_16) { + TEST_REQUIRES_X86_AVX2; RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20_acc5, nullptr); + .elements(16) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u16_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U20_ACC5, elements_div_20) { - for (size_t elements = 40; elements < 200; elements += 20) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U16_ACC2, elements_div_16) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 32; elements < 160; elements += 16) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20_acc5, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u16_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U20_ACC5, elements_lt_20) { - for (size_t elements = 1; elements < 20; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U16_ACC2, elements_lt_16) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20_acc5, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u16_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U20_ACC5, elements_gt_20) { - for (size_t elements = 21; elements < 40; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U16_ACC2, 
elements_gt_16) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 17; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20_acc5, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u16_acc2, nullptr); } } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U4, elements_eq_4) { +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32_ACC2, elements_eq_32) { + TEST_REQUIRES_X86_AVX2; RAddStoreExpMinusMaxMicrokernelTester() - .elements(4) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u4, nullptr); + .elements(32) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U4, elements_div_4) { - for (size_t elements = 8; elements < 40; elements += 4) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32_ACC2, elements_div_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 64; elements < 320; elements += 32) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u4, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U4, elements_lt_4) { - for (size_t elements = 1; elements < 4; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32_ACC2, elements_lt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u4, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc2, nullptr); } } - 
TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U4, elements_gt_4) { - for (size_t elements = 5; elements < 8; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32_ACC2, elements_gt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 33; elements < 64; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u4, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc2, nullptr); } } -#endif // XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U8, elements_eq_8) { +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32_ACC4, elements_eq_32) { + TEST_REQUIRES_X86_AVX2; RAddStoreExpMinusMaxMicrokernelTester() - .elements(8) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u8, nullptr); + .elements(32) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc4, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U8, elements_div_8) { - for (size_t elements = 16; elements < 80; elements += 8) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32_ACC4, elements_div_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 64; elements < 320; elements += 32) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u8, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U8, elements_lt_8) { - for (size_t elements = 1; elements < 8; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32_ACC4, elements_lt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u8, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U8, elements_gt_8) { - for (size_t elements = 9; elements < 16; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR1_P5_U32_ACC4, elements_gt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 33; elements < 64; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u8, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc4, nullptr); } } -#endif // XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U8_ACC2, elements_eq_8) { +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U8, elements_eq_8) { + TEST_REQUIRES_X86_AVX2; RAddStoreExpMinusMaxMicrokernelTester() .elements(8) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u8_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u8, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U8_ACC2, elements_div_8) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U8, elements_div_8) { + TEST_REQUIRES_X86_AVX2; for (size_t elements = 16; elements < 80; elements += 8) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u8_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u8, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U8_ACC2, elements_lt_8) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U8, elements_lt_8) { + TEST_REQUIRES_X86_AVX2; for (size_t elements = 1; elements < 8; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u8_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u8, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U8_ACC2, elements_gt_8) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U8, elements_gt_8) { + TEST_REQUIRES_X86_AVX2; for (size_t elements = 9; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u8_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u8, nullptr); } } -#endif // XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U12, elements_eq_12) { +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U16_ACC2, elements_eq_16) { + TEST_REQUIRES_X86_AVX2; RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12, nullptr); + .elements(16) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u16_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U12, elements_div_12) { - for (size_t elements = 24; elements < 120; elements += 12) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U16_ACC2, elements_div_16) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 32; elements < 160; elements += 16) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u16_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U12, elements_lt_12) { - for (size_t elements = 1; elements < 12; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U16_ACC2, elements_lt_16) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 16; elements++) { 
RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u16_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U12, elements_gt_12) { - for (size_t elements = 13; elements < 24; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U16_ACC2, elements_gt_16) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 17; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u16_acc2, nullptr); } } -#endif // XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U12_ACC2, elements_eq_12) { +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC2, elements_eq_32) { + TEST_REQUIRES_X86_AVX2; RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12_acc2, nullptr); + .elements(32) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U12_ACC2, elements_div_12) { - for (size_t elements = 24; elements < 120; elements += 12) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC2, elements_div_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 64; elements < 320; elements += 32) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U12_ACC2, elements_lt_12) { - for (size_t elements = 1; elements < 12; elements++) { 
+ TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC2, elements_lt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U12_ACC2, elements_gt_12) { - for (size_t elements = 13; elements < 24; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC2, elements_gt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 33; elements < 64; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2, nullptr); } } -#endif // XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U12_ACC3, elements_eq_12) { +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC4, elements_eq_32) { + TEST_REQUIRES_X86_AVX2; RAddStoreExpMinusMaxMicrokernelTester() - .elements(12) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12_acc3, nullptr); + .elements(32) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U12_ACC3, elements_div_12) { - for (size_t elements = 24; elements < 120; elements += 12) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC4, elements_div_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 64; elements < 320; elements += 32) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12_acc3, nullptr); + 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U12_ACC3, elements_lt_12) { - for (size_t elements = 1; elements < 12; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC4, elements_lt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 1; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12_acc3, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U12_ACC3, elements_gt_12) { - for (size_t elements = 13; elements < 24; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX2_RR2_P5_U32_ACC4, elements_gt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t elements = 33; elements < 64; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12_acc3, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4, nullptr); } } -#endif // XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U16, elements_eq_16) { +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U16, elements_eq_16) { + TEST_REQUIRES_X86_AVX512F; RAddStoreExpMinusMaxMicrokernelTester() .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u16, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U16, elements_div_16) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U16, elements_div_16) { + TEST_REQUIRES_X86_AVX512F; for (size_t elements = 32; elements < 160; elements += 16) { RAddStoreExpMinusMaxMicrokernelTester() 
.elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u16, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U16, elements_lt_16) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U16, elements_lt_16) { + TEST_REQUIRES_X86_AVX512F; for (size_t elements = 1; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u16, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U16, elements_gt_16) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U16, elements_gt_16) { + TEST_REQUIRES_X86_AVX512F; for (size_t elements = 17; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u16, nullptr); } } -#endif // XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U16_ACC2, elements_eq_16) { +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U32_ACC2, elements_eq_32) { + TEST_REQUIRES_X86_AVX512F; RAddStoreExpMinusMaxMicrokernelTester() - .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc2, nullptr); + .elements(32) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u32_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U16_ACC2, elements_div_16) { - for (size_t elements = 32; elements < 160; elements += 16) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U32_ACC2, elements_div_32) { + TEST_REQUIRES_X86_AVX512F; + for 
(size_t elements = 64; elements < 320; elements += 32) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u32_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U16_ACC2, elements_lt_16) { - for (size_t elements = 1; elements < 16; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U32_ACC2, elements_lt_32) { + TEST_REQUIRES_X86_AVX512F; + for (size_t elements = 1; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u32_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U16_ACC2, elements_gt_16) { - for (size_t elements = 17; elements < 32; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U32_ACC2, elements_gt_32) { + TEST_REQUIRES_X86_AVX512F; + for (size_t elements = 33; elements < 64; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u32_acc2, nullptr); } } -#endif // XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U16_ACC4, elements_eq_16) { +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64_ACC2, elements_eq_64) { + TEST_REQUIRES_X86_AVX512F; RAddStoreExpMinusMaxMicrokernelTester() - .elements(16) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc4, nullptr); + .elements(64) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2, nullptr); } - 
TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U16_ACC4, elements_div_16) { - for (size_t elements = 32; elements < 160; elements += 16) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64_ACC2, elements_div_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t elements = 128; elements < 640; elements += 64) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc4, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U16_ACC4, elements_lt_16) { - for (size_t elements = 1; elements < 16; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64_ACC2, elements_lt_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t elements = 1; elements < 64; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc4, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U16_ACC4, elements_gt_16) { - for (size_t elements = 17; elements < 32; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64_ACC2, elements_gt_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t elements = 65; elements < 128; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc4, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2, nullptr); } } -#endif // XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U20, elements_eq_20) { +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64_ACC4, elements_eq_64) { + 
TEST_REQUIRES_X86_AVX512F; RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20, nullptr); + .elements(64) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc4, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U20, elements_div_20) { - for (size_t elements = 40; elements < 200; elements += 20) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64_ACC4, elements_div_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t elements = 128; elements < 640; elements += 64) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U20, elements_lt_20) { - for (size_t elements = 1; elements < 20; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64_ACC4, elements_lt_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t elements = 1; elements < 64; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U20, elements_gt_20) { - for (size_t elements = 21; elements < 40; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR1_P5_SCALEF_U64_ACC4, elements_gt_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t elements = 65; elements < 128; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc4, nullptr); } } -#endif // XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if 
XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U20_ACC2, elements_eq_20) { +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U4, elements_eq_4) { RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20_acc2, nullptr); + .elements(4) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u4, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U20_ACC2, elements_div_20) { - for (size_t elements = 40; elements < 200; elements += 20) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U4, elements_div_4) { + for (size_t elements = 8; elements < 40; elements += 4) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U20_ACC2, elements_lt_20) { - for (size_t elements = 1; elements < 20; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U4, elements_lt_4) { + for (size_t elements = 1; elements < 4; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U20_ACC2, elements_gt_20) { - for (size_t elements = 21; elements < 40; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U4, elements_gt_4) { + for (size_t elements = 5; elements < 8; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u4, nullptr); } } -#endif // 
XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U20_ACC5, elements_eq_20) { +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U8_ACC2, elements_eq_8) { RAddStoreExpMinusMaxMicrokernelTester() - .elements(20) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20_acc5, nullptr); + .elements(8) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U20_ACC5, elements_div_20) { - for (size_t elements = 40; elements < 200; elements += 20) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U8_ACC2, elements_div_8) { + for (size_t elements = 16; elements < 80; elements += 8) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20_acc5, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U20_ACC5, elements_lt_20) { - for (size_t elements = 1; elements < 20; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U8_ACC2, elements_lt_8) { + for (size_t elements = 1; elements < 8; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20_acc5, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U20_ACC5, elements_gt_20) { - for (size_t elements = 21; elements < 40; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U8_ACC2, elements_gt_8) { + for (size_t elements = 9; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20_acc5, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8_acc2, nullptr); } } -#endif // XNN_ARCH_WASMRELAXEDSIMD +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -#if XNN_ENABLE_HVX && XNN_ARCH_HEXAGON - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U32, elements_eq_32) { - TEST_REQUIRES_HVX; +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U16_ACC2, elements_eq_16) { RAddStoreExpMinusMaxMicrokernelTester() - .elements(32) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u32, nullptr); + .elements(16) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U32, elements_div_32) { - TEST_REQUIRES_HVX; - for (size_t elements = 64; elements < 320; elements += 32) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U16_ACC2, elements_div_16) { + for (size_t elements = 32; elements < 160; elements += 16) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u32, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U32, elements_lt_32) { - TEST_REQUIRES_HVX; - for (size_t elements = 1; elements < 32; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U16_ACC2, elements_lt_16) { + for (size_t elements = 1; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u32, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U32, elements_gt_32) { - TEST_REQUIRES_HVX; - for (size_t elements = 33; elements < 64; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U16_ACC2, elements_gt_16) { + for (size_t 
elements = 17; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u32, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc2, nullptr); } } -#endif // XNN_ENABLE_HVX && XNN_ARCH_HEXAGON +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -#if XNN_ENABLE_HVX && XNN_ARCH_HEXAGON - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U64, elements_eq_64) { - TEST_REQUIRES_HVX; +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U16_ACC4, elements_eq_16) { RAddStoreExpMinusMaxMicrokernelTester() - .elements(64) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u64, nullptr); + .elements(16) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc4, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U64, elements_div_64) { - TEST_REQUIRES_HVX; - for (size_t elements = 128; elements < 640; elements += 64) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U16_ACC4, elements_div_16) { + for (size_t elements = 32; elements < 160; elements += 16) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u64, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U64, elements_lt_64) { - TEST_REQUIRES_HVX; - for (size_t elements = 1; elements < 64; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U16_ACC4, elements_lt_16) { + for (size_t elements = 1; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u64, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U64, elements_gt_64) { - TEST_REQUIRES_HVX; - for (size_t elements = 65; elements < 128; 
elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U16_ACC4, elements_gt_16) { + for (size_t elements = 17; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u64, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc4, nullptr); } } -#endif // XNN_ENABLE_HVX && XNN_ARCH_HEXAGON +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -#if XNN_ENABLE_HVX && XNN_ARCH_HEXAGON - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U64_ACC2, elements_eq_64) { - TEST_REQUIRES_HVX; +#if XNN_ARCH_WASMRELAXEDSIMD + TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U4, elements_eq_4) { RAddStoreExpMinusMaxMicrokernelTester() - .elements(64) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u64_acc2, nullptr); + .elements(4) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u4, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U64_ACC2, elements_div_64) { - TEST_REQUIRES_HVX; - for (size_t elements = 128; elements < 640; elements += 64) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U4, elements_div_4) { + for (size_t elements = 8; elements < 40; elements += 4) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u64_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U64_ACC2, elements_lt_64) { - TEST_REQUIRES_HVX; - for (size_t elements = 1; elements < 64; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U4, elements_lt_4) { + for (size_t elements = 1; elements < 4; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u64_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u4, nullptr); } } - 
TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U64_ACC2, elements_gt_64) { - TEST_REQUIRES_HVX; - for (size_t elements = 65; elements < 128; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U4, elements_gt_4) { + for (size_t elements = 5; elements < 8; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u64_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u4, nullptr); } } -#endif // XNN_ENABLE_HVX && XNN_ARCH_HEXAGON +#endif // XNN_ARCH_WASMRELAXEDSIMD -#if XNN_ENABLE_HVX && XNN_ARCH_HEXAGON - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U96, elements_eq_96) { - TEST_REQUIRES_HVX; +#if XNN_ARCH_WASMRELAXEDSIMD + TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U8_ACC2, elements_eq_8) { RAddStoreExpMinusMaxMicrokernelTester() - .elements(96) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96, nullptr); + .elements(8) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u8_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U96, elements_div_96) { - TEST_REQUIRES_HVX; - for (size_t elements = 192; elements < 960; elements += 96) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U8_ACC2, elements_div_8) { + for (size_t elements = 16; elements < 80; elements += 8) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u8_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U96, elements_lt_96) { - TEST_REQUIRES_HVX; - for (size_t elements = 1; elements < 96; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U8_ACC2, elements_lt_8) { + for (size_t elements = 1; elements < 8; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96, nullptr); + 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u8_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U96, elements_gt_96) { - TEST_REQUIRES_HVX; - for (size_t elements = 97; elements < 192; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U8_ACC2, elements_gt_8) { + for (size_t elements = 9; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u8_acc2, nullptr); } } -#endif // XNN_ENABLE_HVX && XNN_ARCH_HEXAGON +#endif // XNN_ARCH_WASMRELAXEDSIMD -#if XNN_ENABLE_HVX && XNN_ARCH_HEXAGON - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U96_ACC2, elements_eq_96) { - TEST_REQUIRES_HVX; +#if XNN_ARCH_WASMRELAXEDSIMD + TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U16_ACC2, elements_eq_16) { RAddStoreExpMinusMaxMicrokernelTester() - .elements(96) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96_acc2, nullptr); + .elements(16) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U96_ACC2, elements_div_96) { - TEST_REQUIRES_HVX; - for (size_t elements = 192; elements < 960; elements += 96) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U16_ACC2, elements_div_16) { + for (size_t elements = 32; elements < 160; elements += 16) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U96_ACC2, elements_lt_96) { - TEST_REQUIRES_HVX; - for (size_t elements = 1; elements < 96; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U16_ACC2, elements_lt_16) { + for (size_t elements = 1; elements < 16; elements++) { 
RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U96_ACC2, elements_gt_96) { - TEST_REQUIRES_HVX; - for (size_t elements = 97; elements < 192; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U16_ACC2, elements_gt_16) { + for (size_t elements = 17; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc2, nullptr); } } -#endif // XNN_ENABLE_HVX && XNN_ARCH_HEXAGON +#endif // XNN_ARCH_WASMRELAXEDSIMD -#if XNN_ENABLE_HVX && XNN_ARCH_HEXAGON - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U96_ACC3, elements_eq_96) { - TEST_REQUIRES_HVX; +#if XNN_ARCH_WASMRELAXEDSIMD + TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U16_ACC4, elements_eq_16) { RAddStoreExpMinusMaxMicrokernelTester() - .elements(96) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96_acc3, nullptr); + .elements(16) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc4, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U96_ACC3, elements_div_96) { - TEST_REQUIRES_HVX; - for (size_t elements = 192; elements < 960; elements += 96) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U16_ACC4, elements_div_16) { + for (size_t elements = 32; elements < 160; elements += 16) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96_acc3, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U96_ACC3, elements_lt_96) { - TEST_REQUIRES_HVX; - for (size_t elements = 1; elements < 96; 
elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U16_ACC4, elements_lt_16) { + for (size_t elements = 1; elements < 16; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96_acc3, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc4, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U96_ACC3, elements_gt_96) { - TEST_REQUIRES_HVX; - for (size_t elements = 97; elements < 192; elements++) { + TEST(F32_RADDSTOREEXPMINUSMAX__WASMRELAXEDSIMD_RR2_P5_U16_ACC4, elements_gt_16) { + for (size_t elements = 17; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96_acc3, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc4, nullptr); } } -#endif // XNN_ENABLE_HVX && XNN_ARCH_HEXAGON +#endif // XNN_ARCH_WASMRELAXEDSIMD #if XNN_ENABLE_HVX && XNN_ARCH_HEXAGON - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U128, elements_eq_128) { + TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U32, elements_eq_32) { TEST_REQUIRES_HVX; RAddStoreExpMinusMaxMicrokernelTester() - .elements(128) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128, nullptr); + .elements(32) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u32, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U128, elements_div_128) { + TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U32, elements_div_32) { TEST_REQUIRES_HVX; - for (size_t elements = 256; elements < 1280; elements += 128) { + for (size_t elements = 64; elements < 320; elements += 32) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u32, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U128, elements_lt_128) { + 
TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U32, elements_lt_32) { TEST_REQUIRES_HVX; - for (size_t elements = 1; elements < 128; elements++) { + for (size_t elements = 1; elements < 32; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u32, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U128, elements_gt_128) { + TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U32, elements_gt_32) { TEST_REQUIRES_HVX; - for (size_t elements = 129; elements < 256; elements++) { + for (size_t elements = 33; elements < 64; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u32, nullptr); } } #endif // XNN_ENABLE_HVX && XNN_ARCH_HEXAGON #if XNN_ENABLE_HVX && XNN_ARCH_HEXAGON - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U128_ACC2, elements_eq_128) { + TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U64_ACC2, elements_eq_64) { TEST_REQUIRES_HVX; RAddStoreExpMinusMaxMicrokernelTester() - .elements(128) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128_acc2, nullptr); + .elements(64) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u64_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U128_ACC2, elements_div_128) { + TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U64_ACC2, elements_div_64) { TEST_REQUIRES_HVX; - for (size_t elements = 256; elements < 1280; elements += 128) { + for (size_t elements = 128; elements < 640; elements += 64) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u64_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U128_ACC2, elements_lt_128) { + 
TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U64_ACC2, elements_lt_64) { TEST_REQUIRES_HVX; - for (size_t elements = 1; elements < 128; elements++) { + for (size_t elements = 1; elements < 64; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u64_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U128_ACC2, elements_gt_128) { + TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U64_ACC2, elements_gt_64) { TEST_REQUIRES_HVX; - for (size_t elements = 129; elements < 256; elements++) { + for (size_t elements = 65; elements < 128; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128_acc2, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u64_acc2, nullptr); } } #endif // XNN_ENABLE_HVX && XNN_ARCH_HEXAGON #if XNN_ENABLE_HVX && XNN_ARCH_HEXAGON - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U128_ACC3, elements_eq_128) { + TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U128_ACC2, elements_eq_128) { TEST_REQUIRES_HVX; RAddStoreExpMinusMaxMicrokernelTester() .elements(128) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128_acc3, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128_acc2, nullptr); } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U128_ACC3, elements_div_128) { + TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U128_ACC2, elements_div_128) { TEST_REQUIRES_HVX; for (size_t elements = 256; elements < 1280; elements += 128) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128_acc3, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U128_ACC3, elements_lt_128) { + TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U128_ACC2, elements_lt_128) { 
TEST_REQUIRES_HVX; for (size_t elements = 1; elements < 128; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128_acc3, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128_acc2, nullptr); } } - TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U128_ACC3, elements_gt_128) { + TEST(F32_RADDSTOREEXPMINUSMAX__HVX_RR2_P5_U128_ACC2, elements_gt_128) { TEST_REQUIRES_HVX; for (size_t elements = 129; elements < 256; elements++) { RAddStoreExpMinusMaxMicrokernelTester() .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128_acc3, nullptr); + .Test(xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128_acc2, nullptr); } } #endif // XNN_ENABLE_HVX && XNN_ARCH_HEXAGON @@ -5607,36 +1712,6 @@ TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_LUT64_P2_U1, elements_gt_1) { } } -TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_LUT64_P2_U2, elements_eq_2) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(2) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u2, nullptr); -} - -TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_LUT64_P2_U2, elements_div_2) { - for (size_t elements = 4; elements < 20; elements += 2) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u2, nullptr); - } -} - -TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_LUT64_P2_U2, elements_lt_2) { - for (size_t elements = 1; elements < 2; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u2, nullptr); - } -} - -TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_LUT64_P2_U2, elements_gt_2) { - for (size_t elements = 3; elements < 4; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u2, nullptr); - } -} - 
TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_LUT64_P2_U2_ACC2, elements_eq_2) { RAddStoreExpMinusMaxMicrokernelTester() .elements(2) @@ -5667,36 +1742,6 @@ TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_LUT64_P2_U2_ACC2, elements_gt_2) { } } -TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_LUT64_P2_U4, elements_eq_4) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(4) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u4, nullptr); -} - -TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_LUT64_P2_U4, elements_div_4) { - for (size_t elements = 8; elements < 40; elements += 4) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u4, nullptr); - } -} - -TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_LUT64_P2_U4, elements_lt_4) { - for (size_t elements = 1; elements < 4; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u4, nullptr); - } -} - -TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_LUT64_P2_U4, elements_gt_4) { - for (size_t elements = 5; elements < 8; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u4, nullptr); - } -} - TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_LUT64_P2_U4_ACC2, elements_eq_4) { RAddStoreExpMinusMaxMicrokernelTester() .elements(4) @@ -5771,36 +1816,6 @@ TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_P5_U1, elements_gt_1) { } } -TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_P5_U2, elements_eq_2) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(2) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u2, nullptr); -} - -TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_P5_U2, elements_div_2) { - for (size_t elements = 4; elements < 20; elements += 2) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u2, nullptr); - } -} 
- -TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_P5_U2, elements_lt_2) { - for (size_t elements = 1; elements < 2; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u2, nullptr); - } -} - -TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_P5_U2, elements_gt_2) { - for (size_t elements = 3; elements < 4; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u2, nullptr); - } -} - TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_P5_U2_ACC2, elements_eq_2) { RAddStoreExpMinusMaxMicrokernelTester() .elements(2) @@ -5831,36 +1846,6 @@ TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_P5_U2_ACC2, elements_gt_2) { } } -TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_P5_U4, elements_eq_4) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(4) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u4, nullptr); -} - -TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_P5_U4, elements_div_4) { - for (size_t elements = 8; elements < 40; elements += 4) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u4, nullptr); - } -} - -TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_P5_U4, elements_lt_4) { - for (size_t elements = 1; elements < 4; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u4, nullptr); - } -} - -TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_P5_U4, elements_gt_4) { - for (size_t elements = 5; elements < 8; elements++) { - RAddStoreExpMinusMaxMicrokernelTester() - .elements(elements) - .Test(xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u4, nullptr); - } -} - TEST(F32_RADDSTOREEXPMINUSMAX__SCALAR_RR2_P5_U4_ACC2, elements_eq_4) { RAddStoreExpMinusMaxMicrokernelTester() .elements(4) diff --git a/test/f32-raddstoreexpminusmax.yaml b/test/f32-raddstoreexpminusmax.yaml index 
1a2d0ba8c9d..0bc916ce37d 100644 --- a/test/f32-raddstoreexpminusmax.yaml +++ b/test/f32-raddstoreexpminusmax.yaml @@ -5,53 +5,21 @@ # ARM NEON - name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u8 - name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u8_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u12_acc3 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16 - name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc2 - name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u16_acc4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_u20_acc5 - name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u8 - name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u8_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u12_acc3 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16 - name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc2 - name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u16_acc4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_p5_u20_acc5 - name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u8 - name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u8_acc2 
-- name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u12_acc3 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16 - name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc2 - name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u16_acc4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_u20_acc5 - name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u8 - name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u8_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u12_acc3 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16 - name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc2 - name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u16_acc4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_p5_u20_acc5 # RISC-V Vector - name: xnn_f32_raddstoreexpminusmax_ukernel__rvv_rr2_p6_u2v @@ -59,132 +27,51 @@ # x86 SSE2 - name: xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8 - name: xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u8_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12 -- name: xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u12_acc3 -- 
name: xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16 - name: xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc2 - name: xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u16_acc4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20 -- name: xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_u20_acc5 - -# x86 AVX -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u8_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u12_acc3 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u16_acc4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx_rr2_p5_u20_acc5 # x86 AVX2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u8 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u16_acc2 - name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc2 - name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u32_acc4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u64_acc4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u72 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u72_acc3 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80_acc2 -- name: 
xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u80_acc5 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc3 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u96_acc6 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u8 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u16_acc2 - name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2 - name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u64_acc4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u72_acc3 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u80_acc5 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc3 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u96_acc6 -# x86 AVX512 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64 +# x86 AVX512F +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u16 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u32_acc2 - name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2 - name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128 -- name: 
xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u128_acc4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u144 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u144_acc3 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u160_acc5 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc3 -- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u192_acc6 # WAsm SIMD - name: xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8 - name: xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12 -- name: xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u12_acc3 -- name: xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16 - name: xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc2 - name: xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20 -- name: xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u20_acc5 # WAsm Relaxed SIMD - name: xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u8 - name: xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u8_acc2 -- name: 
xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12 -- name: xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u12_acc3 -- name: xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16 - name: xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc2 - name: xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u16_acc4 -- name: xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20 -- name: xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_u20_acc5 # Hexagon HVX - name: xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u32 -- name: xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u64 - name: xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u64_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96 -- name: xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u96_acc3 -- name: xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128 - name: xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128_acc3 - name: xnn_f32_raddstoreexpminusmax_ukernel__hvx_rr2_p5_u128_acc4 # Scalar - name: xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u1 -- name: xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u2 - name: xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u2_acc2 -- name: xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u4 - name: xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u4_acc2 - name: xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_u4_acc4 - name: xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u1 -- name: xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u2 - name: xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u2_acc2 -- name: 
xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u4 - name: xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u4_acc2 - name: xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u4_acc4 From d092e25cf808e44fa8f5b77040e51e2e21bff5c7 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Tue, 24 Sep 2024 12:10:05 -0700 Subject: [PATCH 45/50] Use header tables to generate enums instead of scripts This makes it impossible for the enum string data to get stale PiperOrigin-RevId: 678345795 --- BUILD.bazel | 20 +- scripts/generate-enums.sh | 12 -- src/enums/allocation-type.c | 26 +-- src/enums/allocation-type.yaml | 21 -- src/enums/microkernel-type.c | 33 +-- src/enums/microkernel-type.yaml | 35 --- src/enums/node-type.c | 87 +------- src/enums/node-type.yaml | 133 ------------ src/enums/operator-type.c | 184 +--------------- src/enums/operator-type.yaml | 324 ---------------------------- src/xnnpack/allocation-type-defs.h | 22 ++ src/xnnpack/allocation-type.h | 15 +- src/xnnpack/microkernel-type-defs.h | 29 +++ src/xnnpack/microkernel-type.h | 22 +- src/xnnpack/node-type-defs.h | 78 +++++++ src/xnnpack/node-type.h | 71 +----- src/xnnpack/operator-type-defs.h | 173 +++++++++++++++ src/xnnpack/operator-type.h | 166 +------------- tools/generate-enum.py | 192 ----------------- 19 files changed, 367 insertions(+), 1276 deletions(-) delete mode 100755 scripts/generate-enums.sh delete mode 100644 src/enums/allocation-type.yaml delete mode 100644 src/enums/microkernel-type.yaml delete mode 100644 src/enums/node-type.yaml delete mode 100644 src/enums/operator-type.yaml create mode 100644 src/xnnpack/allocation-type-defs.h create mode 100644 src/xnnpack/microkernel-type-defs.h create mode 100644 src/xnnpack/node-type-defs.h create mode 100644 src/xnnpack/operator-type-defs.h delete mode 100755 tools/generate-enum.py diff --git a/BUILD.bazel b/BUILD.bazel index 758501e673d..5e8c9a8f61e 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -610,7 +610,10 @@ xnnpack_cc_library( 
xnnpack_cc_library( name = "node_type", - hdrs = ["src/xnnpack/node-type.h"], + hdrs = [ + "src/xnnpack/node-type.h", + "src/xnnpack/node-type-defs.h", + ], deps = [ ":common", ], @@ -618,7 +621,10 @@ xnnpack_cc_library( xnnpack_cc_library( name = "allocation_type", - hdrs = ["src/xnnpack/allocation-type.h"], + hdrs = [ + "src/xnnpack/allocation-type.h", + "src/xnnpack/allocation-type-defs.h", + ], deps = [ ":common", ], @@ -626,7 +632,10 @@ xnnpack_cc_library( xnnpack_cc_library( name = "operator_type", - hdrs = ["src/xnnpack/operator-type.h"], + hdrs = [ + "src/xnnpack/operator-type.h", + "src/xnnpack/operator-type-defs.h", + ], deps = [ ":common", ], @@ -634,7 +643,10 @@ xnnpack_cc_library( xnnpack_cc_library( name = "microkernel_type", - hdrs = ["src/xnnpack/microkernel-type.h"], + hdrs = [ + "src/xnnpack/microkernel-type.h", + "src/xnnpack/microkernel-type-defs.h", + ], deps = [ ":common", ], diff --git a/scripts/generate-enums.sh b/scripts/generate-enums.sh deleted file mode 100755 index 21b30c5018b..00000000000 --- a/scripts/generate-enums.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/sh -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -tools/generate-enum.py --enum xnn_operator_type --spec src/enums/operator-type.yaml --output_src src/enums/operator-type.c --output_hdr src/xnnpack/operator-type.h & -tools/generate-enum.py --enum xnn_microkernel_type --spec src/enums/microkernel-type.yaml --output_src src/enums/microkernel-type.c --output_hdr src/xnnpack/microkernel-type.h & -tools/generate-enum.py --debug --enum xnn_node_type --spec src/enums/node-type.yaml --output_src src/enums/node-type.c --output_hdr src/xnnpack/node-type.h & -tools/generate-enum.py --debug --enum xnn_allocation_type --spec src/enums/allocation-type.yaml --output_src src/enums/allocation-type.c --output_hdr src/xnnpack/allocation-type.h & - -wait diff --git a/src/enums/allocation-type.c b/src/enums/allocation-type.c index 44079adc59b..0bde6a533c1 100644 --- a/src/enums/allocation-type.c +++ b/src/enums/allocation-type.c @@ -2,10 +2,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: src/enums/allocation-type.yaml -// Generator: tools/generate-enum.py #include #include @@ -13,21 +9,13 @@ #include "xnnpack/allocation-type.h" #if XNN_LOG_LEVEL > 0 -static const uint8_t offset[6] = { - 0, 8, 15, 25, 34, 45 -}; - -static const char data[] = - "invalid\0" - "static\0" - "workspace\0" - "external\0" - "persistent\0" - "dynamic"; - const char* xnn_allocation_type_to_string(enum xnn_allocation_type allocation_type) { - assert(allocation_type >= xnn_allocation_type_invalid); - assert(allocation_type <= xnn_allocation_type_dynamic); - return &data[offset[allocation_type]]; + switch(allocation_type) { + #define XNN_ENUM_ITEM(enum_name, enum_string) case enum_name: return enum_string; + #include "xnnpack/allocation-type-defs.h" + default: + XNN_UNREACHABLE; + #undef XNN_ENUM_ITEM + }; } #endif // XNN_LOG_LEVEL > 0 diff --git a/src/enums/allocation-type.yaml b/src/enums/allocation-type.yaml deleted file mode 100644 index 4b89e3f7250..00000000000 --- a/src/enums/allocation-type.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2024 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# LINT.IfChange - -- name: xnn_allocation_type_invalid - string: "invalid" -- name: xnn_allocation_type_static - string: "static" -- name: xnn_allocation_type_workspace - string: "workspace" -- name: xnn_allocation_type_external - string: "external" -- name: xnn_allocation_type_persistent - string: "persistent" -- name: xnn_allocation_type_dynamic - string: "dynamic" - -# LINT.ThenChange(allocation-type.c, ../xnnpack/allocation-type.h) diff --git a/src/enums/microkernel-type.c b/src/enums/microkernel-type.c index b4bce4421fa..65ffa2357f9 100644 --- a/src/enums/microkernel-type.c +++ b/src/enums/microkernel-type.c @@ -2,37 +2,18 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
-// -// Auto-generated file. Do not edit! -// Specification: src/enums/microkernel-type.yaml -// Generator: tools/generate-enum.py #include #include #include "xnnpack/microkernel-type.h" -static const uint8_t offset[13] = { - 0, 8, 24, 39, 46, 51, 74, 80, 85, 111, 116, 126, 136 -}; - -static const char data[] = - "Default\0" - "Average Pooling\0" - "Conv2D HWC2CHW\0" - "DWConv\0" - "GEMM\0" - "Global Average Pooling\0" - "IGEMM\0" - "Mean\0" - "Pixelwise Average Pooling\0" - "SPMM\0" - "Subconv2D\0" - "Transpose\0" - "VMulCAddC"; - const char* xnn_microkernel_type_to_string(enum xnn_microkernel_type microkernel_type) { - assert(microkernel_type >= xnn_microkernel_type_default); - assert(microkernel_type <= xnn_microkernel_type_vmulcaddc); - return &data[offset[microkernel_type]]; + switch(microkernel_type) { + #define XNN_ENUM_ITEM(enum_name, enum_string) case enum_name: return enum_string; + #include "xnnpack/microkernel-type-defs.h" + default: + XNN_UNREACHABLE; + #undef XNN_ENUM_ITEM + }; } diff --git a/src/enums/microkernel-type.yaml b/src/enums/microkernel-type.yaml deleted file mode 100644 index 6f623b5294c..00000000000 --- a/src/enums/microkernel-type.yaml +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# LINT.IfChange - -- name: xnn_microkernel_type_default - string: "Default" -- name: xnn_microkernel_type_average_pooling - string: "Average Pooling" -- name: xnn_microkernel_type_conv2d_hwc2chw - string: "Conv2D HWC2CHW" -- name: xnn_microkernel_type_dwconv - string: "DWConv" -- name: xnn_microkernel_type_gemm - string: "GEMM" -- name: xnn_microkernel_type_global_average_pooling - string: "Global Average Pooling" -- name: xnn_microkernel_type_igemm - string: "IGEMM" -- name: xnn_microkernel_type_mean - string: "Mean" -- name: xnn_microkernel_type_pixelwise_average_pooling - string: "Pixelwise Average Pooling" -- name: xnn_microkernel_type_spmm - string: "SPMM" -- name: xnn_microkernel_type_subconv2d - string: "Subconv2D" -- name: xnn_microkernel_type_transpose - string: "Transpose" -- name: xnn_microkernel_type_vmulcaddc - string: "VMulCAddC" - -# LINT.ThenChange(microkernel-type.c, ../xnnpack/microkernel-type.h) diff --git a/src/enums/node-type.c b/src/enums/node-type.c index 827af470abd..195f0394380 100644 --- a/src/enums/node-type.c +++ b/src/enums/node-type.c @@ -2,90 +2,19 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: src/enums/node-type.yaml -// Generator: tools/generate-enum.py #include -#include #include "xnnpack/node-type.h" #if XNN_LOG_LEVEL > 0 -static const uint16_t offset[62] = { - 0, 8, 12, 17, 35, 54, 71, 93, 101, 107, 120, 133, 146, 159, 167, 182, 187, 197, 214, 232, 257, 264, 268, 272, 284, - 296, 308, 314, 330, 353, 358, 384, 410, 432, 454, 464, 468, 479, 494, 503, 512, 522, 529, 535, 558, 563, 592, 600, - 608, 626, 633, 645, 664, 684, 703, 715, 730, 756, 769, 786, 795, 800 -}; - -static const char data[] = - "Invalid\0" - "Abs\0" - "Add2\0" - "ArgMax Pooling 2D\0" - "Average Pooling 2D\0" - "Bankers Rounding\0" - "Batch Matrix Multiply\0" - "Ceiling\0" - "Clamp\0" - "Concatenate2\0" - "Concatenate3\0" - "Concatenate4\0" - "Concatenate5\0" - "Convert\0" - "Convolution 2D\0" - "Copy\0" - "Copy Sign\0" - "Deconvolution 2D\0" - "Depth To Space 2D\0" - "Depthwise Convolution 2D\0" - "Divide\0" - "ELU\0" - "Exp\0" - "Even Split2\0" - "Even Split3\0" - "Even Split4\0" - "Floor\0" - "Fully Connected\0" - "Fully Connected Sparse\0" - "GELU\0" - "Global Average Pooling 1D\0" - "Global Average Pooling 2D\0" - "Global Sum Pooling 1D\0" - "Global Sum Pooling 2D\0" - "HardSwish\0" - "Log\0" - "Leaky ReLU\0" - "Max Pooling 2D\0" - "Maximum2\0" - "Minimum2\0" - "Multiply2\0" - "Negate\0" - "PReLU\0" - "Reciprocal Square Root\0" - "RoPE\0" - "Scaled Dot Product Attention\0" - "Sigmoid\0" - "Softmax\0" - "Space To Depth 2D\0" - "Square\0" - "Square Root\0" - "Squared Difference\0" - "Static Constant Pad\0" - "Static Expand Dims\0" - "Static Mean\0" - "Static Reshape\0" - "Static Resize Bilinear 2D\0" - "Static Slice\0" - "Static Transpose\0" - "Subtract\0" - "Tanh\0" - "Unpooling 2D"; - const char* xnn_node_type_to_string(enum xnn_node_type node_type) { - assert(node_type >= xnn_node_type_invalid); - assert(node_type <= xnn_node_type_unpooling_2d); - return &data[offset[node_type]]; + switch(node_type) { + #define XNN_ENUM_ITEM(enum_name, enum_string) case 
enum_name: return enum_string; + #include "xnnpack/node-type-defs.h" + default: + XNN_UNREACHABLE; + #undef XNN_ENUM_ITEM + }; } -#endif // XNN_LOG_LEVEL > 0 +#endif // XNN_LOG_LEVEL > 0 \ No newline at end of file diff --git a/src/enums/node-type.yaml b/src/enums/node-type.yaml deleted file mode 100644 index c1f4cc1c670..00000000000 --- a/src/enums/node-type.yaml +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright 2023 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# LINT.IfChange - -- name: xnn_node_type_invalid - string: "Invalid" -- name: xnn_node_type_abs - string: "Abs" -- name: xnn_node_type_add2 - string: "Add2" -- name: xnn_node_type_argmax_pooling_2d - string: "ArgMax Pooling 2D" -- name: xnn_node_type_average_pooling_2d - string: "Average Pooling 2D" -- name: xnn_node_type_bankers_rounding - string: "Bankers Rounding" -- name: xnn_node_type_batch_matrix_multiply - string: "Batch Matrix Multiply" -- name: xnn_node_type_ceiling - string: "Ceiling" -- name: xnn_node_type_clamp - string: "Clamp" -- name: xnn_node_type_concatenate2 - string: "Concatenate2" -- name: xnn_node_type_concatenate3 - string: "Concatenate3" -- name: xnn_node_type_concatenate4 - string: "Concatenate4" -- name: xnn_node_type_concatenate5 - string: "Concatenate5" -- name: xnn_node_type_convert - string: "Convert" -- name: xnn_node_type_convolution_2d - string: "Convolution 2D" -- name: xnn_node_type_copy - string: "Copy" -- name: xnn_node_type_copysign - string: "Copy Sign" -- name: xnn_node_type_deconvolution_2d - string: "Deconvolution 2D" -- name: xnn_node_type_depth_to_space_2d - string: "Depth To Space 2D" -- name: xnn_node_type_depthwise_convolution_2d - string: "Depthwise Convolution 2D" -- name: xnn_node_type_divide - string: "Divide" -- name: xnn_node_type_elu - string: "ELU" -- name: xnn_node_type_exp - string: "Exp" -- name: xnn_node_type_even_split2 - string: "Even Split2" -- name: 
xnn_node_type_even_split3 - string: "Even Split3" -- name: xnn_node_type_even_split4 - string: "Even Split4" -- name: xnn_node_type_floor - string: "Floor" -- name: xnn_node_type_fully_connected - string: "Fully Connected" -- name: xnn_node_type_fully_connected_sparse - string: "Fully Connected Sparse" -- name: xnn_node_type_gelu - string: "GELU" -- name: xnn_node_type_global_average_pooling_1d - string: "Global Average Pooling 1D" -- name: xnn_node_type_global_average_pooling_2d - string: "Global Average Pooling 2D" -- name: xnn_node_type_global_sum_pooling_1d - string: "Global Sum Pooling 1D" -- name: xnn_node_type_global_sum_pooling_2d - string: "Global Sum Pooling 2D" -- name: xnn_node_type_hardswish - string: "HardSwish" -- name: xnn_node_type_log - string: "Log" -- name: xnn_node_type_leaky_relu - string: "Leaky ReLU" -- name: xnn_node_type_max_pooling_2d - string: "Max Pooling 2D" -- name: xnn_node_type_maximum2 - string: "Maximum2" -- name: xnn_node_type_minimum2 - string: "Minimum2" -- name: xnn_node_type_multiply2 - string: "Multiply2" -- name: xnn_node_type_negate - string: "Negate" -- name: xnn_node_type_prelu - string: "PReLU" -- name: xnn_node_type_reciprocal_square_root - string: "Reciprocal Square Root" -- name: xnn_node_type_rope - string: "RoPE" -- name: xnn_node_type_scaled_dot_product_attention - string: "Scaled Dot Product Attention" -- name: xnn_node_type_sigmoid - string: "Sigmoid" -- name: xnn_node_type_softmax - string: "Softmax" -- name: xnn_node_type_space_to_depth_2d - string: "Space To Depth 2D" -- name: xnn_node_type_square - string: "Square" -- name: xnn_node_type_square_root - string: "Square Root" -- name: xnn_node_type_squared_difference - string: "Squared Difference" -- name: xnn_node_type_static_constant_pad - string: "Static Constant Pad" -- name: xnn_node_type_static_expand_dims - string: "Static Expand Dims" -- name: xnn_node_type_static_mean - string: "Static Mean" -- name: xnn_node_type_static_reshape - string: "Static 
Reshape" -- name: xnn_node_type_static_resize_bilinear_2d - string: "Static Resize Bilinear 2D" -- name: xnn_node_type_static_slice - string: "Static Slice" -- name: xnn_node_type_static_transpose - string: "Static Transpose" -- name: xnn_node_type_subtract - string: "Subtract" -- name: xnn_node_type_tanh - string: "Tanh" -- name: xnn_node_type_unpooling_2d - string: "Unpooling 2D" - -# LINT.ThenChange(node-type.c, ../xnnpack/node-type.h) diff --git a/src/enums/operator-type.c b/src/enums/operator-type.c index 664fdb33944..aa8749d2bbb 100644 --- a/src/enums/operator-type.c +++ b/src/enums/operator-type.c @@ -2,188 +2,18 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Specification: src/enums/operator-type.yaml -// Generator: tools/generate-enum.py #include -#include #include "xnnpack/operator-type.h" -static const uint16_t offset[157] = { - 0, 8, 22, 36, 45, 72, 100, 128, 156, 183, 210, 242, 274, 317, 335, 353, 378, 404, 420, 436, 451, 466, 488, 511, 534, - 557, 580, 603, 626, 649, 672, 695, 713, 736, 759, 783, 801, 824, 848, 872, 896, 920, 955, 990, 1014, 1038, 1062, 1076, - 1091, 1106, 1121, 1147, 1173, 1210, 1236, 1266, 1292, 1324, 1356, 1382, 1409, 1436, 1448, 1482, 1516, 1530, 1544, - 1558, 1572, 1588, 1604, 1630, 1656, 1688, 1720, 1757, 1794, 1831, 1868, 1905, 1942, 1979, 2005, 2037, 2063, 2078, - 2112, 2146, 2180, 2214, 2248, 2282, 2312, 2342, 2362, 2382, 2403, 2424, 2445, 2466, 2480, 2504, 2528, 2551, 2574, - 2587, 2602, 2617, 2632, 2647, 2660, 2674, 2691, 2708, 2724, 2740, 2773, 2806, 2834, 2862, 2890, 2918, 2945, 2972, - 2989, 3006, 3047, 3088, 3106, 3124, 3142, 3160, 3175, 3191, 3207, 3225, 3243, 3261, 3287, 3314, 3341, 3358, 3375, - 3397, 3419, 3443, 3457, 3472, 3487, 3502, 3517, 3536, 3556, 3576, 3596, 3617, 3638 -}; - -static const char data[] = - "Invalid\0" - "Abs (NC, F16)\0" - "Abs (NC, F32)\0" - "Add (ND)\0" - 
"ArgMax Pooling (NHWC, F32)\0" - "Average Pooling (NHWC, F16)\0" - "Average Pooling (NHWC, F32)\0" - "Average Pooling (NHWC, QU8)\0" - "Bankers Rounding (NC, F16)\0" - "Bankers Rounding (NC, F32)\0" - "Batch Matrix Multiply (NC, F16)\0" - "Batch Matrix Multiply (NC, F32)\0" - "Batch Matrix Multiply (NC, QD8, F32, QC8W)\0" - "Ceiling (NC, F16)\0" - "Ceiling (NC, F32)\0" - "Channel Shuffle (NC, X8)\0" - "Channel Shuffle (NC, X32)\0" - "Clamp (NC, F16)\0" - "Clamp (NC, F32)\0" - "Clamp (NC, S8)\0" - "Clamp (NC, U8)\0" - "Constant Pad (ND, X8)\0" - "Constant Pad (ND, X16)\0" - "Constant Pad (ND, X32)\0" - "Convert (NC, F16, F32)\0" - "Convert (NC, F16, QD8)\0" - "Convert (NC, F32, F16)\0" - "Convert (NC, F32, QD8)\0" - "Convert (NC, F32, QP8)\0" - "Convert (NC, F32, QS8)\0" - "Convert (NC, F32, QU8)\0" - "Convert (NC, QS8)\0" - "Convert (NC, QS8, F16)\0" - "Convert (NC, QS8, F32)\0" - "Convert (NC, QS16, QS8)\0" - "Convert (NC, QU8)\0" - "Convert (NC, QU8, F32)\0" - "Convolution (NCHW, F16)\0" - "Convolution (NCHW, F32)\0" - "Convolution (NHWC, F16)\0" - "Convolution (NHWC, F32)\0" - "Convolution (NHWC, QD8, F16, QC8W)\0" - "Convolution (NHWC, QD8, F32, QC8W)\0" - "Convolution (NHWC, QC8)\0" - "Convolution (NHWC, QS8)\0" - "Convolution (NHWC, QU8)\0" - "Copy (NC, X8)\0" - "Copy (NC, X16)\0" - "Copy (NC, X32)\0" - "Copy Sign (ND)\0" - "Deconvolution (NHWC, F16)\0" - "Deconvolution (NHWC, F32)\0" - "Deconvolution (NHWC, QD8, F32, QC8W)\0" - "Deconvolution (NHWC, QS8)\0" - "Deconvolution (NC, QS8, QC8W)\0" - "Deconvolution (NHWC, QU8)\0" - "Depth To Space (NCHW2NHWC, X16)\0" - "Depth To Space (NCHW2NHWC, X32)\0" - "Depth To Space (NHWC, X8)\0" - "Depth To Space (NHWC, X16)\0" - "Depth To Space (NHWC, X32)\0" - "Divide (ND)\0" - "Dynamic Fully Connected (NC, F16)\0" - "Dynamic Fully Connected (NC, F32)\0" - "ELU (NC, F16)\0" - "ELU (NC, F32)\0" - "ELU (NC, QS8)\0" - "Exp (NC, F32)\0" - "Floor (NC, F16)\0" - "Floor (NC, F32)\0" - "Fully Connected (NC, F16)\0" - "Fully 
Connected (NC, F32)\0" - "Fully Connected (NC, F32, QC4W)\0" - "Fully Connected (NC, F32, QC8W)\0" - "Fully Connected (NC, QD8, F16, QB4W)\0" - "Fully Connected (NC, QD8, F16, QC4W)\0" - "Fully Connected (NC, QD8, F16, QC8W)\0" - "Fully Connected (NC, QD8, F32, QB4W)\0" - "Fully Connected (NC, QD8, F32, QC4W)\0" - "Fully Connected (NC, QD8, F32, QC8W)\0" - "Fully Connected (NC, QP8, F32, QC4W)\0" - "Fully Connected (NC, QS8)\0" - "Fully Connected (NC, QS8, QC8W)\0" - "Fully Connected (NC, QU8)\0" - "GELU (NC, F32)\0" - "Global Average Pooling (NCW, F16)\0" - "Global Average Pooling (NCW, F32)\0" - "Global Average Pooling (NWC, F16)\0" - "Global Average Pooling (NWC, F32)\0" - "Global Average Pooling (NWC, QS8)\0" - "Global Average Pooling (NWC, QU8)\0" - "Global Sum Pooling (NWC, F16)\0" - "Global Sum Pooling (NWC, F32)\0" - "HardSwish (NC, F16)\0" - "HardSwish (NC, F32)\0" - "Leaky ReLU (NC, F16)\0" - "Leaky ReLU (NC, F32)\0" - "Leaky ReLU (NC, QS8)\0" - "Leaky ReLU (NC, QU8)\0" - "Log (NC, F32)\0" - "Max Pooling (NHWC, F16)\0" - "Max Pooling (NHWC, F32)\0" - "Max Pooling (NHWC, S8)\0" - "Max Pooling (NHWC, U8)\0" - "Maximum (ND)\0" - "Mean (ND, F16)\0" - "Mean (ND, F32)\0" - "Mean (ND, QS8)\0" - "Mean (ND, QU8)\0" - "Minimum (ND)\0" - "Multiply (ND)\0" - "Negate (NC, F16)\0" - "Negate (NC, F32)\0" - "PReLU (NC, F16)\0" - "PReLU (NC, F32)\0" - "Reciprocal Square Root (NC, F16)\0" - "Reciprocal Square Root (NC, F32)\0" - "Resize Bilinear (NCHW, F16)\0" - "Resize Bilinear (NCHW, F32)\0" - "Resize Bilinear (NHWC, F16)\0" - "Resize Bilinear (NHWC, F32)\0" - "Resize Bilinear (NHWC, S8)\0" - "Resize Bilinear (NHWC, U8)\0" - "RoPE (NTHC, F16)\0" - "RoPE (NTHC, F32)\0" - "Scaled Dot-Product Attention (NHTC, F16)\0" - "Scaled Dot-Product Attention (NHTC, F32)\0" - "Sigmoid (NC, F16)\0" - "Sigmoid (NC, F32)\0" - "Sigmoid (NC, QS8)\0" - "Sigmoid (NC, QU8)\0" - "Slice (ND, X8)\0" - "Slice (ND, X16)\0" - "Slice (ND, X32)\0" - "Softmax (NC, F16)\0" - "Softmax (NC, F32)\0" - 
"Softmax (NC, QU8)\0" - "Space To Depth (NHWC, X8)\0" - "Space To Depth (NHWC, X16)\0" - "Space To Depth (NHWC, X32)\0" - "Square (NC, F16)\0" - "Square (NC, F32)\0" - "Square Root (NC, F16)\0" - "Square Root (NC, F32)\0" - "Squared Difference (NC)\0" - "Subtract (ND)\0" - "Tanh (NC, F16)\0" - "Tanh (NC, F32)\0" - "Tanh (NC, QS8)\0" - "Tanh (NC, QU8)\0" - "Transpose (ND, X8)\0" - "Transpose (ND, X16)\0" - "Transpose (ND, X32)\0" - "Transpose (ND, X64)\0" - "Truncation (NC, F16)\0" - "Truncation (NC, F32)\0" - "Unpooling (NHWC, X32)"; const char* xnn_operator_type_to_string(enum xnn_operator_type operator_type) { - assert(operator_type >= xnn_operator_type_invalid); - assert(operator_type <= xnn_operator_type_unpooling_nhwc_x32); - return &data[offset[operator_type]]; + switch(operator_type) { + #define XNN_ENUM_ITEM(enum_name, enum_string) case enum_name: return enum_string; + #include "xnnpack/operator-type-defs.h" + default: + XNN_UNREACHABLE; + #undef XNN_ENUM_ITEM + }; } diff --git a/src/enums/operator-type.yaml b/src/enums/operator-type.yaml deleted file mode 100644 index ebb35073453..00000000000 --- a/src/enums/operator-type.yaml +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# LINT.IfChange - -- name: xnn_operator_type_invalid - string: "Invalid" -- name: xnn_operator_type_abs_nc_f16 - string: "Abs (NC, F16)" -- name: xnn_operator_type_abs_nc_f32 - string: "Abs (NC, F32)" -- name: xnn_operator_type_add - string: "Add (ND)" -- name: xnn_operator_type_argmax_pooling_nhwc_f32 - string: "ArgMax Pooling (NHWC, F32)" -- name: xnn_operator_type_average_pooling_nhwc_f16 - string: "Average Pooling (NHWC, F16)" -- name: xnn_operator_type_average_pooling_nhwc_f32 - string: "Average Pooling (NHWC, F32)" -- name: xnn_operator_type_average_pooling_nhwc_qu8 - string: "Average Pooling (NHWC, QU8)" -- name: xnn_operator_type_bankers_rounding_nc_f16 - string: "Bankers Rounding (NC, F16)" -- name: xnn_operator_type_bankers_rounding_nc_f32 - string: "Bankers Rounding (NC, F32)" -- name: xnn_operator_type_batch_matrix_multiply_nc_f16 - string: "Batch Matrix Multiply (NC, F16)" -- name: xnn_operator_type_batch_matrix_multiply_nc_f32 - string: "Batch Matrix Multiply (NC, F32)" -- name: xnn_operator_type_batch_matrix_multiply_nc_qd8_f32_qc8w - string: "Batch Matrix Multiply (NC, QD8, F32, QC8W)" -- name: xnn_operator_type_ceiling_nc_f16 - string: "Ceiling (NC, F16)" -- name: xnn_operator_type_ceiling_nc_f32 - string: "Ceiling (NC, F32)" -- name: xnn_operator_type_channel_shuffle_nc_x8 - string: "Channel Shuffle (NC, X8)" -- name: xnn_operator_type_channel_shuffle_nc_x32 - string: "Channel Shuffle (NC, X32)" -- name: xnn_operator_type_clamp_nc_f16 - string: "Clamp (NC, F16)" -- name: xnn_operator_type_clamp_nc_f32 - string: "Clamp (NC, F32)" -- name: xnn_operator_type_clamp_nc_s8 - string: "Clamp (NC, S8)" -- name: xnn_operator_type_clamp_nc_u8 - string: "Clamp (NC, U8)" -- name: xnn_operator_type_constant_pad_nd_x8 - string: "Constant Pad (ND, X8)" -- name: xnn_operator_type_constant_pad_nd_x16 - string: "Constant Pad (ND, X16)" -- name: xnn_operator_type_constant_pad_nd_x32 - string: "Constant Pad (ND, X32)" -- name: xnn_operator_type_convert_nc_f16_f32 - 
string: "Convert (NC, F16, F32)" -- name: xnn_operator_type_convert_nc_f16_qd8 - string: "Convert (NC, F16, QD8)" -- name: xnn_operator_type_convert_nc_f32_f16 - string: "Convert (NC, F32, F16)" -- name: xnn_operator_type_convert_nc_f32_qd8 - string: "Convert (NC, F32, QD8)" -- name: xnn_operator_type_convert_nc_f32_qp8 - string: "Convert (NC, F32, QP8)" -- name: xnn_operator_type_convert_nc_f32_qs8 - string: "Convert (NC, F32, QS8)" -- name: xnn_operator_type_convert_nc_f32_qu8 - string: "Convert (NC, F32, QU8)" -- name: xnn_operator_type_convert_nc_qs8 - string: "Convert (NC, QS8)" -- name: xnn_operator_type_convert_nc_qs8_f16 - string: "Convert (NC, QS8, F16)" -- name: xnn_operator_type_convert_nc_qs8_f32 - string: "Convert (NC, QS8, F32)" -- name: xnn_operator_type_convert_nc_qs16_qs8 - string: "Convert (NC, QS16, QS8)" -- name: xnn_operator_type_convert_nc_qu8 - string: "Convert (NC, QU8)" -- name: xnn_operator_type_convert_nc_qu8_f32 - string: "Convert (NC, QU8, F32)" -- name: xnn_operator_type_convolution_nchw_f16 - string: "Convolution (NCHW, F16)" -- name: xnn_operator_type_convolution_nchw_f32 - string: "Convolution (NCHW, F32)" -- name: xnn_operator_type_convolution_nhwc_f16 - string: "Convolution (NHWC, F16)" -- name: xnn_operator_type_convolution_nhwc_f32 - string: "Convolution (NHWC, F32)" -- name: xnn_operator_type_convolution_nhwc_qd8_f16_qc8w - string: "Convolution (NHWC, QD8, F16, QC8W)" -- name: xnn_operator_type_convolution_nhwc_qd8_f32_qc8w - string: "Convolution (NHWC, QD8, F32, QC8W)" -- name: xnn_operator_type_convolution_nhwc_qc8 - string: "Convolution (NHWC, QC8)" -- name: xnn_operator_type_convolution_nhwc_qs8 - string: "Convolution (NHWC, QS8)" -- name: xnn_operator_type_convolution_nhwc_qu8 - string: "Convolution (NHWC, QU8)" -- name: xnn_operator_type_copy_nc_x8 - string: "Copy (NC, X8)" -- name: xnn_operator_type_copy_nc_x16 - string: "Copy (NC, X16)" -- name: xnn_operator_type_copy_nc_x32 - string: "Copy (NC, X32)" -- name: 
xnn_operator_type_copysign - string: "Copy Sign (ND)" -- name: xnn_operator_type_deconvolution_nhwc_f16 - string: "Deconvolution (NHWC, F16)" -- name: xnn_operator_type_deconvolution_nhwc_f32 - string: "Deconvolution (NHWC, F32)" -- name: xnn_operator_type_deconvolution_nhwc_qd8_f32_qc8w - string: "Deconvolution (NHWC, QD8, F32, QC8W)" -- name: xnn_operator_type_deconvolution_nhwc_qs8 - string: "Deconvolution (NHWC, QS8)" -- name: xnn_operator_type_deconvolution_nhwc_qs8_qc8w - string: "Deconvolution (NC, QS8, QC8W)" -- name: xnn_operator_type_deconvolution_nhwc_qu8 - string: "Deconvolution (NHWC, QU8)" -- name: xnn_operator_type_depth_to_space_nchw2nhwc_x16 - string: "Depth To Space (NCHW2NHWC, X16)" -- name: xnn_operator_type_depth_to_space_nchw2nhwc_x32 - string: "Depth To Space (NCHW2NHWC, X32)" -- name: xnn_operator_type_depth_to_space_nhwc_x8 - string: "Depth To Space (NHWC, X8)" -- name: xnn_operator_type_depth_to_space_nhwc_x16 - string: "Depth To Space (NHWC, X16)" -- name: xnn_operator_type_depth_to_space_nhwc_x32 - string: "Depth To Space (NHWC, X32)" -- name: xnn_operator_type_divide - string: "Divide (ND)" -- name: xnn_operator_type_dynamic_fully_connected_nc_f16 - string: "Dynamic Fully Connected (NC, F16)" -- name: xnn_operator_type_dynamic_fully_connected_nc_f32 - string: "Dynamic Fully Connected (NC, F32)" -- name: xnn_operator_type_elu_nc_f16 - string: "ELU (NC, F16)" -- name: xnn_operator_type_elu_nc_f32 - string: "ELU (NC, F32)" -- name: xnn_operator_type_elu_nc_qs8 - string: "ELU (NC, QS8)" -- name: xnn_operator_type_exp_nc_f32 - string: "Exp (NC, F32)" -- name: xnn_operator_type_floor_nc_f16 - string: "Floor (NC, F16)" -- name: xnn_operator_type_floor_nc_f32 - string: "Floor (NC, F32)" -- name: xnn_operator_type_fully_connected_nc_f16 - string: "Fully Connected (NC, F16)" -- name: xnn_operator_type_fully_connected_nc_f32 - string: "Fully Connected (NC, F32)" -- name: xnn_operator_type_fully_connected_nc_f32_qc4w - string: "Fully Connected (NC, 
F32, QC4W)" -- name: xnn_operator_type_fully_connected_nc_f32_qc8w - string: "Fully Connected (NC, F32, QC8W)" -- name: xnn_operator_type_fully_connected_nc_qd8_f16_qb4w - string: "Fully Connected (NC, QD8, F16, QB4W)" -- name: xnn_operator_type_fully_connected_nc_qd8_f16_qc4w - string: "Fully Connected (NC, QD8, F16, QC4W)" -- name: xnn_operator_type_fully_connected_nc_qd8_f16_qc8w - string: "Fully Connected (NC, QD8, F16, QC8W)" -- name: xnn_operator_type_fully_connected_nc_qd8_f32_qb4w - string: "Fully Connected (NC, QD8, F32, QB4W)" -- name: xnn_operator_type_fully_connected_nc_qd8_f32_qc4w - string: "Fully Connected (NC, QD8, F32, QC4W)" -- name: xnn_operator_type_fully_connected_nc_qd8_f32_qc8w - string: "Fully Connected (NC, QD8, F32, QC8W)" -- name: xnn_operator_type_fully_connected_nc_qp8_f32_qc4w - string: "Fully Connected (NC, QP8, F32, QC4W)" -- name: xnn_operator_type_fully_connected_nc_qs8 - string: "Fully Connected (NC, QS8)" -- name: xnn_operator_type_fully_connected_nc_qs8_qc8w - string: "Fully Connected (NC, QS8, QC8W)" -- name: xnn_operator_type_fully_connected_nc_qu8 - string: "Fully Connected (NC, QU8)" -- name: xnn_operator_type_gelu_nc_f32 - string: "GELU (NC, F32)" -- name: xnn_operator_type_global_average_pooling_ncw_f16 - string: "Global Average Pooling (NCW, F16)" -- name: xnn_operator_type_global_average_pooling_ncw_f32 - string: "Global Average Pooling (NCW, F32)" -- name: xnn_operator_type_global_average_pooling_nwc_f16 - string: "Global Average Pooling (NWC, F16)" -- name: xnn_operator_type_global_average_pooling_nwc_f32 - string: "Global Average Pooling (NWC, F32)" -- name: xnn_operator_type_global_average_pooling_nwc_qs8 - string: "Global Average Pooling (NWC, QS8)" -- name: xnn_operator_type_global_average_pooling_nwc_qu8 - string: "Global Average Pooling (NWC, QU8)" -- name: xnn_operator_type_global_sum_pooling_nwc_f16 - string: "Global Sum Pooling (NWC, F16)" -- name: xnn_operator_type_global_sum_pooling_nwc_f32 - string: "Global 
Sum Pooling (NWC, F32)" -- name: xnn_operator_type_hardswish_nc_f16 - string: "HardSwish (NC, F16)" -- name: xnn_operator_type_hardswish_nc_f32 - string: "HardSwish (NC, F32)" -- name: xnn_operator_type_leaky_relu_nc_f16 - string: "Leaky ReLU (NC, F16)" -- name: xnn_operator_type_leaky_relu_nc_f32 - string: "Leaky ReLU (NC, F32)" -- name: xnn_operator_type_leaky_relu_nc_qs8 - string: "Leaky ReLU (NC, QS8)" -- name: xnn_operator_type_leaky_relu_nc_qu8 - string: "Leaky ReLU (NC, QU8)" -- name: xnn_operator_type_log_nc_f32 - string: "Log (NC, F32)" -- name: xnn_operator_type_max_pooling_nhwc_f16 - string: "Max Pooling (NHWC, F16)" -- name: xnn_operator_type_max_pooling_nhwc_f32 - string: "Max Pooling (NHWC, F32)" -- name: xnn_operator_type_max_pooling_nhwc_s8 - string: "Max Pooling (NHWC, S8)" -- name: xnn_operator_type_max_pooling_nhwc_u8 - string: "Max Pooling (NHWC, U8)" -- name: xnn_operator_type_maximum - string: "Maximum (ND)" -- name: xnn_operator_type_mean_nd_f16 - string: "Mean (ND, F16)" -- name: xnn_operator_type_mean_nd_f32 - string: "Mean (ND, F32)" -- name: xnn_operator_type_mean_nd_qs8 - string: "Mean (ND, QS8)" -- name: xnn_operator_type_mean_nd_qu8 - string: "Mean (ND, QU8)" -- name: xnn_operator_type_minimum - string: "Minimum (ND)" -- name: xnn_operator_type_multiply - string: "Multiply (ND)" -- name: xnn_operator_type_negate_nc_f16 - string: "Negate (NC, F16)" -- name: xnn_operator_type_negate_nc_f32 - string: "Negate (NC, F32)" -- name: xnn_operator_type_prelu_nc_f16 - string: "PReLU (NC, F16)" -- name: xnn_operator_type_prelu_nc_f32 - string: "PReLU (NC, F32)" -- name: xnn_operator_type_reciprocal_square_root_nc_f16 - string: "Reciprocal Square Root (NC, F16)" -- name: xnn_operator_type_reciprocal_square_root_nc_f32 - string: "Reciprocal Square Root (NC, F32)" -- name: xnn_operator_type_resize_bilinear_nchw_f16 - string: "Resize Bilinear (NCHW, F16)" -- name: xnn_operator_type_resize_bilinear_nchw_f32 - string: "Resize Bilinear (NCHW, F32)" -- 
name: xnn_operator_type_resize_bilinear_nhwc_f16 - string: "Resize Bilinear (NHWC, F16)" -- name: xnn_operator_type_resize_bilinear_nhwc_f32 - string: "Resize Bilinear (NHWC, F32)" -- name: xnn_operator_type_resize_bilinear_nhwc_s8 - string: "Resize Bilinear (NHWC, S8)" -- name: xnn_operator_type_resize_bilinear_nhwc_u8 - string: "Resize Bilinear (NHWC, U8)" -- name: xnn_operator_type_rope_nthc_f16 - string: "RoPE (NTHC, F16)" -- name: xnn_operator_type_rope_nthc_f32 - string: "RoPE (NTHC, F32)" -- name: xnn_operator_type_scaled_dot_product_attention_nhtc_f16 - string: "Scaled Dot-Product Attention (NHTC, F16)" -- name: xnn_operator_type_scaled_dot_product_attention_nhtc_f32 - string: "Scaled Dot-Product Attention (NHTC, F32)" -- name: xnn_operator_type_sigmoid_nc_f16 - string: "Sigmoid (NC, F16)" -- name: xnn_operator_type_sigmoid_nc_f32 - string: "Sigmoid (NC, F32)" -- name: xnn_operator_type_sigmoid_nc_qs8 - string: "Sigmoid (NC, QS8)" -- name: xnn_operator_type_sigmoid_nc_qu8 - string: "Sigmoid (NC, QU8)" -- name: xnn_operator_type_slice_nd_x8 - string: "Slice (ND, X8)" -- name: xnn_operator_type_slice_nd_x16 - string: "Slice (ND, X16)" -- name: xnn_operator_type_slice_nd_x32 - string: "Slice (ND, X32)" -- name: xnn_operator_type_softmax_nc_f16 - string: "Softmax (NC, F16)" -- name: xnn_operator_type_softmax_nc_f32 - string: "Softmax (NC, F32)" -- name: xnn_operator_type_softmax_nc_qu8 - string: "Softmax (NC, QU8)" -- name: xnn_operator_type_space_to_depth_nhwc_x8 - string: "Space To Depth (NHWC, X8)" -- name: xnn_operator_type_space_to_depth_nhwc_x16 - string: "Space To Depth (NHWC, X16)" -- name: xnn_operator_type_space_to_depth_nhwc_x32 - string: "Space To Depth (NHWC, X32)" -- name: xnn_operator_type_square_nc_f16 - string: "Square (NC, F16)" -- name: xnn_operator_type_square_nc_f32 - string: "Square (NC, F32)" -- name: xnn_operator_type_square_root_nc_f16 - string: "Square Root (NC, F16)" -- name: xnn_operator_type_square_root_nc_f32 - string: "Square Root 
(NC, F32)" -- name: xnn_operator_type_squared_difference - string: "Squared Difference (NC)" -- name: xnn_operator_type_subtract - string: "Subtract (ND)" -- name: xnn_operator_type_tanh_nc_f16 - string: "Tanh (NC, F16)" -- name: xnn_operator_type_tanh_nc_f32 - string: "Tanh (NC, F32)" -- name: xnn_operator_type_tanh_nc_qs8 - string: "Tanh (NC, QS8)" -- name: xnn_operator_type_tanh_nc_qu8 - string: "Tanh (NC, QU8)" -- name: xnn_operator_type_transpose_nd_x8 - string: "Transpose (ND, X8)" -- name: xnn_operator_type_transpose_nd_x16 - string: "Transpose (ND, X16)" -- name: xnn_operator_type_transpose_nd_x32 - string: "Transpose (ND, X32)" -- name: xnn_operator_type_transpose_nd_x64 - string: "Transpose (ND, X64)" -- name: xnn_operator_type_truncation_nc_f16 - string: "Truncation (NC, F16)" -- name: xnn_operator_type_truncation_nc_f32 - string: "Truncation (NC, F32)" -- name: xnn_operator_type_unpooling_nhwc_x32 - string: "Unpooling (NHWC, X32)" - - -# LINT.ThenChange(operator-type.c, ../xnnpack/operator-type.h) diff --git a/src/xnnpack/allocation-type-defs.h b/src/xnnpack/allocation-type-defs.h new file mode 100644 index 00000000000..bded1957129 --- /dev/null +++ b/src/xnnpack/allocation-type-defs.h @@ -0,0 +1,22 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#ifndef XNN_ENUM_ITEM_0 +#define XNN_ENUM_ITEM_0(enum_name, enum_string) XNN_ENUM_ITEM(enum_name, enum_string) +#define XNN_DEFINED_ENUM_ITEM_0 +#endif + +XNN_ENUM_ITEM_0(xnn_allocation_type_invalid, "invalid") +XNN_ENUM_ITEM(xnn_allocation_type_static, "static") +XNN_ENUM_ITEM(xnn_allocation_type_workspace, "workspace") +XNN_ENUM_ITEM(xnn_allocation_type_external, "external") +XNN_ENUM_ITEM(xnn_allocation_type_persistent, "persistent") +XNN_ENUM_ITEM(xnn_allocation_type_dynamic, "dynamic") + + +#ifdef XNN_DEFINED_ENUM_ITEM_0 +#undef XNN_DEFINED_ENUM_ITEM_0 +#undef XNN_ENUM_ITEM_0 +#endif diff --git a/src/xnnpack/allocation-type.h b/src/xnnpack/allocation-type.h index 7663877794a..42a9376ca59 100644 --- a/src/xnnpack/allocation-type.h +++ b/src/xnnpack/allocation-type.h @@ -2,10 +2,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Specification: src/enums/allocation-type.yaml -// Generator: tools/generate-enum.py #pragma once @@ -17,12 +13,11 @@ extern "C" { #endif enum xnn_allocation_type { - xnn_allocation_type_invalid = 0, - xnn_allocation_type_static, - xnn_allocation_type_workspace, - xnn_allocation_type_external, - xnn_allocation_type_persistent, - xnn_allocation_type_dynamic, +#define XNN_ENUM_ITEM_0(enum_name, enum_string) enum_name = 0, +#define XNN_ENUM_ITEM(enum_name, enum_string) enum_name, + #include "xnnpack/allocation-type-defs.h" +#undef XNN_ENUM_ITEM_0 +#undef XNN_ENUM_ITEM }; #if XNN_LOG_LEVEL <= 0 diff --git a/src/xnnpack/microkernel-type-defs.h b/src/xnnpack/microkernel-type-defs.h new file mode 100644 index 00000000000..d94cd96966f --- /dev/null +++ b/src/xnnpack/microkernel-type-defs.h @@ -0,0 +1,29 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#ifndef XNN_ENUM_ITEM_0 +#define XNN_ENUM_ITEM_0(enum_name, enum_string) XNN_ENUM_ITEM(enum_name, enum_string) +#define XNN_DEFINED_ENUM_ITEM_0 +#endif + +XNN_ENUM_ITEM_0(xnn_microkernel_type_default, "Default") +XNN_ENUM_ITEM(xnn_microkernel_type_average_pooling, "Average Pooling") +XNN_ENUM_ITEM(xnn_microkernel_type_conv2d_hwc2chw, "Conv2D HWC2CHW") +XNN_ENUM_ITEM(xnn_microkernel_type_dwconv, "DWConv") +XNN_ENUM_ITEM(xnn_microkernel_type_gemm, "GEMM") +XNN_ENUM_ITEM(xnn_microkernel_type_global_average_pooling, "Global Average Pooling") +XNN_ENUM_ITEM(xnn_microkernel_type_igemm, "IGEMM") +XNN_ENUM_ITEM(xnn_microkernel_type_mean, "Mean") +XNN_ENUM_ITEM(xnn_microkernel_type_pixelwise_average_pooling, "Pixelwise Average Pooling") +XNN_ENUM_ITEM(xnn_microkernel_type_spmm, "SPMM") +XNN_ENUM_ITEM(xnn_microkernel_type_subconv2d, "Subconv2D") +XNN_ENUM_ITEM(xnn_microkernel_type_transpose, "Transpose") +XNN_ENUM_ITEM(xnn_microkernel_type_vmulcaddc, "VMulCAddC") + + +#ifdef XNN_DEFINED_ENUM_ITEM_0 +#undef XNN_DEFINED_ENUM_ITEM_0 +#undef XNN_ENUM_ITEM_0 +#endif diff --git a/src/xnnpack/microkernel-type.h b/src/xnnpack/microkernel-type.h index 7f1e2533a9a..821f4061335 100644 --- a/src/xnnpack/microkernel-type.h +++ b/src/xnnpack/microkernel-type.h @@ -2,10 +2,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: src/enums/microkernel-type.yaml -// Generator: tools/generate-enum.py #pragma once @@ -17,19 +13,11 @@ extern "C" { #endif enum xnn_microkernel_type { - xnn_microkernel_type_default = 0, - xnn_microkernel_type_average_pooling, - xnn_microkernel_type_conv2d_hwc2chw, - xnn_microkernel_type_dwconv, - xnn_microkernel_type_gemm, - xnn_microkernel_type_global_average_pooling, - xnn_microkernel_type_igemm, - xnn_microkernel_type_mean, - xnn_microkernel_type_pixelwise_average_pooling, - xnn_microkernel_type_spmm, - xnn_microkernel_type_subconv2d, - xnn_microkernel_type_transpose, - xnn_microkernel_type_vmulcaddc, +#define XNN_ENUM_ITEM_0(enum_name, enum_string) enum_name = 0, +#define XNN_ENUM_ITEM(enum_name, enum_string) enum_name, + #include "xnnpack/microkernel-type-defs.h" +#undef XNN_ENUM_ITEM_0 +#undef XNN_ENUM_ITEM }; XNN_INTERNAL const char* xnn_microkernel_type_to_string(enum xnn_microkernel_type microkernel_type); diff --git a/src/xnnpack/node-type-defs.h b/src/xnnpack/node-type-defs.h new file mode 100644 index 00000000000..1ccf512d668 --- /dev/null +++ b/src/xnnpack/node-type-defs.h @@ -0,0 +1,78 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#ifndef XNN_ENUM_ITEM_0 +#define XNN_ENUM_ITEM_0(enum_name, enum_string) XNN_ENUM_ITEM(enum_name, enum_string) +#define XNN_DEFINED_ENUM_ITEM_0 +#endif + +XNN_ENUM_ITEM_0(xnn_node_type_invalid, "Invalid") +XNN_ENUM_ITEM(xnn_node_type_abs, "Abs") +XNN_ENUM_ITEM(xnn_node_type_add2, "Add2") +XNN_ENUM_ITEM(xnn_node_type_argmax_pooling_2d, "ArgMax Pooling 2D") +XNN_ENUM_ITEM(xnn_node_type_average_pooling_2d, "Average Pooling 2D") +XNN_ENUM_ITEM(xnn_node_type_bankers_rounding, "Bankers Rounding") +XNN_ENUM_ITEM(xnn_node_type_batch_matrix_multiply, "Batch Matrix Multiply") +XNN_ENUM_ITEM(xnn_node_type_ceiling, "Ceiling") +XNN_ENUM_ITEM(xnn_node_type_clamp, "Clamp") +XNN_ENUM_ITEM(xnn_node_type_concatenate2, "Concatenate2") +XNN_ENUM_ITEM(xnn_node_type_concatenate3, "Concatenate3") +XNN_ENUM_ITEM(xnn_node_type_concatenate4, "Concatenate4") +XNN_ENUM_ITEM(xnn_node_type_concatenate5, "Concatenate5") +XNN_ENUM_ITEM(xnn_node_type_convert, "Convert") +XNN_ENUM_ITEM(xnn_node_type_convolution_2d, "Convolution 2D") +XNN_ENUM_ITEM(xnn_node_type_copy, "Copy") +XNN_ENUM_ITEM(xnn_node_type_copysign, "Copy Sign") +XNN_ENUM_ITEM(xnn_node_type_deconvolution_2d, "Deconvolution 2D") +XNN_ENUM_ITEM(xnn_node_type_depth_to_space_2d, "Depth To Space 2D") +XNN_ENUM_ITEM(xnn_node_type_depthwise_convolution_2d, "Depthwise Convolution 2D") +XNN_ENUM_ITEM(xnn_node_type_divide, "Divide") +XNN_ENUM_ITEM(xnn_node_type_elu, "ELU") +XNN_ENUM_ITEM(xnn_node_type_exp, "Exp") +XNN_ENUM_ITEM(xnn_node_type_even_split2, "Even Split2") +XNN_ENUM_ITEM(xnn_node_type_even_split3, "Even Split3") +XNN_ENUM_ITEM(xnn_node_type_even_split4, "Even Split4") +XNN_ENUM_ITEM(xnn_node_type_floor, "Floor") +XNN_ENUM_ITEM(xnn_node_type_fully_connected, "Fully Connected") +XNN_ENUM_ITEM(xnn_node_type_fully_connected_sparse, "Fully Connected Sparse") +XNN_ENUM_ITEM(xnn_node_type_gelu, "GELU") +XNN_ENUM_ITEM(xnn_node_type_global_average_pooling_1d, "Global Average Pooling 1D") 
+XNN_ENUM_ITEM(xnn_node_type_global_average_pooling_2d, "Global Average Pooling 2D") +XNN_ENUM_ITEM(xnn_node_type_global_sum_pooling_1d, "Global Sum Pooling 1D") +XNN_ENUM_ITEM(xnn_node_type_global_sum_pooling_2d, "Global Sum Pooling 2D") +XNN_ENUM_ITEM(xnn_node_type_hardswish, "HardSwish") +XNN_ENUM_ITEM(xnn_node_type_log, "Log") +XNN_ENUM_ITEM(xnn_node_type_leaky_relu, "Leaky ReLU") +XNN_ENUM_ITEM(xnn_node_type_max_pooling_2d, "Max Pooling 2D") +XNN_ENUM_ITEM(xnn_node_type_maximum2, "Maximum2") +XNN_ENUM_ITEM(xnn_node_type_minimum2, "Minimum2") +XNN_ENUM_ITEM(xnn_node_type_multiply2, "Multiply2") +XNN_ENUM_ITEM(xnn_node_type_negate, "Negate") +XNN_ENUM_ITEM(xnn_node_type_prelu, "PReLU") +XNN_ENUM_ITEM(xnn_node_type_reciprocal_square_root, "Reciprocal Square Root") +XNN_ENUM_ITEM(xnn_node_type_rope, "RoPE") +XNN_ENUM_ITEM(xnn_node_type_scaled_dot_product_attention, "Scaled Dot Product Attention") +XNN_ENUM_ITEM(xnn_node_type_sigmoid, "Sigmoid") +XNN_ENUM_ITEM(xnn_node_type_softmax, "Softmax") +XNN_ENUM_ITEM(xnn_node_type_space_to_depth_2d, "Space To Depth 2D") +XNN_ENUM_ITEM(xnn_node_type_square, "Square") +XNN_ENUM_ITEM(xnn_node_type_square_root, "Square Root") +XNN_ENUM_ITEM(xnn_node_type_squared_difference, "Squared Difference") +XNN_ENUM_ITEM(xnn_node_type_static_constant_pad, "Static Constant Pad") +XNN_ENUM_ITEM(xnn_node_type_static_expand_dims, "Static Expand Dims") +XNN_ENUM_ITEM(xnn_node_type_static_mean, "Static Mean") +XNN_ENUM_ITEM(xnn_node_type_static_reshape, "Static Reshape") +XNN_ENUM_ITEM(xnn_node_type_static_resize_bilinear_2d, "Static Resize Bilinear 2D") +XNN_ENUM_ITEM(xnn_node_type_static_slice, "Static Slice") +XNN_ENUM_ITEM(xnn_node_type_static_transpose, "Static Transpose") +XNN_ENUM_ITEM(xnn_node_type_subtract, "Subtract") +XNN_ENUM_ITEM(xnn_node_type_tanh, "Tanh") +XNN_ENUM_ITEM(xnn_node_type_unpooling_2d, "Unpooling 2D") + + +#ifdef XNN_DEFINED_ENUM_ITEM_0 +#undef XNN_DEFINED_ENUM_ITEM_0 +#undef XNN_ENUM_ITEM_0 +#endif diff --git 
a/src/xnnpack/node-type.h b/src/xnnpack/node-type.h index dbc82a8231b..25cf77e468e 100644 --- a/src/xnnpack/node-type.h +++ b/src/xnnpack/node-type.h @@ -2,10 +2,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Specification: src/enums/node-type.yaml -// Generator: tools/generate-enum.py #pragma once @@ -17,68 +13,11 @@ extern "C" { #endif enum xnn_node_type { - xnn_node_type_invalid = 0, - xnn_node_type_abs, - xnn_node_type_add2, - xnn_node_type_argmax_pooling_2d, - xnn_node_type_average_pooling_2d, - xnn_node_type_bankers_rounding, - xnn_node_type_batch_matrix_multiply, - xnn_node_type_ceiling, - xnn_node_type_clamp, - xnn_node_type_concatenate2, - xnn_node_type_concatenate3, - xnn_node_type_concatenate4, - xnn_node_type_concatenate5, - xnn_node_type_convert, - xnn_node_type_convolution_2d, - xnn_node_type_copy, - xnn_node_type_copysign, - xnn_node_type_deconvolution_2d, - xnn_node_type_depth_to_space_2d, - xnn_node_type_depthwise_convolution_2d, - xnn_node_type_divide, - xnn_node_type_elu, - xnn_node_type_exp, - xnn_node_type_even_split2, - xnn_node_type_even_split3, - xnn_node_type_even_split4, - xnn_node_type_floor, - xnn_node_type_fully_connected, - xnn_node_type_fully_connected_sparse, - xnn_node_type_gelu, - xnn_node_type_global_average_pooling_1d, - xnn_node_type_global_average_pooling_2d, - xnn_node_type_global_sum_pooling_1d, - xnn_node_type_global_sum_pooling_2d, - xnn_node_type_hardswish, - xnn_node_type_log, - xnn_node_type_leaky_relu, - xnn_node_type_max_pooling_2d, - xnn_node_type_maximum2, - xnn_node_type_minimum2, - xnn_node_type_multiply2, - xnn_node_type_negate, - xnn_node_type_prelu, - xnn_node_type_reciprocal_square_root, - xnn_node_type_rope, - xnn_node_type_scaled_dot_product_attention, - xnn_node_type_sigmoid, - xnn_node_type_softmax, - xnn_node_type_space_to_depth_2d, - xnn_node_type_square, - 
xnn_node_type_square_root, - xnn_node_type_squared_difference, - xnn_node_type_static_constant_pad, - xnn_node_type_static_expand_dims, - xnn_node_type_static_mean, - xnn_node_type_static_reshape, - xnn_node_type_static_resize_bilinear_2d, - xnn_node_type_static_slice, - xnn_node_type_static_transpose, - xnn_node_type_subtract, - xnn_node_type_tanh, - xnn_node_type_unpooling_2d, +#define XNN_ENUM_ITEM_0(enum_name, enum_string) enum_name = 0, +#define XNN_ENUM_ITEM(enum_name, enum_string) enum_name, + #include "xnnpack/node-type-defs.h" +#undef XNN_ENUM_ITEM_0 +#undef XNN_ENUM_ITEM }; #if XNN_LOG_LEVEL <= 0 diff --git a/src/xnnpack/operator-type-defs.h b/src/xnnpack/operator-type-defs.h new file mode 100644 index 00000000000..f56ebbb7a89 --- /dev/null +++ b/src/xnnpack/operator-type-defs.h @@ -0,0 +1,173 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#ifndef XNN_ENUM_ITEM_0 +#define XNN_ENUM_ITEM_0(enum_name, enum_string) XNN_ENUM_ITEM(enum_name, enum_string) +#define XNN_DEFINED_ENUM_ITEM_0 +#endif + +XNN_ENUM_ITEM_0(xnn_operator_type_invalid, "Invalid") +XNN_ENUM_ITEM(xnn_operator_type_abs_nc_f16, "Abs (NC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_abs_nc_f32, "Abs (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_add, "Add (ND)") +XNN_ENUM_ITEM(xnn_operator_type_argmax_pooling_nhwc_f32, "ArgMax Pooling (NHWC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_average_pooling_nhwc_f16, "Average Pooling (NHWC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_average_pooling_nhwc_f32, "Average Pooling (NHWC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_average_pooling_nhwc_qu8, "Average Pooling (NHWC, QU8)") +XNN_ENUM_ITEM(xnn_operator_type_bankers_rounding_nc_f16, "Bankers Rounding (NC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_bankers_rounding_nc_f32, "Bankers Rounding (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_batch_matrix_multiply_nc_f16, "Batch Matrix Multiply (NC, 
F16)") +XNN_ENUM_ITEM(xnn_operator_type_batch_matrix_multiply_nc_f32, "Batch Matrix Multiply (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_batch_matrix_multiply_nc_qd8_f32_qc8w, "Batch Matrix Multiply (NC, QD8, F32, QC8W)") +XNN_ENUM_ITEM(xnn_operator_type_ceiling_nc_f16, "Ceiling (NC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_ceiling_nc_f32, "Ceiling (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_channel_shuffle_nc_x8, "Channel Shuffle (NC, X8)") +XNN_ENUM_ITEM(xnn_operator_type_channel_shuffle_nc_x32, "Channel Shuffle (NC, X32)") +XNN_ENUM_ITEM(xnn_operator_type_clamp_nc_f16, "Clamp (NC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_clamp_nc_f32, "Clamp (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_clamp_nc_s8, "Clamp (NC, S8)") +XNN_ENUM_ITEM(xnn_operator_type_clamp_nc_u8, "Clamp (NC, U8)") +XNN_ENUM_ITEM(xnn_operator_type_constant_pad_nd_x8, "Constant Pad (ND, X8)") +XNN_ENUM_ITEM(xnn_operator_type_constant_pad_nd_x16, "Constant Pad (ND, X16)") +XNN_ENUM_ITEM(xnn_operator_type_constant_pad_nd_x32, "Constant Pad (ND, X32)") +XNN_ENUM_ITEM(xnn_operator_type_convert_nc_f16_f32, "Convert (NC, F16, F32)") +XNN_ENUM_ITEM(xnn_operator_type_convert_nc_f16_qd8, "Convert (NC, F16, QD8)") +XNN_ENUM_ITEM(xnn_operator_type_convert_nc_f32_f16, "Convert (NC, F32, F16)") +XNN_ENUM_ITEM(xnn_operator_type_convert_nc_f32_qd8, "Convert (NC, F32, QD8)") +XNN_ENUM_ITEM(xnn_operator_type_convert_nc_f32_qp8, "Convert (NC, F32, QP8)") +XNN_ENUM_ITEM(xnn_operator_type_convert_nc_f32_qs8, "Convert (NC, F32, QS8)") +XNN_ENUM_ITEM(xnn_operator_type_convert_nc_f32_qu8, "Convert (NC, F32, QU8)") +XNN_ENUM_ITEM(xnn_operator_type_convert_nc_qs8, "Convert (NC, QS8)") +XNN_ENUM_ITEM(xnn_operator_type_convert_nc_qs8_f16, "Convert (NC, QS8, F16)") +XNN_ENUM_ITEM(xnn_operator_type_convert_nc_qs8_f32, "Convert (NC, QS8, F32)") +XNN_ENUM_ITEM(xnn_operator_type_convert_nc_qs16_qs8, "Convert (NC, QS16, QS8)") +XNN_ENUM_ITEM(xnn_operator_type_convert_nc_qu8, "Convert (NC, QU8)") 
+XNN_ENUM_ITEM(xnn_operator_type_convert_nc_qu8_f32, "Convert (NC, QU8, F32)") +XNN_ENUM_ITEM(xnn_operator_type_convolution_nchw_f16, "Convolution (NCHW, F16)") +XNN_ENUM_ITEM(xnn_operator_type_convolution_nchw_f32, "Convolution (NCHW, F32)") +XNN_ENUM_ITEM(xnn_operator_type_convolution_nhwc_f16, "Convolution (NHWC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_convolution_nhwc_f32, "Convolution (NHWC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_convolution_nhwc_qd8_f16_qc8w, "Convolution (NHWC, QD8, F16, QC8W)") +XNN_ENUM_ITEM(xnn_operator_type_convolution_nhwc_qd8_f32_qc8w, "Convolution (NHWC, QD8, F32, QC8W)") +XNN_ENUM_ITEM(xnn_operator_type_convolution_nhwc_qc8, "Convolution (NHWC, QC8)") +XNN_ENUM_ITEM(xnn_operator_type_convolution_nhwc_qs8, "Convolution (NHWC, QS8)") +XNN_ENUM_ITEM(xnn_operator_type_convolution_nhwc_qu8, "Convolution (NHWC, QU8)") +XNN_ENUM_ITEM(xnn_operator_type_copy_nc_x8, "Copy (NC, X8)") +XNN_ENUM_ITEM(xnn_operator_type_copy_nc_x16, "Copy (NC, X16)") +XNN_ENUM_ITEM(xnn_operator_type_copy_nc_x32, "Copy (NC, X32)") +XNN_ENUM_ITEM(xnn_operator_type_copysign, "Copy Sign (ND)") +XNN_ENUM_ITEM(xnn_operator_type_deconvolution_nhwc_f16, "Deconvolution (NHWC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_deconvolution_nhwc_f32, "Deconvolution (NHWC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_deconvolution_nhwc_qd8_f32_qc8w, "Deconvolution (NHWC, QD8, F32, QC8W)") +XNN_ENUM_ITEM(xnn_operator_type_deconvolution_nhwc_qs8, "Deconvolution (NHWC, QS8)") +XNN_ENUM_ITEM(xnn_operator_type_deconvolution_nhwc_qs8_qc8w, "Deconvolution (NC, QS8, QC8W)") +XNN_ENUM_ITEM(xnn_operator_type_deconvolution_nhwc_qu8, "Deconvolution (NHWC, QU8)") +XNN_ENUM_ITEM(xnn_operator_type_depth_to_space_nchw2nhwc_x16, "Depth To Space (NCHW2NHWC, X16)") +XNN_ENUM_ITEM(xnn_operator_type_depth_to_space_nchw2nhwc_x32, "Depth To Space (NCHW2NHWC, X32)") +XNN_ENUM_ITEM(xnn_operator_type_depth_to_space_nhwc_x8, "Depth To Space (NHWC, X8)") +XNN_ENUM_ITEM(xnn_operator_type_depth_to_space_nhwc_x16, "Depth 
To Space (NHWC, X16)") +XNN_ENUM_ITEM(xnn_operator_type_depth_to_space_nhwc_x32, "Depth To Space (NHWC, X32)") +XNN_ENUM_ITEM(xnn_operator_type_divide, "Divide (ND)") +XNN_ENUM_ITEM(xnn_operator_type_dynamic_fully_connected_nc_f16, "Dynamic Fully Connected (NC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_dynamic_fully_connected_nc_f32, "Dynamic Fully Connected (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_elu_nc_f16, "ELU (NC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_elu_nc_f32, "ELU (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_elu_nc_qs8, "ELU (NC, QS8)") +XNN_ENUM_ITEM(xnn_operator_type_exp_nc_f32, "Exp (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_floor_nc_f16, "Floor (NC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_floor_nc_f32, "Floor (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_f16, "Fully Connected (NC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_f32, "Fully Connected (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_f32_qc4w, "Fully Connected (NC, F32, QC4W)") +XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_f32_qc8w, "Fully Connected (NC, F32, QC8W)") +XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_qd8_f16_qb4w, "Fully Connected (NC, QD8, F16, QB4W)") +XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_qd8_f16_qc4w, "Fully Connected (NC, QD8, F16, QC4W)") +XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_qd8_f16_qc8w, "Fully Connected (NC, QD8, F16, QC8W)") +XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_qd8_f32_qb4w, "Fully Connected (NC, QD8, F32, QB4W)") +XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_qd8_f32_qc4w, "Fully Connected (NC, QD8, F32, QC4W)") +XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_qd8_f32_qc8w, "Fully Connected (NC, QD8, F32, QC8W)") +XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_qp8_f32_qc4w, "Fully Connected (NC, QP8, F32, QC4W)") +XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_qs8, "Fully Connected (NC, QS8)") 
+XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_qs8_qc8w, "Fully Connected (NC, QS8, QC8W)") +XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_qu8, "Fully Connected (NC, QU8)") +XNN_ENUM_ITEM(xnn_operator_type_gelu_nc_f32, "GELU (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_global_average_pooling_ncw_f16, "Global Average Pooling (NCW, F16)") +XNN_ENUM_ITEM(xnn_operator_type_global_average_pooling_ncw_f32, "Global Average Pooling (NCW, F32)") +XNN_ENUM_ITEM(xnn_operator_type_global_average_pooling_nwc_f16, "Global Average Pooling (NWC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_global_average_pooling_nwc_f32, "Global Average Pooling (NWC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_global_average_pooling_nwc_qs8, "Global Average Pooling (NWC, QS8)") +XNN_ENUM_ITEM(xnn_operator_type_global_average_pooling_nwc_qu8, "Global Average Pooling (NWC, QU8)") +XNN_ENUM_ITEM(xnn_operator_type_global_sum_pooling_nwc_f16, "Global Sum Pooling (NWC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_global_sum_pooling_nwc_f32, "Global Sum Pooling (NWC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_hardswish_nc_f16, "HardSwish (NC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_hardswish_nc_f32, "HardSwish (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_leaky_relu_nc_f16, "Leaky ReLU (NC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_leaky_relu_nc_f32, "Leaky ReLU (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_leaky_relu_nc_qs8, "Leaky ReLU (NC, QS8)") +XNN_ENUM_ITEM(xnn_operator_type_leaky_relu_nc_qu8, "Leaky ReLU (NC, QU8)") +XNN_ENUM_ITEM(xnn_operator_type_log_nc_f32, "Log (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_max_pooling_nhwc_f16, "Max Pooling (NHWC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_max_pooling_nhwc_f32, "Max Pooling (NHWC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_max_pooling_nhwc_s8, "Max Pooling (NHWC, S8)") +XNN_ENUM_ITEM(xnn_operator_type_max_pooling_nhwc_u8, "Max Pooling (NHWC, U8)") +XNN_ENUM_ITEM(xnn_operator_type_maximum, "Maximum (ND)") +XNN_ENUM_ITEM(xnn_operator_type_mean_nd_f16, "Mean (ND, 
F16)") +XNN_ENUM_ITEM(xnn_operator_type_mean_nd_f32, "Mean (ND, F32)") +XNN_ENUM_ITEM(xnn_operator_type_mean_nd_qs8, "Mean (ND, QS8)") +XNN_ENUM_ITEM(xnn_operator_type_mean_nd_qu8, "Mean (ND, QU8)") +XNN_ENUM_ITEM(xnn_operator_type_minimum, "Minimum (ND)") +XNN_ENUM_ITEM(xnn_operator_type_multiply, "Multiply (ND)") +XNN_ENUM_ITEM(xnn_operator_type_negate_nc_f16, "Negate (NC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_negate_nc_f32, "Negate (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_prelu_nc_f16, "PReLU (NC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_prelu_nc_f32, "PReLU (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_reciprocal_square_root_nc_f16, "Reciprocal Square Root (NC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_reciprocal_square_root_nc_f32, "Reciprocal Square Root (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_resize_bilinear_nchw_f16, "Resize Bilinear (NCHW, F16)") +XNN_ENUM_ITEM(xnn_operator_type_resize_bilinear_nchw_f32, "Resize Bilinear (NCHW, F32)") +XNN_ENUM_ITEM(xnn_operator_type_resize_bilinear_nhwc_f16, "Resize Bilinear (NHWC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_resize_bilinear_nhwc_f32, "Resize Bilinear (NHWC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_resize_bilinear_nhwc_s8, "Resize Bilinear (NHWC, S8)") +XNN_ENUM_ITEM(xnn_operator_type_resize_bilinear_nhwc_u8, "Resize Bilinear (NHWC, U8)") +XNN_ENUM_ITEM(xnn_operator_type_rope_nthc_f16, "RoPE (NTHC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_rope_nthc_f32, "RoPE (NTHC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_scaled_dot_product_attention_nhtc_f16, "Scaled Dot-Product Attention (NHTC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_scaled_dot_product_attention_nhtc_f32, "Scaled Dot-Product Attention (NHTC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_sigmoid_nc_f16, "Sigmoid (NC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_sigmoid_nc_f32, "Sigmoid (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_sigmoid_nc_qs8, "Sigmoid (NC, QS8)") +XNN_ENUM_ITEM(xnn_operator_type_sigmoid_nc_qu8, "Sigmoid (NC, QU8)") 
+XNN_ENUM_ITEM(xnn_operator_type_slice_nd_x8, "Slice (ND, X8)") +XNN_ENUM_ITEM(xnn_operator_type_slice_nd_x16, "Slice (ND, X16)") +XNN_ENUM_ITEM(xnn_operator_type_slice_nd_x32, "Slice (ND, X32)") +XNN_ENUM_ITEM(xnn_operator_type_softmax_nc_f16, "Softmax (NC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_softmax_nc_f32, "Softmax (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_softmax_nc_qu8, "Softmax (NC, QU8)") +XNN_ENUM_ITEM(xnn_operator_type_space_to_depth_nhwc_x8, "Space To Depth (NHWC, X8)") +XNN_ENUM_ITEM(xnn_operator_type_space_to_depth_nhwc_x16, "Space To Depth (NHWC, X16)") +XNN_ENUM_ITEM(xnn_operator_type_space_to_depth_nhwc_x32, "Space To Depth (NHWC, X32)") +XNN_ENUM_ITEM(xnn_operator_type_square_nc_f16, "Square (NC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_square_nc_f32, "Square (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_square_root_nc_f16, "Square Root (NC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_square_root_nc_f32, "Square Root (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_squared_difference, "Squared Difference (NC)") +XNN_ENUM_ITEM(xnn_operator_type_subtract, "Subtract (ND)") +XNN_ENUM_ITEM(xnn_operator_type_tanh_nc_f16, "Tanh (NC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_tanh_nc_f32, "Tanh (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_tanh_nc_qs8, "Tanh (NC, QS8)") +XNN_ENUM_ITEM(xnn_operator_type_tanh_nc_qu8, "Tanh (NC, QU8)") +XNN_ENUM_ITEM(xnn_operator_type_transpose_nd_x8, "Transpose (ND, X8)") +XNN_ENUM_ITEM(xnn_operator_type_transpose_nd_x16, "Transpose (ND, X16)") +XNN_ENUM_ITEM(xnn_operator_type_transpose_nd_x32, "Transpose (ND, X32)") +XNN_ENUM_ITEM(xnn_operator_type_transpose_nd_x64, "Transpose (ND, X64)") +XNN_ENUM_ITEM(xnn_operator_type_truncation_nc_f16, "Truncation (NC, F16)") +XNN_ENUM_ITEM(xnn_operator_type_truncation_nc_f32, "Truncation (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_unpooling_nhwc_x32, "Unpooling (NHWC, X32)") + + +#ifdef XNN_DEFINED_ENUM_ITEM_0 +#undef XNN_DEFINED_ENUM_ITEM_0 +#undef XNN_ENUM_ITEM_0 +#endif diff --git 
a/src/xnnpack/operator-type.h b/src/xnnpack/operator-type.h index 7b431bd0f28..021b6dfd8a6 100644 --- a/src/xnnpack/operator-type.h +++ b/src/xnnpack/operator-type.h @@ -2,10 +2,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Specification: src/enums/operator-type.yaml -// Generator: tools/generate-enum.py #pragma once @@ -17,163 +13,11 @@ extern "C" { #endif enum xnn_operator_type { - xnn_operator_type_invalid = 0, - xnn_operator_type_abs_nc_f16, - xnn_operator_type_abs_nc_f32, - xnn_operator_type_add, - xnn_operator_type_argmax_pooling_nhwc_f32, - xnn_operator_type_average_pooling_nhwc_f16, - xnn_operator_type_average_pooling_nhwc_f32, - xnn_operator_type_average_pooling_nhwc_qu8, - xnn_operator_type_bankers_rounding_nc_f16, - xnn_operator_type_bankers_rounding_nc_f32, - xnn_operator_type_batch_matrix_multiply_nc_f16, - xnn_operator_type_batch_matrix_multiply_nc_f32, - xnn_operator_type_batch_matrix_multiply_nc_qd8_f32_qc8w, - xnn_operator_type_ceiling_nc_f16, - xnn_operator_type_ceiling_nc_f32, - xnn_operator_type_channel_shuffle_nc_x8, - xnn_operator_type_channel_shuffle_nc_x32, - xnn_operator_type_clamp_nc_f16, - xnn_operator_type_clamp_nc_f32, - xnn_operator_type_clamp_nc_s8, - xnn_operator_type_clamp_nc_u8, - xnn_operator_type_constant_pad_nd_x8, - xnn_operator_type_constant_pad_nd_x16, - xnn_operator_type_constant_pad_nd_x32, - xnn_operator_type_convert_nc_f16_f32, - xnn_operator_type_convert_nc_f16_qd8, - xnn_operator_type_convert_nc_f32_f16, - xnn_operator_type_convert_nc_f32_qd8, - xnn_operator_type_convert_nc_f32_qp8, - xnn_operator_type_convert_nc_f32_qs8, - xnn_operator_type_convert_nc_f32_qu8, - xnn_operator_type_convert_nc_qs8, - xnn_operator_type_convert_nc_qs8_f16, - xnn_operator_type_convert_nc_qs8_f32, - xnn_operator_type_convert_nc_qs16_qs8, - xnn_operator_type_convert_nc_qu8, - 
xnn_operator_type_convert_nc_qu8_f32, - xnn_operator_type_convolution_nchw_f16, - xnn_operator_type_convolution_nchw_f32, - xnn_operator_type_convolution_nhwc_f16, - xnn_operator_type_convolution_nhwc_f32, - xnn_operator_type_convolution_nhwc_qd8_f16_qc8w, - xnn_operator_type_convolution_nhwc_qd8_f32_qc8w, - xnn_operator_type_convolution_nhwc_qc8, - xnn_operator_type_convolution_nhwc_qs8, - xnn_operator_type_convolution_nhwc_qu8, - xnn_operator_type_copy_nc_x8, - xnn_operator_type_copy_nc_x16, - xnn_operator_type_copy_nc_x32, - xnn_operator_type_copysign, - xnn_operator_type_deconvolution_nhwc_f16, - xnn_operator_type_deconvolution_nhwc_f32, - xnn_operator_type_deconvolution_nhwc_qd8_f32_qc8w, - xnn_operator_type_deconvolution_nhwc_qs8, - xnn_operator_type_deconvolution_nhwc_qs8_qc8w, - xnn_operator_type_deconvolution_nhwc_qu8, - xnn_operator_type_depth_to_space_nchw2nhwc_x16, - xnn_operator_type_depth_to_space_nchw2nhwc_x32, - xnn_operator_type_depth_to_space_nhwc_x8, - xnn_operator_type_depth_to_space_nhwc_x16, - xnn_operator_type_depth_to_space_nhwc_x32, - xnn_operator_type_divide, - xnn_operator_type_dynamic_fully_connected_nc_f16, - xnn_operator_type_dynamic_fully_connected_nc_f32, - xnn_operator_type_elu_nc_f16, - xnn_operator_type_elu_nc_f32, - xnn_operator_type_elu_nc_qs8, - xnn_operator_type_exp_nc_f32, - xnn_operator_type_floor_nc_f16, - xnn_operator_type_floor_nc_f32, - xnn_operator_type_fully_connected_nc_f16, - xnn_operator_type_fully_connected_nc_f32, - xnn_operator_type_fully_connected_nc_f32_qc4w, - xnn_operator_type_fully_connected_nc_f32_qc8w, - xnn_operator_type_fully_connected_nc_qd8_f16_qb4w, - xnn_operator_type_fully_connected_nc_qd8_f16_qc4w, - xnn_operator_type_fully_connected_nc_qd8_f16_qc8w, - xnn_operator_type_fully_connected_nc_qd8_f32_qb4w, - xnn_operator_type_fully_connected_nc_qd8_f32_qc4w, - xnn_operator_type_fully_connected_nc_qd8_f32_qc8w, - xnn_operator_type_fully_connected_nc_qp8_f32_qc4w, - 
xnn_operator_type_fully_connected_nc_qs8, - xnn_operator_type_fully_connected_nc_qs8_qc8w, - xnn_operator_type_fully_connected_nc_qu8, - xnn_operator_type_gelu_nc_f32, - xnn_operator_type_global_average_pooling_ncw_f16, - xnn_operator_type_global_average_pooling_ncw_f32, - xnn_operator_type_global_average_pooling_nwc_f16, - xnn_operator_type_global_average_pooling_nwc_f32, - xnn_operator_type_global_average_pooling_nwc_qs8, - xnn_operator_type_global_average_pooling_nwc_qu8, - xnn_operator_type_global_sum_pooling_nwc_f16, - xnn_operator_type_global_sum_pooling_nwc_f32, - xnn_operator_type_hardswish_nc_f16, - xnn_operator_type_hardswish_nc_f32, - xnn_operator_type_leaky_relu_nc_f16, - xnn_operator_type_leaky_relu_nc_f32, - xnn_operator_type_leaky_relu_nc_qs8, - xnn_operator_type_leaky_relu_nc_qu8, - xnn_operator_type_log_nc_f32, - xnn_operator_type_max_pooling_nhwc_f16, - xnn_operator_type_max_pooling_nhwc_f32, - xnn_operator_type_max_pooling_nhwc_s8, - xnn_operator_type_max_pooling_nhwc_u8, - xnn_operator_type_maximum, - xnn_operator_type_mean_nd_f16, - xnn_operator_type_mean_nd_f32, - xnn_operator_type_mean_nd_qs8, - xnn_operator_type_mean_nd_qu8, - xnn_operator_type_minimum, - xnn_operator_type_multiply, - xnn_operator_type_negate_nc_f16, - xnn_operator_type_negate_nc_f32, - xnn_operator_type_prelu_nc_f16, - xnn_operator_type_prelu_nc_f32, - xnn_operator_type_reciprocal_square_root_nc_f16, - xnn_operator_type_reciprocal_square_root_nc_f32, - xnn_operator_type_resize_bilinear_nchw_f16, - xnn_operator_type_resize_bilinear_nchw_f32, - xnn_operator_type_resize_bilinear_nhwc_f16, - xnn_operator_type_resize_bilinear_nhwc_f32, - xnn_operator_type_resize_bilinear_nhwc_s8, - xnn_operator_type_resize_bilinear_nhwc_u8, - xnn_operator_type_rope_nthc_f16, - xnn_operator_type_rope_nthc_f32, - xnn_operator_type_scaled_dot_product_attention_nhtc_f16, - xnn_operator_type_scaled_dot_product_attention_nhtc_f32, - xnn_operator_type_sigmoid_nc_f16, - xnn_operator_type_sigmoid_nc_f32, 
- xnn_operator_type_sigmoid_nc_qs8, - xnn_operator_type_sigmoid_nc_qu8, - xnn_operator_type_slice_nd_x8, - xnn_operator_type_slice_nd_x16, - xnn_operator_type_slice_nd_x32, - xnn_operator_type_softmax_nc_f16, - xnn_operator_type_softmax_nc_f32, - xnn_operator_type_softmax_nc_qu8, - xnn_operator_type_space_to_depth_nhwc_x8, - xnn_operator_type_space_to_depth_nhwc_x16, - xnn_operator_type_space_to_depth_nhwc_x32, - xnn_operator_type_square_nc_f16, - xnn_operator_type_square_nc_f32, - xnn_operator_type_square_root_nc_f16, - xnn_operator_type_square_root_nc_f32, - xnn_operator_type_squared_difference, - xnn_operator_type_subtract, - xnn_operator_type_tanh_nc_f16, - xnn_operator_type_tanh_nc_f32, - xnn_operator_type_tanh_nc_qs8, - xnn_operator_type_tanh_nc_qu8, - xnn_operator_type_transpose_nd_x8, - xnn_operator_type_transpose_nd_x16, - xnn_operator_type_transpose_nd_x32, - xnn_operator_type_transpose_nd_x64, - xnn_operator_type_truncation_nc_f16, - xnn_operator_type_truncation_nc_f32, - xnn_operator_type_unpooling_nhwc_x32, +#define XNN_ENUM_ITEM_0(enum_name, enum_string) enum_name = 0, +#define XNN_ENUM_ITEM(enum_name, enum_string) enum_name, + #include "xnnpack/operator-type-defs.h" +#undef XNN_ENUM_ITEM_0 +#undef XNN_ENUM_ITEM }; XNN_INTERNAL const char* xnn_operator_type_to_string(enum xnn_operator_type operator_type); diff --git a/tools/generate-enum.py b/tools/generate-enum.py deleted file mode 100755 index 53bef16cb5e..00000000000 --- a/tools/generate-enum.py +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env python -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import codecs -import math -import os -import re -import sys -import yaml - -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -import xngen -import xnncommon - -parser = argparse.ArgumentParser( - description='Generates enum header and convertion-to-string code.') -parser.add_argument( - '-s', - '--spec', - metavar='FILE', - required=True, - help='Specification (YAML) file') -parser.add_argument( - '--output_src', - metavar='FILE', - required=True, - help='Output C source file') -parser.add_argument( - '--output_hdr', - metavar='FILE', - required=True, - help='Output C/C++ header file') -parser.add_argument( - '-e', - '--enum', - metavar='NAME', - required=True, - help='Name of the enum variable') -parser.add_argument( - '-d', - '--debug', - action='store_true', - default=False, - help='Define enum-to-string fuction only when debug logging is enabled') -parser.set_defaults(defines=list()) - - -def generate_source(enum_name, spec_path, output_path, header_path, debug_only): - with codecs.open(spec_path, 'r', encoding='utf-8') as spec_file: - spec_yaml = yaml.safe_load(spec_file) - if not isinstance(spec_yaml, list): - raise ValueError('expected a list of enumeration values in the spec') - - output = f"""\ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: {spec_path} -// Generator: {sys.argv[0]} - -#include -#include - -#include "{xnncommon.xnnpack_src()}/{header_path}"\n\n""" - - max_offset = sum(len(entry['string']) + 1 for entry in spec_yaml[:-1]) - if max_offset < 256: - offset_type = 'uint8_t' - elif max_offset < 65536: - offset_type = 'uint16_t' - else: - offset_type = 'uint32_t' - - offset_declaration = f'static const {offset_type} offset[{len(spec_yaml)}] = {{\n '; - string_declaration = 'static const char data[] =\n' - pos = 0 - for i, spec_entry in enumerate(spec_yaml): - enum_item_name = spec_entry['name'] - assert enum_item_name.startswith(enum_name + "_") - enum_item_string = spec_entry['string'] - - if i + 1 != len(spec_yaml): - string_declaration += ' "' + enum_item_string + '\\0"\n' - offset_declaration += ' ' + str(pos) + ',' - else: - string_declaration += ' "' + enum_item_string + '";\n' - offset_declaration += ' ' + str(pos) + '\n};' - - # Wrap offset array on 120 columns - last_offset_line = offset_declaration[offset_declaration.rfind('\n')+1:] - if len(last_offset_line) > 120: - last_offset_start = offset_declaration.rfind(',', 0, -1) + 1 - offset_declaration = offset_declaration[:last_offset_start] + '\n ' + offset_declaration[last_offset_start:] - - pos += len(enum_item_string) + 1 - - if debug_only: - output += '#if XNN_LOG_LEVEL > 0\n' - output += offset_declaration - output += '\n\n' - output += string_declaration - - arg_name = enum_name[len("xnn_"):] - output += f""" -const char* {enum_name}_to_string(enum {enum_name} {arg_name}) {{ - assert({arg_name} >= {spec_yaml[0]['name']}); - assert({arg_name} <= {spec_yaml[-1]['name']}); - return &data[offset[{arg_name}]]; -}}\n""" - if debug_only: - output += '#endif // XNN_LOG_LEVEL > 0\n' - - xnncommon.overwrite_if_changed(output_path, output) - -def generate_header(enum_name, spec_path, output_path, debug_only): - with codecs.open(spec_path, 'r', encoding='utf-8') as spec_file: - spec_yaml = yaml.safe_load(spec_file) - if 
not isinstance(spec_yaml, list): - raise ValueError('expected a list of enumeration values in the spec') - - output = f"""\ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Specification: {spec_path} -// Generator: {sys.argv[0]} - -#pragma once - -#include "xnnpack/common.h" - - -#ifdef __cplusplus -extern "C" {{ -#endif - -enum {enum_name} {{\n""" - - enum_item_name = spec_yaml[0]['name'] - assert enum_item_name.startswith(enum_name + "_") - output += ' ' + enum_item_name + ' = 0,\n' - for spec_entry in spec_yaml[1:]: - enum_item_name = spec_entry['name'] - assert enum_item_name.startswith(enum_name + "_") - output += ' ' + enum_item_name + ',\n' - - arg_name = enum_name[len("xnn_"):] - output += '};\n\n' - - if debug_only: - output += f"""\ -#if XNN_LOG_LEVEL <= 0 - XNN_INLINE static const char* {enum_name}_to_string(enum {enum_name} type) {{ - return ""; - }} -#else - XNN_INTERNAL const char* {enum_name}_to_string(enum {enum_name} type); -#endif -""" - else: - output += f"""\ -XNN_INTERNAL const char* {enum_name}_to_string(enum {enum_name} {arg_name}); -""" - output += """ -#ifdef __cplusplus -} // extern "C" -#endif -""" - - xnncommon.overwrite_if_changed(output_path, output) - -def main(args): - options = parser.parse_args(args) - generate_header(options.enum, options.spec, options.output_hdr, options.debug) - - assert options.enum.startswith('xnn_') - header_path = 'xnnpack/' + options.enum[len('xnn_'):].replace('_', '-') + '.h' - generate_source(options.enum, options.spec, options.output_src, header_path, - options.debug) - -if __name__ == '__main__': - main(sys.argv[1:]) From 1c6e016294e01624bd1200a1179b18f423310212 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 24 Sep 2024 12:43:53 -0700 Subject: [PATCH 46/50] F32-RADDSTOREEXPMINUSMAX RR2 for AVX512 - Direct port of original SSE RR2 
PiperOrigin-RevId: 678357741 --- bench/f32-raddstoreexpminusmax.cc | 29 ++ cmake/gen/avx512f_microkernels.cmake | 4 + gen/avx512f_microkernels.bzl | 4 + scripts/generate-f32-raddstoreexpminusmax.sh | 5 + src/configs/raddstoreexpminusmax-config.c | 5 +- .../avx512f-rr2-p5.c.in | 241 ++++++++++++++++ ...-raddstoreexpminusmax-avx512f-rr2-p5-u16.c | 215 ++++++++++++++ ...storeexpminusmax-avx512f-rr2-p5-u32-acc2.c | 234 +++++++++++++++ ...storeexpminusmax-avx512f-rr2-p5-u64-acc2.c | 266 +++++++++++++++++ ...storeexpminusmax-avx512f-rr2-p5-u64-acc4.c | 270 ++++++++++++++++++ src/xnnpack/raddstoreexpminusmax.h | 5 + test/f32-raddstoreexpminusmax.cc | 148 ++++++++++ test/f32-raddstoreexpminusmax.yaml | 5 + 13 files changed, 1430 insertions(+), 1 deletion(-) create mode 100644 src/f32-raddstoreexpminusmax/avx512f-rr2-p5.c.in create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u16.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u32-acc2.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u64-acc2.c create mode 100644 src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u64-acc4.c diff --git a/bench/f32-raddstoreexpminusmax.cc b/bench/f32-raddstoreexpminusmax.cc index 91739e67c94..84de3c6ff4e 100644 --- a/bench/f32-raddstoreexpminusmax.cc +++ b/bench/f32-raddstoreexpminusmax.cc @@ -248,6 +248,35 @@ static void f32_raddstoreexpminusmax( ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr2_p5_u16, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u16, + nullptr, + benchmark::utils::CheckAVX512F) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr2_p5_u32_acc2, + xnn_f32_rmax_ukernel__avx_u32_acc4, + 
xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u32_acc2, + nullptr, + benchmark::utils::CheckAVX512F) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr2_p5_u64_acc2, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u64_acc2, + nullptr, + benchmark::utils::CheckAVX512F) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr2_p5_u64_acc4, + xnn_f32_rmax_ukernel__avx_u32_acc4, + xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u64_acc4, + nullptr, + benchmark::utils::CheckAVX512F) + ->Apply(benchmark::utils::UnaryElementwiseParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_u8, xnn_f32_rmax_ukernel__avx_u32_acc4, xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u8, diff --git a/cmake/gen/avx512f_microkernels.cmake b/cmake/gen/avx512f_microkernels.cmake index 1eafbfcff4b..c871a1bce06 100644 --- a/cmake/gen/avx512f_microkernels.cmake +++ b/cmake/gen/avx512f_microkernels.cmake @@ -20,6 +20,7 @@ SET(PROD_AVX512F_MICROKERNEL_SRCS src/f32-igemm/gen/f32-igemm-1x16-minmax-avx512f-broadcast.c src/f32-igemm/gen/f32-igemm-7x16-minmax-avx512f-broadcast.c src/f32-prelu/gen/f32-prelu-avx512f-2x16.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u64-acc2.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c64.c src/f32-rminmax/gen/f32-rmax-avx512f-u64-acc4.c src/f32-rminmax/gen/f32-rminmax-avx512f-u64-acc4.c @@ -132,6 +133,9 @@ SET(NON_PROD_AVX512F_MICROKERNEL_SRCS src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u32-acc2.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64-acc2.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64-acc4.c + 
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u16.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u32-acc2.c + src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u64-acc4.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c16.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c32.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c128.c diff --git a/gen/avx512f_microkernels.bzl b/gen/avx512f_microkernels.bzl index 13402d30b47..38bb3ba21c5 100644 --- a/gen/avx512f_microkernels.bzl +++ b/gen/avx512f_microkernels.bzl @@ -16,6 +16,7 @@ PROD_AVX512F_MICROKERNEL_SRCS = [ "src/f32-igemm/gen/f32-igemm-1x16-minmax-avx512f-broadcast.c", "src/f32-igemm/gen/f32-igemm-7x16-minmax-avx512f-broadcast.c", "src/f32-prelu/gen/f32-prelu-avx512f-2x16.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u64-acc2.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c64.c", "src/f32-rminmax/gen/f32-rmax-avx512f-u64-acc4.c", "src/f32-rminmax/gen/f32-rminmax-avx512f-u64-acc4.c", @@ -129,6 +130,9 @@ NON_PROD_AVX512F_MICROKERNEL_SRCS = [ "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u32-acc2.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64-acc2.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64-acc4.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u16.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u32-acc2.c", + "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u64-acc4.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c16.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c32.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c128.c", diff --git a/scripts/generate-f32-raddstoreexpminusmax.sh b/scripts/generate-f32-raddstoreexpminusmax.sh index f4d7bfe7cfb..55fb89c69e8 100755 --- 
a/scripts/generate-f32-raddstoreexpminusmax.sh +++ b/scripts/generate-f32-raddstoreexpminusmax.sh @@ -52,6 +52,11 @@ tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TIL tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=64 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64-acc2.c & tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr1-p5-scalef.c.in -D BATCH_TILE=64 -D ACCUMULATORS=4 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-u64-acc4.c & +tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr2-p5.c.in -D BATCH_TILE=16 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u16.c & +tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr2-p5.c.in -D BATCH_TILE=32 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u32-acc2.c & +tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr2-p5.c.in -D BATCH_TILE=64 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u64-acc2.c & +tools/xngen src/f32-raddstoreexpminusmax/avx512f-rr2-p5.c.in -D BATCH_TILE=64 -D ACCUMULATORS=4 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u64-acc4.c & + ################################## WAsm SIMD ################################## tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=4 -D ACCUMULATORS=1 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u4.c & tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-rr2-p5.c.in -D BATCH_TILE=8 -D ACCUMULATORS=2 -D FMA=0 -o src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-u8-acc2.c & diff --git a/src/configs/raddstoreexpminusmax-config.c b/src/configs/raddstoreexpminusmax-config.c index 163bb97190d..4c2f82005f0 100644 --- a/src/configs/raddstoreexpminusmax-config.c +++ 
b/src/configs/raddstoreexpminusmax-config.c @@ -64,7 +64,10 @@ static void init_f32_raddstoreexpminusmax_config(void) { #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); - if (hardware_config->use_x86_avx2) { + if (hardware_config->use_x86_avx512f) { + f32_raddstoreexpminusmax_config.ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u64_acc2; + f32_raddstoreexpminusmax_config.element_tile = 64; + } else if (hardware_config->use_x86_avx2) { f32_raddstoreexpminusmax_config.ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr2_p5_u32_acc2; f32_raddstoreexpminusmax_config.element_tile = 32; } else { diff --git a/src/f32-raddstoreexpminusmax/avx512f-rr2-p5.c.in b/src/f32-raddstoreexpminusmax/avx512f-rr2-p5.c.in new file mode 100644 index 00000000000..819df2c7902 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/avx512f-rr2-p5.c.in @@ -0,0 +1,241 @@ +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +$assert BATCH_TILE % 16 == 0 +$assert BATCH_TILE >= 16 +$SIMD_TILE = BATCH_TILE // 16 +#include + +#include + +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u${BATCH_TILE}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f); + const __m512 vmagic_bias = _mm512_set1_ps(0x1.8000FEp23f); + const __m512 vminus_ln2_hi = _mm512_set1_ps(-0x1.62E400p-1f); + const __m512 vminus_ln2_lo = _mm512_set1_ps(-0x1.7F7D1Cp-20f); + const __m512 vc5 = _mm512_set1_ps(0x1.0F9F9Cp-7f); + const __m512 vc4 = _mm512_set1_ps(0x1.573A1Ap-5f); + const __m512 vc3 = _mm512_set1_ps(0x1.555A80p-3f); + const __m512 vc2 = _mm512_set1_ps(0x1.FFFDC6p-2f); + const __m512 vc1 = _mm512_set1_ps(0x1.FFFFF6p-1f); + const __m512 vdenorm_cutoff = _mm512_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m512 vi_max = _mm512_set1_ps(*max); + const __m512 vzero = _mm512_setzero_ps(); + + $for K in range(ACCUMULATORS): + __m512 vacc${K} = _mm512_setzero_ps(); + for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { + // Load ${BATCH_TILE} (${SIMD_TILE}x16) inputs at a time. 
+ const __m512 vi0 = _mm512_loadu_ps(input); + $for N in range(1, SIMD_TILE): + const __m512 vi${N} = _mm512_loadu_ps(input + ${N * 16}); + input += ${BATCH_TILE}; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + $for N in range(SIMD_TILE): + const __m512 vx${N} = _mm512_sub_ps(vi${N}, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + $for N in range(SIMD_TILE): + __m512 vn${N} = _mm512_add_ps(_mm512_mul_ps(vx${N}, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + $for N in range(SIMD_TILE): + const __m512 vs${N} = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn${N}), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + $for N in range(SIMD_TILE): + vn${N} = _mm512_sub_ps(vn${N}, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + $for N in range(SIMD_TILE): + __m512 vt${N} = _mm512_add_ps(_mm512_mul_ps(vn${N}, vminus_ln2_hi), vx${N}); + + $for N in range(SIMD_TILE): + vt${N} = _mm512_add_ps(_mm512_mul_ps(vn${N}, vminus_ln2_lo), vt${N}); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ $for N in range(SIMD_TILE): + __m512 vp${N} = _mm512_add_ps(_mm512_mul_ps(vc5, vt${N}), vc4); + + $for N in range(SIMD_TILE): + vp${N} = _mm512_add_ps(_mm512_mul_ps(vp${N}, vt${N}), vc3); + + $for N in range(SIMD_TILE): + vp${N} = _mm512_add_ps(_mm512_mul_ps(vp${N}, vt${N}), vc2); + + $for N in range(SIMD_TILE): + vp${N} = _mm512_add_ps(_mm512_mul_ps(vp${N}, vt${N}), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + $for N in range(SIMD_TILE): + vt${N} = _mm512_mul_ps(vt${N}, vs${N}); + + $for N in range(SIMD_TILE): + __m512 vf${N} = _mm512_add_ps(_mm512_mul_ps(vt${N}, vp${N}), vs${N}); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + $for N in range(SIMD_TILE): + vf${N} = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx${N}, vdenorm_cutoff, _CMP_LT_OS), vf${N}, vzero); + + // Store ${BATCH_TILE} (${SIMD_TILE}x16) outputs at a time. + _mm512_storeu_ps(output, vf0); + $for N in range(1, SIMD_TILE): + _mm512_storeu_ps(output + ${N * 16}, vf${N}); + + output += ${BATCH_TILE}; + + // Accumulate computed exponents. + $for N in range(SIMD_TILE): + vacc${N % ACCUMULATORS} = _mm512_add_ps(vacc${N % ACCUMULATORS}, vf${N}); + } + $if ACCUMULATORS > 1: + // Add up all accumulators to vacc0 + $ACC_SLICE = 1 + $while ACC_SLICE < ACCUMULATORS: + $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): + $if A + ACC_SLICE < ACCUMULATORS: + vacc${A} = _mm512_add_ps(vacc${A}, vacc${A + ACC_SLICE}); + $ACC_SLICE *= 2 + + __m512 vacc = vacc0; + for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { + // Load 16 inputs at a time. + const __m512 vi = _mm512_loadu_ps(input); + input += 16; + + // Subtract maximum input x := i - i_max. This implies x <= 0. 
+ const __m512 vx = _mm512_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m512 vn = _mm512_add_ps(_mm512_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m512 vs = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm512_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m512 vt = _mm512_add_ps(_mm512_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm512_add_ps(_mm512_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m512 vp = _mm512_add_ps(_mm512_mul_ps(vc5, vt), vc4); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc3); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc2); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm512_mul_ps(vt, vs); + __m512 vf = _mm512_add_ps(_mm512_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx, vdenorm_cutoff, _CMP_LT_OS), vf, vzero); + + // Store 16 outputs at a time. + _mm512_storeu_ps(output, vf); + output += 16; + + // Accumulate computed exponents. + vacc = _mm512_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 15 * sizeof(float)); + + // Prepare mask for valid 32-bit batch (depends on batch). 
+ batch >>= XNN_LOG2_SIZEOF_FLOAT; + const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + // Load 16 inputs at a time. + const __m512 vi = _mm512_maskz_loadu_ps(vmask, input); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m512 vx = _mm512_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m512 vn = _mm512_add_ps(_mm512_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m512 vs = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm512_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m512 vt = _mm512_add_ps(_mm512_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm512_add_ps(_mm512_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m512 vp = _mm512_add_ps(_mm512_mul_ps(vc5, vt), vc4); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc3); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc2); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm512_mul_ps(vt, vs); + __m512 vf = _mm512_add_ps(_mm512_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
+ vf = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx, vdenorm_cutoff, _CMP_LT_OS), vf, vzero); + + _mm512_mask_storeu_ps(output, vmask, vf); + + vacc = _mm512_mask_add_ps(vacc, vmask, vacc, vf); + } + + *sum = _mm512_reduce_add_ps(vacc); +} + + diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u16.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u16.c new file mode 100644 index 00000000000..60c4266efd3 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u16.c @@ -0,0 +1,215 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/avx512f-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u16( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f); + const __m512 vmagic_bias = _mm512_set1_ps(0x1.8000FEp23f); + const __m512 vminus_ln2_hi = _mm512_set1_ps(-0x1.62E400p-1f); + const __m512 vminus_ln2_lo = _mm512_set1_ps(-0x1.7F7D1Cp-20f); + const __m512 vc5 = _mm512_set1_ps(0x1.0F9F9Cp-7f); + const __m512 vc4 = _mm512_set1_ps(0x1.573A1Ap-5f); + const __m512 vc3 = _mm512_set1_ps(0x1.555A80p-3f); + const __m512 vc2 = _mm512_set1_ps(0x1.FFFDC6p-2f); + const __m512 vc1 = _mm512_set1_ps(0x1.FFFFF6p-1f); + const __m512 vdenorm_cutoff = _mm512_set1_ps(-0x1.5D589Ep6f); + + 
XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m512 vi_max = _mm512_set1_ps(*max); + const __m512 vzero = _mm512_setzero_ps(); + + __m512 vacc0 = _mm512_setzero_ps(); + for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { + // Load 16 (1x16) inputs at a time. + const __m512 vi0 = _mm512_loadu_ps(input); + input += 16; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m512 vx0 = _mm512_sub_ps(vi0, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m512 vn0 = _mm512_add_ps(_mm512_mul_ps(vx0, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m512 vs0 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn0), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm512_sub_ps(vn0, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m512 vt0 = _mm512_add_ps(_mm512_mul_ps(vn0, vminus_ln2_hi), vx0); + + vt0 = _mm512_add_ps(_mm512_mul_ps(vn0, vminus_ln2_lo), vt0); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m512 vp0 = _mm512_add_ps(_mm512_mul_ps(vc5, vt0), vc4); + + vp0 = _mm512_add_ps(_mm512_mul_ps(vp0, vt0), vc3); + + vp0 = _mm512_add_ps(_mm512_mul_ps(vp0, vt0), vc2); + + vp0 = _mm512_add_ps(_mm512_mul_ps(vp0, vt0), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm512_mul_ps(vt0, vs0); + + __m512 vf0 = _mm512_add_ps(_mm512_mul_ps(vt0, vp0), vs0); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0, vzero); + + // Store 16 (1x16) outputs at a time. + _mm512_storeu_ps(output, vf0); + + output += 16; + + // Accumulate computed exponents. + vacc0 = _mm512_add_ps(vacc0, vf0); + } + + __m512 vacc = vacc0; + for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { + // Load 16 inputs at a time. + const __m512 vi = _mm512_loadu_ps(input); + input += 16; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m512 vx = _mm512_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m512 vn = _mm512_add_ps(_mm512_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m512 vs = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm512_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m512 vt = _mm512_add_ps(_mm512_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm512_add_ps(_mm512_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m512 vp = _mm512_add_ps(_mm512_mul_ps(vc5, vt), vc4); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc3); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc2); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm512_mul_ps(vt, vs); + __m512 vf = _mm512_add_ps(_mm512_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx, vdenorm_cutoff, _CMP_LT_OS), vf, vzero); + + // Store 16 outputs at a time. + _mm512_storeu_ps(output, vf); + output += 16; + + // Accumulate computed exponents. + vacc = _mm512_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 15 * sizeof(float)); + + // Prepare mask for valid 32-bit batch (depends on batch). + batch >>= XNN_LOG2_SIZEOF_FLOAT; + const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + // Load 16 inputs at a time. + const __m512 vi = _mm512_maskz_loadu_ps(vmask, input); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m512 vx = _mm512_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m512 vn = _mm512_add_ps(_mm512_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
+ const __m512 vs = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm512_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m512 vt = _mm512_add_ps(_mm512_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm512_add_ps(_mm512_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m512 vp = _mm512_add_ps(_mm512_mul_ps(vc5, vt), vc4); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc3); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc2); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm512_mul_ps(vt, vs); + __m512 vf = _mm512_add_ps(_mm512_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx, vdenorm_cutoff, _CMP_LT_OS), vf, vzero); + + _mm512_mask_storeu_ps(output, vmask, vf); + + vacc = _mm512_mask_add_ps(vacc, vmask, vacc, vf); + } + + *sum = _mm512_reduce_add_ps(vacc); +} + + diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u32-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u32-acc2.c new file mode 100644 index 00000000000..0c7723bb363 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u32-acc2.c @@ -0,0 +1,234 @@ +// Auto-generated file. Do not edit! 
+// Template: src/f32-raddstoreexpminusmax/avx512f-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u32_acc2( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f); + const __m512 vmagic_bias = _mm512_set1_ps(0x1.8000FEp23f); + const __m512 vminus_ln2_hi = _mm512_set1_ps(-0x1.62E400p-1f); + const __m512 vminus_ln2_lo = _mm512_set1_ps(-0x1.7F7D1Cp-20f); + const __m512 vc5 = _mm512_set1_ps(0x1.0F9F9Cp-7f); + const __m512 vc4 = _mm512_set1_ps(0x1.573A1Ap-5f); + const __m512 vc3 = _mm512_set1_ps(0x1.555A80p-3f); + const __m512 vc2 = _mm512_set1_ps(0x1.FFFDC6p-2f); + const __m512 vc1 = _mm512_set1_ps(0x1.FFFFF6p-1f); + const __m512 vdenorm_cutoff = _mm512_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m512 vi_max = _mm512_set1_ps(*max); + const __m512 vzero = _mm512_setzero_ps(); + + __m512 vacc0 = _mm512_setzero_ps(); + __m512 vacc1 = _mm512_setzero_ps(); + for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { + // Load 32 (2x16) inputs at a time. 
+ const __m512 vi0 = _mm512_loadu_ps(input); + const __m512 vi1 = _mm512_loadu_ps(input + 16); + input += 32; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m512 vx0 = _mm512_sub_ps(vi0, vi_max); + const __m512 vx1 = _mm512_sub_ps(vi1, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m512 vn0 = _mm512_add_ps(_mm512_mul_ps(vx0, vlog2e), vmagic_bias); + __m512 vn1 = _mm512_add_ps(_mm512_mul_ps(vx1, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m512 vs0 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn0), 23)); + const __m512 vs1 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn1), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm512_sub_ps(vn0, vmagic_bias); + vn1 = _mm512_sub_ps(vn1, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m512 vt0 = _mm512_add_ps(_mm512_mul_ps(vn0, vminus_ln2_hi), vx0); + __m512 vt1 = _mm512_add_ps(_mm512_mul_ps(vn1, vminus_ln2_hi), vx1); + + vt0 = _mm512_add_ps(_mm512_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm512_add_ps(_mm512_mul_ps(vn1, vminus_ln2_lo), vt1); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m512 vp0 = _mm512_add_ps(_mm512_mul_ps(vc5, vt0), vc4); + __m512 vp1 = _mm512_add_ps(_mm512_mul_ps(vc5, vt1), vc4); + + vp0 = _mm512_add_ps(_mm512_mul_ps(vp0, vt0), vc3); + vp1 = _mm512_add_ps(_mm512_mul_ps(vp1, vt1), vc3); + + vp0 = _mm512_add_ps(_mm512_mul_ps(vp0, vt0), vc2); + vp1 = _mm512_add_ps(_mm512_mul_ps(vp1, vt1), vc2); + + vp0 = _mm512_add_ps(_mm512_mul_ps(vp0, vt0), vc1); + vp1 = _mm512_add_ps(_mm512_mul_ps(vp1, vt1), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm512_mul_ps(vt0, vs0); + vt1 = _mm512_mul_ps(vt1, vs1); + + __m512 vf0 = _mm512_add_ps(_mm512_mul_ps(vt0, vp0), vs0); + __m512 vf1 = _mm512_add_ps(_mm512_mul_ps(vt1, vp1), vs1); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0, vzero); + vf1 = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1, vzero); + + // Store 32 (2x16) outputs at a time. + _mm512_storeu_ps(output, vf0); + _mm512_storeu_ps(output + 16, vf1); + + output += 32; + + // Accumulate computed exponents. + vacc0 = _mm512_add_ps(vacc0, vf0); + vacc1 = _mm512_add_ps(vacc1, vf1); + } + // Add up all accumulators to vacc0 + vacc0 = _mm512_add_ps(vacc0, vacc1); + + __m512 vacc = vacc0; + for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { + // Load 16 inputs at a time. + const __m512 vi = _mm512_loadu_ps(input); + input += 16; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m512 vx = _mm512_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). 
+ __m512 vn = _mm512_add_ps(_mm512_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m512 vs = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm512_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m512 vt = _mm512_add_ps(_mm512_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm512_add_ps(_mm512_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m512 vp = _mm512_add_ps(_mm512_mul_ps(vc5, vt), vc4); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc3); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc2); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm512_mul_ps(vt, vs); + __m512 vf = _mm512_add_ps(_mm512_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx, vdenorm_cutoff, _CMP_LT_OS), vf, vzero); + + // Store 16 outputs at a time. + _mm512_storeu_ps(output, vf); + output += 16; + + // Accumulate computed exponents. + vacc = _mm512_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 15 * sizeof(float)); + + // Prepare mask for valid 32-bit batch (depends on batch). 
+ batch >>= XNN_LOG2_SIZEOF_FLOAT; + const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + // Load 16 inputs at a time. + const __m512 vi = _mm512_maskz_loadu_ps(vmask, input); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m512 vx = _mm512_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m512 vn = _mm512_add_ps(_mm512_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m512 vs = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm512_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m512 vt = _mm512_add_ps(_mm512_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm512_add_ps(_mm512_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m512 vp = _mm512_add_ps(_mm512_mul_ps(vc5, vt), vc4); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc3); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc2); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm512_mul_ps(vt, vs); + __m512 vf = _mm512_add_ps(_mm512_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
+ vf = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx, vdenorm_cutoff, _CMP_LT_OS), vf, vzero); + + _mm512_mask_storeu_ps(output, vmask, vf); + + vacc = _mm512_mask_add_ps(vacc, vmask, vacc, vf); + } + + *sum = _mm512_reduce_add_ps(vacc); +} + + diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u64-acc2.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u64-acc2.c new file mode 100644 index 00000000000..a22c57e3547 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u64-acc2.c @@ -0,0 +1,266 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-raddstoreexpminusmax/avx512f-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u64_acc2( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f); + const __m512 vmagic_bias = _mm512_set1_ps(0x1.8000FEp23f); + const __m512 vminus_ln2_hi = _mm512_set1_ps(-0x1.62E400p-1f); + const __m512 vminus_ln2_lo = _mm512_set1_ps(-0x1.7F7D1Cp-20f); + const __m512 vc5 = _mm512_set1_ps(0x1.0F9F9Cp-7f); + const __m512 vc4 = _mm512_set1_ps(0x1.573A1Ap-5f); + const __m512 vc3 = _mm512_set1_ps(0x1.555A80p-3f); + const __m512 vc2 = _mm512_set1_ps(0x1.FFFDC6p-2f); + const __m512 vc1 = _mm512_set1_ps(0x1.FFFFF6p-1f); + const __m512 vdenorm_cutoff = _mm512_set1_ps(-0x1.5D589Ep6f); + + 
XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m512 vi_max = _mm512_set1_ps(*max); + const __m512 vzero = _mm512_setzero_ps(); + + __m512 vacc0 = _mm512_setzero_ps(); + __m512 vacc1 = _mm512_setzero_ps(); + for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { + // Load 64 (4x16) inputs at a time. + const __m512 vi0 = _mm512_loadu_ps(input); + const __m512 vi1 = _mm512_loadu_ps(input + 16); + const __m512 vi2 = _mm512_loadu_ps(input + 32); + const __m512 vi3 = _mm512_loadu_ps(input + 48); + input += 64; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m512 vx0 = _mm512_sub_ps(vi0, vi_max); + const __m512 vx1 = _mm512_sub_ps(vi1, vi_max); + const __m512 vx2 = _mm512_sub_ps(vi2, vi_max); + const __m512 vx3 = _mm512_sub_ps(vi3, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m512 vn0 = _mm512_add_ps(_mm512_mul_ps(vx0, vlog2e), vmagic_bias); + __m512 vn1 = _mm512_add_ps(_mm512_mul_ps(vx1, vlog2e), vmagic_bias); + __m512 vn2 = _mm512_add_ps(_mm512_mul_ps(vx2, vlog2e), vmagic_bias); + __m512 vn3 = _mm512_add_ps(_mm512_mul_ps(vx3, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
+ const __m512 vs0 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn0), 23)); + const __m512 vs1 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn1), 23)); + const __m512 vs2 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn2), 23)); + const __m512 vs3 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn3), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm512_sub_ps(vn0, vmagic_bias); + vn1 = _mm512_sub_ps(vn1, vmagic_bias); + vn2 = _mm512_sub_ps(vn2, vmagic_bias); + vn3 = _mm512_sub_ps(vn3, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m512 vt0 = _mm512_add_ps(_mm512_mul_ps(vn0, vminus_ln2_hi), vx0); + __m512 vt1 = _mm512_add_ps(_mm512_mul_ps(vn1, vminus_ln2_hi), vx1); + __m512 vt2 = _mm512_add_ps(_mm512_mul_ps(vn2, vminus_ln2_hi), vx2); + __m512 vt3 = _mm512_add_ps(_mm512_mul_ps(vn3, vminus_ln2_hi), vx3); + + vt0 = _mm512_add_ps(_mm512_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm512_add_ps(_mm512_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm512_add_ps(_mm512_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm512_add_ps(_mm512_mul_ps(vn3, vminus_ln2_lo), vt3); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 
+ __m512 vp0 = _mm512_add_ps(_mm512_mul_ps(vc5, vt0), vc4); + __m512 vp1 = _mm512_add_ps(_mm512_mul_ps(vc5, vt1), vc4); + __m512 vp2 = _mm512_add_ps(_mm512_mul_ps(vc5, vt2), vc4); + __m512 vp3 = _mm512_add_ps(_mm512_mul_ps(vc5, vt3), vc4); + + vp0 = _mm512_add_ps(_mm512_mul_ps(vp0, vt0), vc3); + vp1 = _mm512_add_ps(_mm512_mul_ps(vp1, vt1), vc3); + vp2 = _mm512_add_ps(_mm512_mul_ps(vp2, vt2), vc3); + vp3 = _mm512_add_ps(_mm512_mul_ps(vp3, vt3), vc3); + + vp0 = _mm512_add_ps(_mm512_mul_ps(vp0, vt0), vc2); + vp1 = _mm512_add_ps(_mm512_mul_ps(vp1, vt1), vc2); + vp2 = _mm512_add_ps(_mm512_mul_ps(vp2, vt2), vc2); + vp3 = _mm512_add_ps(_mm512_mul_ps(vp3, vt3), vc2); + + vp0 = _mm512_add_ps(_mm512_mul_ps(vp0, vt0), vc1); + vp1 = _mm512_add_ps(_mm512_mul_ps(vp1, vt1), vc1); + vp2 = _mm512_add_ps(_mm512_mul_ps(vp2, vt2), vc1); + vp3 = _mm512_add_ps(_mm512_mul_ps(vp3, vt3), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm512_mul_ps(vt0, vs0); + vt1 = _mm512_mul_ps(vt1, vs1); + vt2 = _mm512_mul_ps(vt2, vs2); + vt3 = _mm512_mul_ps(vt3, vs3); + + __m512 vf0 = _mm512_add_ps(_mm512_mul_ps(vt0, vp0), vs0); + __m512 vf1 = _mm512_add_ps(_mm512_mul_ps(vt1, vp1), vs1); + __m512 vf2 = _mm512_add_ps(_mm512_mul_ps(vt2, vp2), vs2); + __m512 vf3 = _mm512_add_ps(_mm512_mul_ps(vt3, vp3), vs3); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
+ vf0 = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0, vzero); + vf1 = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1, vzero); + vf2 = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2, vzero); + vf3 = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3, vzero); + + // Store 64 (4x16) outputs at a time. + _mm512_storeu_ps(output, vf0); + _mm512_storeu_ps(output + 16, vf1); + _mm512_storeu_ps(output + 32, vf2); + _mm512_storeu_ps(output + 48, vf3); + + output += 64; + + // Accumulate computed exponents. + vacc0 = _mm512_add_ps(vacc0, vf0); + vacc1 = _mm512_add_ps(vacc1, vf1); + vacc0 = _mm512_add_ps(vacc0, vf2); + vacc1 = _mm512_add_ps(vacc1, vf3); + } + // Add up all accumulators to vacc0 + vacc0 = _mm512_add_ps(vacc0, vacc1); + + __m512 vacc = vacc0; + for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { + // Load 16 inputs at a time. + const __m512 vi = _mm512_loadu_ps(input); + input += 16; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m512 vx = _mm512_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m512 vn = _mm512_add_ps(_mm512_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m512 vs = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm512_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m512 vt = _mm512_add_ps(_mm512_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm512_add_ps(_mm512_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m512 vp = _mm512_add_ps(_mm512_mul_ps(vc5, vt), vc4); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc3); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc2); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm512_mul_ps(vt, vs); + __m512 vf = _mm512_add_ps(_mm512_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx, vdenorm_cutoff, _CMP_LT_OS), vf, vzero); + + // Store 16 outputs at a time. + _mm512_storeu_ps(output, vf); + output += 16; + + // Accumulate computed exponents. + vacc = _mm512_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 15 * sizeof(float)); + + // Prepare mask for valid 32-bit batch (depends on batch). + batch >>= XNN_LOG2_SIZEOF_FLOAT; + const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + // Load 16 inputs at a time. + const __m512 vi = _mm512_maskz_loadu_ps(vmask, input); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m512 vx = _mm512_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m512 vn = _mm512_add_ps(_mm512_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. 
+ const __m512 vs = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm512_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m512 vt = _mm512_add_ps(_mm512_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm512_add_ps(_mm512_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m512 vp = _mm512_add_ps(_mm512_mul_ps(vc5, vt), vc4); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc3); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc2); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm512_mul_ps(vt, vs); + __m512 vf = _mm512_add_ps(_mm512_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx, vdenorm_cutoff, _CMP_LT_OS), vf, vzero); + + _mm512_mask_storeu_ps(output, vmask, vf); + + vacc = _mm512_mask_add_ps(vacc, vmask, vacc, vf); + } + + *sum = _mm512_reduce_add_ps(vacc); +} + + diff --git a/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u64-acc4.c b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u64-acc4.c new file mode 100644 index 00000000000..16c494f57e7 --- /dev/null +++ b/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u64-acc4.c @@ -0,0 +1,270 @@ +// Auto-generated file. Do not edit! 
+// Template: src/f32-raddstoreexpminusmax/avx512f-rr2-p5.c.in +// Generator: tools/xngen +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/raddstoreexpminusmax.h" + + +void xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u64_acc4( + size_t batch, + const float* input, + const float* max, + float* output, + float* sum, + const struct xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f); + const __m512 vmagic_bias = _mm512_set1_ps(0x1.8000FEp23f); + const __m512 vminus_ln2_hi = _mm512_set1_ps(-0x1.62E400p-1f); + const __m512 vminus_ln2_lo = _mm512_set1_ps(-0x1.7F7D1Cp-20f); + const __m512 vc5 = _mm512_set1_ps(0x1.0F9F9Cp-7f); + const __m512 vc4 = _mm512_set1_ps(0x1.573A1Ap-5f); + const __m512 vc3 = _mm512_set1_ps(0x1.555A80p-3f); + const __m512 vc2 = _mm512_set1_ps(0x1.FFFDC6p-2f); + const __m512 vc1 = _mm512_set1_ps(0x1.FFFFF6p-1f); + const __m512 vdenorm_cutoff = _mm512_set1_ps(-0x1.5D589Ep6f); + + XNN_FORCE_REALIZATION(vlog2e); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vminus_ln2_hi); + XNN_FORCE_REALIZATION(vminus_ln2_lo); + XNN_FORCE_REALIZATION(vc5); + XNN_FORCE_REALIZATION(vc4); + XNN_FORCE_REALIZATION(vc3); + XNN_FORCE_REALIZATION(vc2); + XNN_FORCE_REALIZATION(vc1); + XNN_FORCE_REALIZATION(vdenorm_cutoff); + + const __m512 vi_max = _mm512_set1_ps(*max); + const __m512 vzero = _mm512_setzero_ps(); + + __m512 vacc0 = _mm512_setzero_ps(); + __m512 vacc1 = _mm512_setzero_ps(); + __m512 vacc2 = _mm512_setzero_ps(); + __m512 vacc3 = _mm512_setzero_ps(); + for (; batch >= 64 * sizeof(float); batch -= 64 * 
sizeof(float)) { + // Load 64 (4x16) inputs at a time. + const __m512 vi0 = _mm512_loadu_ps(input); + const __m512 vi1 = _mm512_loadu_ps(input + 16); + const __m512 vi2 = _mm512_loadu_ps(input + 32); + const __m512 vi3 = _mm512_loadu_ps(input + 48); + input += 64; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m512 vx0 = _mm512_sub_ps(vi0, vi_max); + const __m512 vx1 = _mm512_sub_ps(vi1, vi_max); + const __m512 vx2 = _mm512_sub_ps(vi2, vi_max); + const __m512 vx3 = _mm512_sub_ps(vi3, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m512 vn0 = _mm512_add_ps(_mm512_mul_ps(vx0, vlog2e), vmagic_bias); + __m512 vn1 = _mm512_add_ps(_mm512_mul_ps(vx1, vlog2e), vmagic_bias); + __m512 vn2 = _mm512_add_ps(_mm512_mul_ps(vx2, vlog2e), vmagic_bias); + __m512 vn3 = _mm512_add_ps(_mm512_mul_ps(vx3, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m512 vs0 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn0), 23)); + const __m512 vs1 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn1), 23)); + const __m512 vs2 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn2), 23)); + const __m512 vs3 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn3), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn0 = _mm512_sub_ps(vn0, vmagic_bias); + vn1 = _mm512_sub_ps(vn1, vmagic_bias); + vn2 = _mm512_sub_ps(vn2, vmagic_bias); + vn3 = _mm512_sub_ps(vn3, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 
+ __m512 vt0 = _mm512_add_ps(_mm512_mul_ps(vn0, vminus_ln2_hi), vx0); + __m512 vt1 = _mm512_add_ps(_mm512_mul_ps(vn1, vminus_ln2_hi), vx1); + __m512 vt2 = _mm512_add_ps(_mm512_mul_ps(vn2, vminus_ln2_hi), vx2); + __m512 vt3 = _mm512_add_ps(_mm512_mul_ps(vn3, vminus_ln2_hi), vx3); + + vt0 = _mm512_add_ps(_mm512_mul_ps(vn0, vminus_ln2_lo), vt0); + vt1 = _mm512_add_ps(_mm512_mul_ps(vn1, vminus_ln2_lo), vt1); + vt2 = _mm512_add_ps(_mm512_mul_ps(vn2, vminus_ln2_lo), vt2); + vt3 = _mm512_add_ps(_mm512_mul_ps(vn3, vminus_ln2_lo), vt3); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m512 vp0 = _mm512_add_ps(_mm512_mul_ps(vc5, vt0), vc4); + __m512 vp1 = _mm512_add_ps(_mm512_mul_ps(vc5, vt1), vc4); + __m512 vp2 = _mm512_add_ps(_mm512_mul_ps(vc5, vt2), vc4); + __m512 vp3 = _mm512_add_ps(_mm512_mul_ps(vc5, vt3), vc4); + + vp0 = _mm512_add_ps(_mm512_mul_ps(vp0, vt0), vc3); + vp1 = _mm512_add_ps(_mm512_mul_ps(vp1, vt1), vc3); + vp2 = _mm512_add_ps(_mm512_mul_ps(vp2, vt2), vc3); + vp3 = _mm512_add_ps(_mm512_mul_ps(vp3, vt3), vc3); + + vp0 = _mm512_add_ps(_mm512_mul_ps(vp0, vt0), vc2); + vp1 = _mm512_add_ps(_mm512_mul_ps(vp1, vt1), vc2); + vp2 = _mm512_add_ps(_mm512_mul_ps(vp2, vt2), vc2); + vp3 = _mm512_add_ps(_mm512_mul_ps(vp3, vt3), vc2); + + vp0 = _mm512_add_ps(_mm512_mul_ps(vp0, vt0), vc1); + vp1 = _mm512_add_ps(_mm512_mul_ps(vp1, vt1), vc1); + vp2 = _mm512_add_ps(_mm512_mul_ps(vp2, vt2), vc1); + vp3 = _mm512_add_ps(_mm512_mul_ps(vp3, vt3), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt0 = _mm512_mul_ps(vt0, vs0); + vt1 = _mm512_mul_ps(vt1, vs1); + vt2 = _mm512_mul_ps(vt2, vs2); + vt3 = _mm512_mul_ps(vt3, vs3); + + __m512 vf0 = _mm512_add_ps(_mm512_mul_ps(vt0, vp0), vs0); + __m512 vf1 = _mm512_add_ps(_mm512_mul_ps(vt1, vp1), vs1); + __m512 vf2 = 
_mm512_add_ps(_mm512_mul_ps(vt2, vp2), vs2); + __m512 vf3 = _mm512_add_ps(_mm512_mul_ps(vt3, vp3), vs3); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf0 = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0, vzero); + vf1 = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1, vzero); + vf2 = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2, vzero); + vf3 = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3, vzero); + + // Store 64 (4x16) outputs at a time. + _mm512_storeu_ps(output, vf0); + _mm512_storeu_ps(output + 16, vf1); + _mm512_storeu_ps(output + 32, vf2); + _mm512_storeu_ps(output + 48, vf3); + + output += 64; + + // Accumulate computed exponents. + vacc0 = _mm512_add_ps(vacc0, vf0); + vacc1 = _mm512_add_ps(vacc1, vf1); + vacc2 = _mm512_add_ps(vacc2, vf2); + vacc3 = _mm512_add_ps(vacc3, vf3); + } + // Add up all accumulators to vacc0 + vacc0 = _mm512_add_ps(vacc0, vacc1); + vacc2 = _mm512_add_ps(vacc2, vacc3); + vacc0 = _mm512_add_ps(vacc0, vacc2); + + __m512 vacc = vacc0; + for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { + // Load 16 inputs at a time. + const __m512 vi = _mm512_loadu_ps(input); + input += 16; + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m512 vx = _mm512_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). + __m512 vn = _mm512_add_ps(_mm512_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m512 vs = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). 
+ vn = _mm512_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m512 vt = _mm512_add_ps(_mm512_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm512_add_ps(_mm512_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m512 vp = _mm512_add_ps(_mm512_mul_ps(vc5, vt), vc4); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc3); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc2); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm512_mul_ps(vt, vs); + __m512 vf = _mm512_add_ps(_mm512_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. + vf = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx, vdenorm_cutoff, _CMP_LT_OS), vf, vzero); + + // Store 16 outputs at a time. + _mm512_storeu_ps(output, vf); + output += 16; + + // Accumulate computed exponents. + vacc = _mm512_add_ps(vacc, vf); + } + if (batch != 0) { + assert(batch >= 1 * sizeof(float)); + assert(batch <= 15 * sizeof(float)); + + // Prepare mask for valid 32-bit batch (depends on batch). + batch >>= XNN_LOG2_SIZEOF_FLOAT; + const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + // Load 16 inputs at a time. + const __m512 vi = _mm512_maskz_loadu_ps(vmask, input); + + // Subtract maximum input x := i - i_max. This implies x <= 0. + const __m512 vx = _mm512_sub_ps(vi, vi_max); + + // Compute reduced argument batch := round(x / log(2)). 
+ __m512 vn = _mm512_add_ps(_mm512_mul_ps(vx, vlog2e), vmagic_bias); + + // Create a floating-point number s (scale) such that s == 2**batch for inputs which don't cause underflow, i.e. + // -87.33642 <= x <= 0.0, and -126 <= batch <= 0 accordingly. + const __m512 vs = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn), 23)); + + // Subtract the large number back to get final batch := round(x / log(2)). + vn = _mm512_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - batch * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m512 vt = _mm512_add_ps(_mm512_mul_ps(vn, vminus_ln2_hi), vx); + vt = _mm512_add_ps(_mm512_mul_ps(vn, vminus_ln2_lo), vt); + + // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. + __m512 vp = _mm512_add_ps(_mm512_mul_ps(vc5, vt), vc4); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc3); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc2); + vp = _mm512_add_ps(_mm512_mul_ps(vp, vt), vc1); + + // Reconstruct the final f value: + // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) + // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) + // = s + (t * s) * p + vt = _mm512_mul_ps(vt, vs); + __m512 vf = _mm512_add_ps(_mm512_mul_ps(vt, vp), vs); + + // For inputs below zero cutoff, replace output with +0.0f. + // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 
+ vf = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(vx, vdenorm_cutoff, _CMP_LT_OS), vf, vzero); + + _mm512_mask_storeu_ps(output, vmask, vf); + + vacc = _mm512_mask_add_ps(vacc, vmask, vacc, vf); + } + + *sum = _mm512_reduce_add_ps(vacc); +} + + diff --git a/src/xnnpack/raddstoreexpminusmax.h b/src/xnnpack/raddstoreexpminusmax.h index 9b49f97fa1d..4a2465704fe 100644 --- a/src/xnnpack/raddstoreexpminusmax.h +++ b/src/xnnpack/raddstoreexpminusmax.h @@ -123,6 +123,11 @@ DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_u DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc4) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u16) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u32_acc2) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u64_acc2) +DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u64_acc4) + DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u4) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8_acc2) DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u16_acc2) diff --git a/test/f32-raddstoreexpminusmax.cc b/test/f32-raddstoreexpminusmax.cc index 305fa348018..d6143f4b3c4 100644 --- a/test/f32-raddstoreexpminusmax.cc +++ b/test/f32-raddstoreexpminusmax.cc @@ -1286,6 +1286,154 @@ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR2_P5_U16, elements_eq_16) { + TEST_REQUIRES_X86_AVX512F; + 
RAddStoreExpMinusMaxMicrokernelTester() + .elements(16) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u16, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR2_P5_U16, elements_div_16) { + TEST_REQUIRES_X86_AVX512F; + for (size_t elements = 32; elements < 160; elements += 16) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u16, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR2_P5_U16, elements_lt_16) { + TEST_REQUIRES_X86_AVX512F; + for (size_t elements = 1; elements < 16; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u16, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR2_P5_U16, elements_gt_16) { + TEST_REQUIRES_X86_AVX512F; + for (size_t elements = 17; elements < 32; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u16, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR2_P5_U32_ACC2, elements_eq_32) { + TEST_REQUIRES_X86_AVX512F; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(32) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u32_acc2, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR2_P5_U32_ACC2, elements_div_32) { + TEST_REQUIRES_X86_AVX512F; + for (size_t elements = 64; elements < 320; elements += 32) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u32_acc2, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR2_P5_U32_ACC2, elements_lt_32) { + TEST_REQUIRES_X86_AVX512F; + for (size_t elements = 1; elements < 32; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u32_acc2, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR2_P5_U32_ACC2, elements_gt_32) { + TEST_REQUIRES_X86_AVX512F; + for (size_t elements = 33; elements < 64; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u32_acc2, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR2_P5_U64_ACC2, elements_eq_64) { + TEST_REQUIRES_X86_AVX512F; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(64) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u64_acc2, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR2_P5_U64_ACC2, elements_div_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t elements = 128; elements < 640; elements += 64) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u64_acc2, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR2_P5_U64_ACC2, elements_lt_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t elements = 1; elements < 64; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u64_acc2, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR2_P5_U64_ACC2, elements_gt_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t elements = 65; elements < 128; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u64_acc2, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR2_P5_U64_ACC4, elements_eq_64) { + TEST_REQUIRES_X86_AVX512F; + RAddStoreExpMinusMaxMicrokernelTester() + .elements(64) + 
.Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u64_acc4, nullptr); + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR2_P5_U64_ACC4, elements_div_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t elements = 128; elements < 640; elements += 64) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u64_acc4, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR2_P5_U64_ACC4, elements_lt_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t elements = 1; elements < 64; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u64_acc4, nullptr); + } + } + + TEST(F32_RADDSTOREEXPMINUSMAX__AVX512F_RR2_P5_U64_ACC4, elements_gt_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t elements = 65; elements < 128; elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u64_acc4, nullptr); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_RR2_P5_U4, elements_eq_4) { RAddStoreExpMinusMaxMicrokernelTester() diff --git a/test/f32-raddstoreexpminusmax.yaml b/test/f32-raddstoreexpminusmax.yaml index 0bc916ce37d..4fc8a7bb258 100644 --- a/test/f32-raddstoreexpminusmax.yaml +++ b/test/f32-raddstoreexpminusmax.yaml @@ -48,6 +48,11 @@ - name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2 - name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc4 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u16 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u32_acc2 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u64_acc2 +- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr2_p5_u64_acc4 + # WAsm SIMD - name: xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u4 - name: 
xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_u8_acc2 From 6fbd065ec59efd6b32eab5c4cbdddd56074d9f80 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Tue, 24 Sep 2024 13:47:18 -0700 Subject: [PATCH 47/50] Remove unused code PiperOrigin-RevId: 678382363 --- include/xnnpack.h | 182 ---------------------------------------------- 1 file changed, 182 deletions(-) diff --git a/include/xnnpack.h b/include/xnnpack.h index efeba4dc533..579b6c04ef4 100644 --- a/include/xnnpack.h +++ b/include/xnnpack.h @@ -2253,24 +2253,6 @@ enum xnn_status xnn_run_operator( enum xnn_status xnn_delete_operator( xnn_operator_t op); -struct xnn_binary_operator_params { - union { - struct { - float param; - float param2; - } elu; - struct { - float param; - } leaky_relu; - }; - int32_t a_zero_point; - int32_t b_zero_point; - float a_scale; - float b_scale; - float output_scale; - int32_t output_zero_point; -}; - /// Operator API: /// - create operator will create and populate a xnn_operator_t /// - reshape operator will update fields in xnn_operator_t with shape/dimensions and parallelization information @@ -5098,170 +5080,6 @@ enum xnn_status xnn_setup_mean_nd_qu8( const void* input, void* output); -enum xnn_status xnn_setup_minimum_nd_f16( - xnn_operator_t minimum_op, - const void* input1, - const void* input2, - void* output); - -enum xnn_status xnn_create_minimum_nd_f32( - uint32_t flags, - xnn_operator_t* minimum_op_out); - -enum xnn_status xnn_reshape_minimum_nd_f32( - xnn_operator_t minimum_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_minimum_nd_f32( - xnn_operator_t minimum_op, - const float* input1, - const float* input2, - float* output); - -enum xnn_status xnn_run_minimum_nd_f32( - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - const float* input1, - const float* input2, - float* 
output, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_multiply_nd_f16( - uint32_t flags, - xnn_operator_t* multiply_op_out); - -enum xnn_status xnn_reshape_multiply_nd_f16( - xnn_operator_t multiply_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_multiply_nd_f16( - xnn_operator_t multiply_op, - const void* input1, - const void* input2, - void* output); - -enum xnn_status xnn_create_multiply_nd_f32( - uint32_t flags, - xnn_operator_t* multiply_op_out); - -enum xnn_status xnn_reshape_multiply_nd_f32( - xnn_operator_t multiply_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_multiply_nd_f32( - xnn_operator_t multiply_op, - const float* input1, - const float* input2, - float* output); - -enum xnn_status xnn_run_multiply_nd_f32( - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - const float* input1, - const float* input2, - float* output, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_multiply_nd_qs8( - int8_t input1_zero_point, - float input1_scale, - int8_t input2_zero_point, - float input2_scale, - int8_t output_zero_point, - float output_scale, - uint32_t flags, - xnn_operator_t* multiply_op_out); - -enum xnn_status xnn_reshape_multiply_nd_qs8( - xnn_operator_t multiply_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_multiply_nd_qs8( - xnn_operator_t multiply_op, - const int8_t* input1, - const int8_t* input2, - int8_t* output); - -enum xnn_status xnn_run_multiply_nd_qs8( - size_t num_input1_dims, - const size_t* input1_shape, - int8_t input1_zero_point, - float 
input1_scale, - size_t num_input2_dims, - const size_t* input2_shape, - int8_t input2_zero_point, - float input2_scale, - const int8_t* input1, - const int8_t* input2, - int8_t* output, - int8_t output_zero_point, - float output_scale, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_multiply_nd_qu8( - uint8_t input1_zero_point, - float input1_scale, - uint8_t input2_zero_point, - float input2_scale, - uint8_t output_zero_point, - float output_scale, - uint32_t flags, - xnn_operator_t* multiply_op_out); - -enum xnn_status xnn_reshape_multiply_nd_qu8( - xnn_operator_t multiply_op, - size_t num_input1_dims, - const size_t* input1_shape, - size_t num_input2_dims, - const size_t* input2_shape, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_multiply_nd_qu8( - xnn_operator_t multiply_op, - const uint8_t* input1, - const uint8_t* input2, - uint8_t* output); - -enum xnn_status xnn_run_multiply_nd_qu8( - size_t num_input1_dims, - const size_t* input1_shape, - uint8_t input1_zero_point, - float input1_scale, - size_t num_input2_dims, - const size_t* input2_shape, - uint8_t input2_zero_point, - float input2_scale, - const uint8_t* input1, - const uint8_t* input2, - uint8_t* output, - uint8_t output_zero_point, - float output_scale, - uint32_t flags, - pthreadpool_t threadpool); - enum xnn_status xnn_create_negate_nc_f16( uint32_t flags, xnn_operator_t* negate_op_out); From 2469ec7221e4c4be7dc93df3073c53443bfcb23e Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Tue, 24 Sep 2024 16:01:36 -0700 Subject: [PATCH 48/50] Randomize binary_elementwise_nd_test This test currently exhaustively tests broadcasting of all possible ranks. I think this is better suited to a fuzz-style test instead. This reduces the runtime of this test dramatically, despite eliminating sharding. 
PiperOrigin-RevId: 678429954 --- CMakeLists.txt | 1 - test/BUILD.bazel | 1 - test/binary-elementwise-nd.cc | 165 +++++++++++++++------------------- 3 files changed, 70 insertions(+), 97 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2a14b87210f..1ef17276d08 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1323,7 +1323,6 @@ IF(XNNPACK_BUILD_TESTS) GTest::gtest GTest::gtest_main XNNPACK) - ADD_SHARDED_TEST(binary-elementwise-nd-test 10) # ---[ Build subgraph optimizations unit tests SET(LIBRARY_SUBGRAPH_OPTIMIZATION_TESTS diff --git a/test/BUILD.bazel b/test/BUILD.bazel index 67b74bd5882..b36d2b0c0f8 100644 --- a/test/BUILD.bazel +++ b/test/BUILD.bazel @@ -1360,7 +1360,6 @@ xnnpack_unit_test( name = "binary_elementwise_nd_test", timeout = "long", srcs = ["binary-elementwise-nd.cc"], - shard_count = 10, deps = OPERATOR_TEST_DEPS, ) diff --git a/test/binary-elementwise-nd.cc b/test/binary-elementwise-nd.cc index ecadd4f522f..23e74b24988 100644 --- a/test/binary-elementwise-nd.cc +++ b/test/binary-elementwise-nd.cc @@ -25,17 +25,6 @@ #include "xnnpack/math.h" #include "replicable_random_device.h" -constexpr size_t kDim1 = 2; -constexpr size_t kDim2 = 3; -constexpr size_t kDim3 = 4; -constexpr size_t kDim4 = 5; -constexpr size_t kDim5 = 6; -constexpr size_t kDim6 = 7; -const size_t kDims[] = {kDim1, kDim2, kDim3, kDim4, kDim5, kDim6}; - -const size_t kBroadcastRanks[] = {0, 1, 2, 3, 4, 5, 6}; -const size_t kTestRank = 4; - enum class RunMode { kCreateReshapeRun, kEager, @@ -403,167 +392,153 @@ class BinaryElementwiseOperatorTester { size_t iterations_{3}; }; -// Make a shape of `rank` dimensions, broadcasting in each dimension according -// `broadcast_mask`. 
-inline std::vector MakeShapeOfRank(size_t rank, uint32_t broadcast_mask, - const size_t* dims) { - std::vector shape; +template +std::vector RandomShape(Rng& rng) { + const size_t rank = rng() % XNN_MAX_TENSOR_DIMS; + std::vector dims(rank); for (size_t i = 0; i < rank; i++) { - const bool broadcast = (broadcast_mask & (uint32_t(1) << i)) != 0; - shape.push_back(broadcast ? 1 : dims[i]); + dims[i] = rng() % 10 + 1; + } + return dims; +} + +template +std::vector RandomBroadcast(Rng& rng, std::vector dims) { + // Randomly assign some dimensions to 1. + for (size_t i = 0; i < dims.size(); i++) { + if (rng() % 8 == 0) { + dims[i] = 1; + } + } + // Possibly remove leading 1s. + if (rng() % 2 == 0) { + while (!dims.empty() && dims.front() == 1) { + dims.erase(dims.begin()); + } } - std::reverse(shape.begin(), shape.end()); - return shape; + return dims; } template -void RunBinaryOpTester(size_t rank_a, size_t rank_b, const size_t* dims, - RunMode run_mode, +void RunBinaryOpTester(RunMode run_mode, BinaryElementwiseOperatorTester& tester) { - for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << rank_a); bm1++) { - for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << rank_b); bm2++) { - tester.input1_shape(MakeShapeOfRank(rank_a, bm1, dims)) - .input2_shape(MakeShapeOfRank(rank_b, bm2, dims)); - tester.Test(run_mode); - } + xnnpack::ReplicableRandomDevice rng; + for (int iterations = 0; iterations < 100; iterations++) { + std::vector output_shape = RandomShape(rng); + tester.input1_shape(RandomBroadcast(rng, output_shape)) + .input2_shape(RandomBroadcast(rng, output_shape)); + tester.Test(run_mode); } } template -void BroadcastNDTestImpl(const Params& params) { +void BinaryNDTestImpl(const Params& params) { RunMode mode = std::get<0>(params); xnn_binary_operator op = std::get<1>(params); - const size_t rank_a = std::get<2>(params); - const size_t rank_b = std::get<3>(params); BinaryElementwiseOperatorTester tester; tester.operation_type(op); - RunBinaryOpTester(rank_a, rank_b, kDims, 
mode, tester); + RunBinaryOpTester(mode, tester); } template -class BroadcastNDTest +class BinaryNDTest : public testing::TestWithParam< - std::tuple> {}; + std::tuple> {}; -using BroadcastNDTestQS8 = BroadcastNDTest; -using BroadcastNDTestQU8 = BroadcastNDTest; +using BinaryNDTestQS8 = BinaryNDTest; +using BinaryNDTestQU8 = BinaryNDTest; #ifndef XNN_EXCLUDE_F16_TESTS -using BroadcastNDTestF16 = BroadcastNDTest; +using BinaryNDTestF16 = BinaryNDTest; #endif // XNN_EXCLUDE_F16_TESTS -using BroadcastNDTestF32 = BroadcastNDTest; -using BroadcastNDTestS32 = BroadcastNDTest; +using BinaryNDTestF32 = BinaryNDTest; +using BinaryNDTestS32 = BinaryNDTest; -TEST_P(BroadcastNDTestQS8, op) { BroadcastNDTestImpl(GetParam()); } -TEST_P(BroadcastNDTestQU8, op) { BroadcastNDTestImpl(GetParam()); } +TEST_P(BinaryNDTestQS8, op) { BinaryNDTestImpl(GetParam()); } +TEST_P(BinaryNDTestQU8, op) { BinaryNDTestImpl(GetParam()); } #ifndef XNN_EXCLUDE_F16_TESTS -TEST_P(BroadcastNDTestF16, op) { BroadcastNDTestImpl(GetParam()); } +TEST_P(BinaryNDTestF16, op) { BinaryNDTestImpl(GetParam()); } #endif // XNN_EXCLUDE_F16_TESTS -TEST_P(BroadcastNDTestF32, op) { BroadcastNDTestImpl(GetParam()); } -TEST_P(BroadcastNDTestS32, op) { BroadcastNDTestImpl(GetParam()); } - -std::string ToString( - const std::tuple& param) { - return BinaryElementwiseOperatorTester::ToString(std::get<1>(param)) + "_" + - std::to_string(std::get<2>(param)) + "d_x_" + - std::to_string(std::get<3>(param)) + "d"; -} +TEST_P(BinaryNDTestF32, op) { BinaryNDTestImpl(GetParam()); } +TEST_P(BinaryNDTestS32, op) { BinaryNDTestImpl(GetParam()); } std::string ToString(const std::tuple& param) { return BinaryElementwiseOperatorTester::ToString(std::get<1>(param)); } INSTANTIATE_TEST_SUITE_P( - CreateReshapeRun, BroadcastNDTestQS8, + CreateReshapeRun, BinaryNDTestQS8, testing::Combine(testing::Values(RunMode::kCreateReshapeRun), testing::Values(xnn_binary_add, xnn_binary_subtract, - xnn_binary_multiply), - 
testing::ValuesIn(kBroadcastRanks), - testing::ValuesIn(kBroadcastRanks)), + xnn_binary_multiply)), [](const auto& info) { return ToString(info.param); }); -INSTANTIATE_TEST_SUITE_P(Eager, BroadcastNDTestQS8, +INSTANTIATE_TEST_SUITE_P(Eager, BinaryNDTestQS8, testing::Combine(testing::Values(RunMode::kEager), testing::Values(xnn_binary_add, xnn_binary_subtract, - xnn_binary_multiply), - testing::ValuesIn(kBroadcastRanks), - testing::ValuesIn(kBroadcastRanks)), + xnn_binary_multiply)), [](const auto& info) { return ToString(info.param); }); INSTANTIATE_TEST_SUITE_P( - CreateReshapeRun, BroadcastNDTestQU8, + CreateReshapeRun, BinaryNDTestQU8, testing::Combine(testing::Values(RunMode::kCreateReshapeRun), testing::Values(xnn_binary_add, xnn_binary_subtract, - xnn_binary_multiply), - testing::ValuesIn(kBroadcastRanks), - testing::ValuesIn(kBroadcastRanks)), + xnn_binary_multiply)), [](const auto& info) { return ToString(info.param); }); -INSTANTIATE_TEST_SUITE_P(Eager, BroadcastNDTestQU8, +INSTANTIATE_TEST_SUITE_P(Eager, BinaryNDTestQU8, testing::Combine(testing::Values(RunMode::kEager), testing::Values(xnn_binary_add, xnn_binary_subtract, - xnn_binary_multiply), - testing::ValuesIn(kBroadcastRanks), - testing::ValuesIn(kBroadcastRanks)), + xnn_binary_multiply)), [](const auto& info) { return ToString(info.param); }); #ifndef XNN_EXCLUDE_F16_TESTS INSTANTIATE_TEST_SUITE_P( - CreateReshapeRun, BroadcastNDTestF16, + CreateReshapeRun, BinaryNDTestF16, testing::Combine( testing::Values(RunMode::kCreateReshapeRun), testing::Values(xnn_binary_add, xnn_binary_divide, xnn_binary_maximum, xnn_binary_minimum, xnn_binary_multiply, - xnn_binary_squared_difference, xnn_binary_subtract), - testing::ValuesIn(kBroadcastRanks), testing::ValuesIn(kBroadcastRanks)), + xnn_binary_squared_difference, xnn_binary_subtract)), [](const auto& info) { return ToString(info.param); }); INSTANTIATE_TEST_SUITE_P( - Eager, BroadcastNDTestF16, + Eager, BinaryNDTestF16, testing::Combine( 
testing::Values(RunMode::kEager), testing::Values(xnn_binary_add, xnn_binary_divide, xnn_binary_maximum, xnn_binary_minimum, xnn_binary_multiply, - xnn_binary_squared_difference, xnn_binary_subtract), - testing::ValuesIn(kBroadcastRanks), testing::ValuesIn(kBroadcastRanks)), + xnn_binary_squared_difference, xnn_binary_subtract)), [](const auto& info) { return ToString(info.param); }); #endif INSTANTIATE_TEST_SUITE_P( - CreateReshapeRun, BroadcastNDTestF32, + CreateReshapeRun, BinaryNDTestF32, testing::Combine(testing::Values(RunMode::kCreateReshapeRun), testing::Values(xnn_binary_add, xnn_binary_copysign, xnn_binary_divide, xnn_binary_maximum, xnn_binary_minimum, xnn_binary_multiply, xnn_binary_subtract, - xnn_binary_squared_difference), - testing::ValuesIn(kBroadcastRanks), - testing::ValuesIn(kBroadcastRanks)), + xnn_binary_squared_difference)), [](const auto& info) { return ToString(info.param); }); INSTANTIATE_TEST_SUITE_P( - Eager, BroadcastNDTestF32, + Eager, BinaryNDTestF32, testing::Combine(testing::Values(RunMode::kEager), testing::Values(xnn_binary_add, xnn_binary_divide, xnn_binary_maximum, xnn_binary_minimum, xnn_binary_multiply, xnn_binary_subtract, - xnn_binary_squared_difference), - testing::ValuesIn(kBroadcastRanks), - testing::ValuesIn(kBroadcastRanks)), + xnn_binary_squared_difference)), [](const auto& info) { return ToString(info.param); }); INSTANTIATE_TEST_SUITE_P( - CreateReshapeRun, BroadcastNDTestS32, + CreateReshapeRun, BinaryNDTestS32, testing::Combine(testing::Values(RunMode::kCreateReshapeRun), - testing::Values(xnn_binary_multiply), - testing::ValuesIn(kBroadcastRanks), - testing::ValuesIn(kBroadcastRanks)), + testing::Values(xnn_binary_multiply)), [](const auto& info) { return ToString(info.param); }); -INSTANTIATE_TEST_SUITE_P(Eager, BroadcastNDTestS32, +INSTANTIATE_TEST_SUITE_P(Eager, BinaryNDTestS32, testing::Combine(testing::Values(RunMode::kEager), - testing::Values(xnn_binary_multiply), - testing::ValuesIn(kBroadcastRanks), - 
testing::ValuesIn(kBroadcastRanks)), + testing::Values(xnn_binary_multiply)), [](const auto& info) { return ToString(info.param); }); template void QuantizedTest_Input1Scale(Params params) { for (float input1_scale = 0.1f; input1_scale <= 10.0f; input1_scale *= 3.14f) { - RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), + RunBinaryOpTester(std::get<0>(params), BinaryElementwiseOperatorTester() .operation_type(std::get<1>(params)) .input1_scale(input1_scale)); @@ -575,7 +550,7 @@ void QuantizedTest_Input1ZeroPoint(Params params) { for (int32_t input1_zero_point = std::numeric_limits::min(); input1_zero_point <= std::numeric_limits::max(); input1_zero_point += 51) { - RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), + RunBinaryOpTester(std::get<0>(params), BinaryElementwiseOperatorTester() .operation_type(std::get<1>(params)) .input1_zero_point(input1_zero_point)); @@ -586,7 +561,7 @@ template void QuantizedTest_Input2Scale(Params params) { for (float input2_scale = 0.1f; input2_scale <= 10.0f; input2_scale *= 3.14f) { - RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), + RunBinaryOpTester(std::get<0>(params), BinaryElementwiseOperatorTester() .operation_type(std::get<1>(params)) .input2_scale(input2_scale)); @@ -598,7 +573,7 @@ void QuantizedTest_Input2ZeroPoint(Params params) { for (int32_t input2_zero_point = std::numeric_limits::min(); input2_zero_point <= std::numeric_limits::max(); input2_zero_point += 51) { - RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), + RunBinaryOpTester(std::get<0>(params), BinaryElementwiseOperatorTester() .operation_type(std::get<1>(params)) .input2_zero_point(input2_zero_point)); @@ -609,7 +584,7 @@ template void QuantizedTest_OutputScale(Params params) { for (float output_scale = 0.1f; output_scale <= 10.0f; output_scale *= 3.14f) { - RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), + RunBinaryOpTester(std::get<0>(params), 
BinaryElementwiseOperatorTester() .operation_type(std::get<1>(params)) .output_scale(output_scale)); @@ -621,7 +596,7 @@ void QuantizedTest_OutputZeroPoint(Params params) { for (int32_t output_zero_point = std::numeric_limits::min(); output_zero_point <= std::numeric_limits::max(); output_zero_point += 51) { - RunBinaryOpTester(kTestRank, kTestRank, kDims, std::get<0>(params), + RunBinaryOpTester(std::get<0>(params), BinaryElementwiseOperatorTester() .operation_type(std::get<1>(params)) .output_zero_point(output_zero_point)); From 322ba759d23d4df8ce5a9ec82c658e7cedf8d461 Mon Sep 17 00:00:00 2001 From: XNNPACK Team Date: Tue, 24 Sep 2024 17:17:50 -0700 Subject: [PATCH 49/50] Add Int8/QAT MobileNet V2 model to bench/models as a subgraph benchmark. (Original source: https://github.com/tensorflow/models/tree/master/official/projects/qat/vision) PiperOrigin-RevId: 678453196 --- bench/models/fp32-mobilenet-v1.cc | 5 +- bench/models/fp32-mobilenet-v2.cc | 9 +- bench/models/fp32-mobilenet-v3-large.cc | 9 +- bench/models/fp32-mobilenet-v3-small.cc | 9 +- bench/models/models.h | 1 - bench/models/qs8-mobilenet-v2.cc | 2310 +++++++++++++++-------- 6 files changed, 1547 insertions(+), 796 deletions(-) diff --git a/bench/models/fp32-mobilenet-v1.cc b/bench/models/fp32-mobilenet-v1.cc index 937500f4cac..1a4cbf42a71 100644 --- a/bench/models/fp32-mobilenet-v1.cc +++ b/bench/models/fp32-mobilenet-v1.cc @@ -32,6 +32,9 @@ xnn_subgraph_t FP32MobileNetV1() { return nullptr; } + std::random_device random_device; + auto rng = std::mt19937(random_device()); + uint32_t v0 = XNN_INVALID_VALUE_ID; std::array v0_dims = {{1, 224, 224, 3}}; status = xnn_define_tensor_value( @@ -1120,8 +1123,6 @@ xnn_subgraph_t FP32MobileNetV1() { return nullptr; } - std::random_device random_device; - auto rng = std::mt19937(random_device()); auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); std::generate(w30_data.begin(), w30_data.end(), std::ref(f32rng)); 
std::generate(w31_data.begin(), w31_data.end(), std::ref(f32rng)); diff --git a/bench/models/fp32-mobilenet-v2.cc b/bench/models/fp32-mobilenet-v2.cc index 3bb746d94b0..44f23290e55 100644 --- a/bench/models/fp32-mobilenet-v2.cc +++ b/bench/models/fp32-mobilenet-v2.cc @@ -32,13 +32,16 @@ xnn_subgraph_t FP32MobileNetV2() { return nullptr; } + std::random_device random_device; + auto rng = std::mt19937(random_device()); + uint32_t v0 = XNN_INVALID_VALUE_ID; std::array v0_dims = {{1, 224, 224, 3}}; status = xnn_define_tensor_value( subgraph, xnn_datatype_fp32, v0_dims.size(), v0_dims.data(), /*data=*/nullptr, - 1, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0); + 0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0); if (status != xnn_status_success) { std::cerr << "failed to create tensor v0" << std::endl; return nullptr; @@ -830,7 +833,7 @@ xnn_subgraph_t FP32MobileNetV2() { subgraph, xnn_datatype_fp32, v66_dims.size(), v66_dims.data(), /*data=*/nullptr, - 0, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v66); + 1, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v66); if (status != xnn_status_success) { std::cerr << "failed to create tensor v66" << std::endl; return nullptr; @@ -2214,8 +2217,6 @@ xnn_subgraph_t FP32MobileNetV2() { return nullptr; } - std::random_device random_device; - auto rng = std::mt19937(random_device()); auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); std::generate(w67_data.begin(), w67_data.end(), std::ref(f32rng)); std::generate(w68_data.begin(), w68_data.end(), std::ref(f32rng)); diff --git a/bench/models/fp32-mobilenet-v3-large.cc b/bench/models/fp32-mobilenet-v3-large.cc index 44f672da228..f5e014323d2 100644 --- a/bench/models/fp32-mobilenet-v3-large.cc +++ b/bench/models/fp32-mobilenet-v3-large.cc @@ -32,13 +32,16 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } + std::random_device random_device; + auto rng = std::mt19937(random_device()); + uint32_t v0 = XNN_INVALID_VALUE_ID; std::array v0_dims = {{1, 224, 224, 3}}; status = 
xnn_define_tensor_value( subgraph, xnn_datatype_fp32, v0_dims.size(), v0_dims.data(), /*data=*/nullptr, - 1, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0); + 0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0); if (status != xnn_status_success) { std::cerr << "failed to create tensor v0" << std::endl; return nullptr; @@ -1514,7 +1517,7 @@ xnn_subgraph_t FP32MobileNetV3Large() { subgraph, xnn_datatype_fp32, v123_dims.size(), v123_dims.data(), /*data=*/nullptr, - 0, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v123); + 1, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v123); if (status != xnn_status_success) { std::cerr << "failed to create tensor v123" << std::endl; return nullptr; @@ -3288,8 +3291,6 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - std::random_device random_device; - auto rng = std::mt19937(random_device()); auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); std::generate(w124_data.begin(), w124_data.end(), std::ref(f32rng)); std::generate(w125_data.begin(), w125_data.end(), std::ref(f32rng)); diff --git a/bench/models/fp32-mobilenet-v3-small.cc b/bench/models/fp32-mobilenet-v3-small.cc index a22280c3053..804e36fd64a 100644 --- a/bench/models/fp32-mobilenet-v3-small.cc +++ b/bench/models/fp32-mobilenet-v3-small.cc @@ -32,13 +32,16 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } + std::random_device random_device; + auto rng = std::mt19937(random_device()); + uint32_t v0 = XNN_INVALID_VALUE_ID; std::array v0_dims = {{1, 224, 224, 3}}; status = xnn_define_tensor_value( subgraph, xnn_datatype_fp32, v0_dims.size(), v0_dims.data(), /*data=*/nullptr, - 1, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0); + 0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0); if (status != xnn_status_success) { std::cerr << "failed to create tensor v0" << std::endl; return nullptr; @@ -1358,7 +1361,7 @@ xnn_subgraph_t FP32MobileNetV3Small() { subgraph, xnn_datatype_fp32, v110_dims.size(), v110_dims.data(), /*data=*/nullptr, - 0, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v110); + 1, 
XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v110); if (status != xnn_status_success) { std::cerr << "failed to create tensor v110" << std::endl; return nullptr; @@ -2885,8 +2888,6 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - std::random_device random_device; - auto rng = std::mt19937(random_device()); auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); std::generate(w111_data.begin(), w111_data.end(), std::ref(f32rng)); std::generate(w112_data.begin(), w112_data.end(), std::ref(f32rng)); diff --git a/bench/models/models.h b/bench/models/models.h index 0e356d3f0fb..605b669dbac 100644 --- a/bench/models/models.h +++ b/bench/models/models.h @@ -13,7 +13,6 @@ xnn_subgraph_t FP32MobileNetV1(); xnn_subgraph_t FP32MobileNetV2(); xnn_subgraph_t FP32MobileNetV3Large(); xnn_subgraph_t FP32MobileNetV3Small(); - xnn_subgraph_t QS8MobileNetV2(); } // namespace models diff --git a/bench/models/qs8-mobilenet-v2.cc b/bench/models/qs8-mobilenet-v2.cc index 34f03c8bd59..d07d1309b96 100644 --- a/bench/models/qs8-mobilenet-v2.cc +++ b/bench/models/qs8-mobilenet-v2.cc @@ -32,22 +32,27 @@ xnn_subgraph_t QS8MobileNetV2() { return nullptr; } + std::random_device random_device; + auto rng = std::mt19937(random_device()); + uint32_t v0 = XNN_INVALID_VALUE_ID; std::array v0_dims = {{1, 224, 224, 3}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, v0_dims.size(), v0_dims.data(), /*data=*/nullptr, - 1, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0); + 0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0); if (status != xnn_status_success) { std::cerr << "failed to create tensor v0" << std::endl; return nullptr; } uint32_t v1 = XNN_INVALID_VALUE_ID; - std::array v1_dims = {{1, 112, 112, 32}}; + std::array v1_dims = {{1, 224, 224, 3}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + 
subgraph, xnn_datatype_qint8, + /*zero_point=*/-14, + /*scale=*/0.01865844801068306f, v1_dims.size(), v1_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v1); @@ -59,7 +64,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v2 = XNN_INVALID_VALUE_ID; std::array v2_dims = {{1, 112, 112, 32}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v2_dims.size(), v2_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v2); @@ -69,9 +76,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v3 = XNN_INVALID_VALUE_ID; - std::array v3_dims = {{1, 112, 112, 16}}; + std::array v3_dims = {{1, 112, 112, 32}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v3_dims.size(), v3_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v3); @@ -81,9 +90,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v4 = XNN_INVALID_VALUE_ID; - std::array v4_dims = {{1, 112, 112, 96}}; + std::array v4_dims = {{1, 112, 112, 16}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-6, + /*scale=*/0.3295263648033142f, v4_dims.size(), v4_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v4); @@ -93,9 +104,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v5 = XNN_INVALID_VALUE_ID; - std::array v5_dims = {{1, 56, 56, 96}}; + std::array v5_dims = {{1, 112, 112, 96}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v5_dims.size(), v5_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v5); @@ 
-105,9 +118,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v6 = XNN_INVALID_VALUE_ID; - std::array v6_dims = {{1, 56, 56, 24}}; + std::array v6_dims = {{1, 56, 56, 96}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v6_dims.size(), v6_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v6); @@ -117,9 +132,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v7 = XNN_INVALID_VALUE_ID; - std::array v7_dims = {{1, 56, 56, 144}}; + std::array v7_dims = {{1, 56, 56, 24}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/1, + /*scale=*/0.26354169845581055f, v7_dims.size(), v7_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v7); @@ -131,7 +148,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v8 = XNN_INVALID_VALUE_ID; std::array v8_dims = {{1, 56, 56, 144}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v8_dims.size(), v8_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v8); @@ -141,9 +160,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v9 = XNN_INVALID_VALUE_ID; - std::array v9_dims = {{1, 56, 56, 24}}; + std::array v9_dims = {{1, 56, 56, 144}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v9_dims.size(), v9_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v9); @@ -155,7 +176,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v10 = XNN_INVALID_VALUE_ID; std::array v10_dims = {{1, 56, 56, 24}}; status = xnn_define_quantized_tensor_value( - 
subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-1, + /*scale=*/0.361392080783844f, v10_dims.size(), v10_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v10); @@ -165,9 +188,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v11 = XNN_INVALID_VALUE_ID; - std::array v11_dims = {{1, 56, 56, 144}}; + std::array v11_dims = {{1, 56, 56, 24}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/5, + /*scale=*/0.39553302526474f, v11_dims.size(), v11_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v11); @@ -177,9 +202,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v12 = XNN_INVALID_VALUE_ID; - std::array v12_dims = {{1, 28, 28, 144}}; + std::array v12_dims = {{1, 56, 56, 144}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v12_dims.size(), v12_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v12); @@ -189,9 +216,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v13 = XNN_INVALID_VALUE_ID; - std::array v13_dims = {{1, 28, 28, 32}}; + std::array v13_dims = {{1, 28, 28, 144}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v13_dims.size(), v13_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v13); @@ -201,9 +230,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v14 = XNN_INVALID_VALUE_ID; - std::array v14_dims = {{1, 28, 28, 192}}; + std::array v14_dims = {{1, 28, 28, 32}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + 
/*zero_point=*/1, + /*scale=*/0.24222400784492493f, v14_dims.size(), v14_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v14); @@ -215,7 +246,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v15 = XNN_INVALID_VALUE_ID; std::array v15_dims = {{1, 28, 28, 192}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v15_dims.size(), v15_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v15); @@ -225,9 +258,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v16 = XNN_INVALID_VALUE_ID; - std::array v16_dims = {{1, 28, 28, 32}}; + std::array v16_dims = {{1, 28, 28, 192}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v16_dims.size(), v16_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v16); @@ -239,7 +274,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v17 = XNN_INVALID_VALUE_ID; std::array v17_dims = {{1, 28, 28, 32}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-22, + /*scale=*/0.2548377215862274f, v17_dims.size(), v17_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v17); @@ -249,9 +286,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v18 = XNN_INVALID_VALUE_ID; - std::array v18_dims = {{1, 28, 28, 192}}; + std::array v18_dims = {{1, 28, 28, 32}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-2, + /*scale=*/0.32090887427330017f, v18_dims.size(), v18_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v18); @@ -263,7 +302,9 @@ xnn_subgraph_t QS8MobileNetV2() { 
uint32_t v19 = XNN_INVALID_VALUE_ID; std::array v19_dims = {{1, 28, 28, 192}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v19_dims.size(), v19_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v19); @@ -273,9 +314,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v20 = XNN_INVALID_VALUE_ID; - std::array v20_dims = {{1, 28, 28, 32}}; + std::array v20_dims = {{1, 28, 28, 192}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v20_dims.size(), v20_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v20); @@ -287,7 +330,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v21 = XNN_INVALID_VALUE_ID; std::array v21_dims = {{1, 28, 28, 32}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/0, + /*scale=*/0.22142209112644196f, v21_dims.size(), v21_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v21); @@ -297,9 +342,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v22 = XNN_INVALID_VALUE_ID; - std::array v22_dims = {{1, 28, 28, 192}}; + std::array v22_dims = {{1, 28, 28, 32}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-1, + /*scale=*/0.3504160940647125f, v22_dims.size(), v22_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v22); @@ -309,9 +356,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v23 = XNN_INVALID_VALUE_ID; - std::array v23_dims = {{1, 14, 14, 192}}; + std::array v23_dims = {{1, 28, 28, 192}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, 
/*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v23_dims.size(), v23_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v23); @@ -321,9 +370,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v24 = XNN_INVALID_VALUE_ID; - std::array v24_dims = {{1, 14, 14, 64}}; + std::array v24_dims = {{1, 14, 14, 192}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v24_dims.size(), v24_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v24); @@ -333,9 +384,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v25 = XNN_INVALID_VALUE_ID; - std::array v25_dims = {{1, 14, 14, 384}}; + std::array v25_dims = {{1, 14, 14, 64}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-5, + /*scale=*/0.1933557689189911f, v25_dims.size(), v25_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v25); @@ -347,7 +400,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v26 = XNN_INVALID_VALUE_ID; std::array v26_dims = {{1, 14, 14, 384}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v26_dims.size(), v26_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v26); @@ -357,9 +412,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v27 = XNN_INVALID_VALUE_ID; - std::array v27_dims = {{1, 14, 14, 64}}; + std::array v27_dims = {{1, 14, 14, 384}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v27_dims.size(), v27_dims.data(), 
/*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v27); @@ -371,7 +428,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v28 = XNN_INVALID_VALUE_ID; std::array v28_dims = {{1, 14, 14, 64}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-14, + /*scale=*/0.17820045351982117f, v28_dims.size(), v28_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v28); @@ -381,9 +440,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v29 = XNN_INVALID_VALUE_ID; - std::array v29_dims = {{1, 14, 14, 384}}; + std::array v29_dims = {{1, 14, 14, 64}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-3, + /*scale=*/0.2269556224346161f, v29_dims.size(), v29_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v29); @@ -395,7 +456,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v30 = XNN_INVALID_VALUE_ID; std::array v30_dims = {{1, 14, 14, 384}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v30_dims.size(), v30_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v30); @@ -405,9 +468,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v31 = XNN_INVALID_VALUE_ID; - std::array v31_dims = {{1, 14, 14, 64}}; + std::array v31_dims = {{1, 14, 14, 384}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v31_dims.size(), v31_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v31); @@ -419,7 +484,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v32 = XNN_INVALID_VALUE_ID; std::array v32_dims = {{1, 14, 14, 64}}; status = 
xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/4, + /*scale=*/0.13266333937644958f, v32_dims.size(), v32_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v32); @@ -429,9 +496,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v33 = XNN_INVALID_VALUE_ID; - std::array v33_dims = {{1, 14, 14, 384}}; + std::array v33_dims = {{1, 14, 14, 64}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-4, + /*scale=*/0.24267108738422394f, v33_dims.size(), v33_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v33); @@ -443,7 +512,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v34 = XNN_INVALID_VALUE_ID; std::array v34_dims = {{1, 14, 14, 384}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v34_dims.size(), v34_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v34); @@ -453,9 +524,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v35 = XNN_INVALID_VALUE_ID; - std::array v35_dims = {{1, 14, 14, 64}}; + std::array v35_dims = {{1, 14, 14, 384}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v35_dims.size(), v35_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v35); @@ -467,7 +540,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v36 = XNN_INVALID_VALUE_ID; std::array v36_dims = {{1, 14, 14, 64}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-5, + /*scale=*/0.143711119890213f, v36_dims.size(), 
v36_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v36); @@ -477,9 +552,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v37 = XNN_INVALID_VALUE_ID; - std::array v37_dims = {{1, 14, 14, 384}}; + std::array v37_dims = {{1, 14, 14, 64}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-3, + /*scale=*/0.24989819526672363f, v37_dims.size(), v37_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v37); @@ -491,7 +568,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v38 = XNN_INVALID_VALUE_ID; std::array v38_dims = {{1, 14, 14, 384}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v38_dims.size(), v38_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v38); @@ -501,9 +580,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v39 = XNN_INVALID_VALUE_ID; - std::array v39_dims = {{1, 14, 14, 96}}; + std::array v39_dims = {{1, 14, 14, 384}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v39_dims.size(), v39_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v39); @@ -513,9 +594,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v40 = XNN_INVALID_VALUE_ID; - std::array v40_dims = {{1, 14, 14, 576}}; + std::array v40_dims = {{1, 14, 14, 96}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-3, + /*scale=*/0.199631929397583f, v40_dims.size(), v40_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v40); @@ -527,7 +610,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v41 = 
XNN_INVALID_VALUE_ID; std::array v41_dims = {{1, 14, 14, 576}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v41_dims.size(), v41_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v41); @@ -537,9 +622,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v42 = XNN_INVALID_VALUE_ID; - std::array v42_dims = {{1, 14, 14, 96}}; + std::array v42_dims = {{1, 14, 14, 576}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v42_dims.size(), v42_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v42); @@ -551,7 +638,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v43 = XNN_INVALID_VALUE_ID; std::array v43_dims = {{1, 14, 14, 96}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-5, + /*scale=*/0.1339312046766281f, v43_dims.size(), v43_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v43); @@ -561,9 +650,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v44 = XNN_INVALID_VALUE_ID; - std::array v44_dims = {{1, 14, 14, 576}}; + std::array v44_dims = {{1, 14, 14, 96}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-3, + /*scale=*/0.2115112543106079f, v44_dims.size(), v44_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v44); @@ -575,7 +666,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v45 = XNN_INVALID_VALUE_ID; std::array v45_dims = {{1, 14, 14, 576}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + 
/*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v45_dims.size(), v45_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v45); @@ -585,9 +678,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v46 = XNN_INVALID_VALUE_ID; - std::array v46_dims = {{1, 14, 14, 96}}; + std::array v46_dims = {{1, 14, 14, 576}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v46_dims.size(), v46_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v46); @@ -599,7 +694,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v47 = XNN_INVALID_VALUE_ID; std::array v47_dims = {{1, 14, 14, 96}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/13, + /*scale=*/0.20608632266521454f, v47_dims.size(), v47_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v47); @@ -609,9 +706,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v48 = XNN_INVALID_VALUE_ID; - std::array v48_dims = {{1, 14, 14, 576}}; + std::array v48_dims = {{1, 14, 14, 96}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/4, + /*scale=*/0.2694852650165558f, v48_dims.size(), v48_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v48); @@ -621,9 +720,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v49 = XNN_INVALID_VALUE_ID; - std::array v49_dims = {{1, 7, 7, 576}}; + std::array v49_dims = {{1, 14, 14, 576}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v49_dims.size(), v49_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v49); @@ -633,9 
+734,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v50 = XNN_INVALID_VALUE_ID; - std::array v50_dims = {{1, 7, 7, 160}}; + std::array v50_dims = {{1, 7, 7, 576}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v50_dims.size(), v50_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v50); @@ -645,9 +748,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v51 = XNN_INVALID_VALUE_ID; - std::array v51_dims = {{1, 7, 7, 960}}; + std::array v51_dims = {{1, 7, 7, 160}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-3, + /*scale=*/0.15931324660778046f, v51_dims.size(), v51_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v51); @@ -659,7 +764,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v52 = XNN_INVALID_VALUE_ID; std::array v52_dims = {{1, 7, 7, 960}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v52_dims.size(), v52_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v52); @@ -669,9 +776,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v53 = XNN_INVALID_VALUE_ID; - std::array v53_dims = {{1, 7, 7, 160}}; + std::array v53_dims = {{1, 7, 7, 960}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v53_dims.size(), v53_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v53); @@ -683,7 +792,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v54 = XNN_INVALID_VALUE_ID; std::array v54_dims = {{1, 7, 7, 160}}; status = xnn_define_quantized_tensor_value( - 
subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/1, + /*scale=*/0.10275092720985413f, v54_dims.size(), v54_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v54); @@ -693,9 +804,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v55 = XNN_INVALID_VALUE_ID; - std::array v55_dims = {{1, 7, 7, 960}}; + std::array v55_dims = {{1, 7, 7, 160}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-6, + /*scale=*/0.1888202577829361f, v55_dims.size(), v55_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v55); @@ -707,7 +820,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v56 = XNN_INVALID_VALUE_ID; std::array v56_dims = {{1, 7, 7, 960}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v56_dims.size(), v56_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v56); @@ -717,9 +832,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v57 = XNN_INVALID_VALUE_ID; - std::array v57_dims = {{1, 7, 7, 160}}; + std::array v57_dims = {{1, 7, 7, 960}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v57_dims.size(), v57_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v57); @@ -731,7 +848,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v58 = XNN_INVALID_VALUE_ID; std::array v58_dims = {{1, 7, 7, 160}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/7, + /*scale=*/0.22013002634048462f, v58_dims.size(), v58_dims.data(), /*data=*/nullptr, 
XNN_INVALID_VALUE_ID, /*flags=*/0, &v58); @@ -741,9 +860,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v59 = XNN_INVALID_VALUE_ID; - std::array v59_dims = {{1, 7, 7, 960}}; + std::array v59_dims = {{1, 7, 7, 160}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-5, + /*scale=*/0.3162474036216736f, v59_dims.size(), v59_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v59); @@ -755,7 +876,9 @@ xnn_subgraph_t QS8MobileNetV2() { uint32_t v60 = XNN_INVALID_VALUE_ID; std::array v60_dims = {{1, 7, 7, 960}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v60_dims.size(), v60_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v60); @@ -765,9 +888,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v61 = XNN_INVALID_VALUE_ID; - std::array v61_dims = {{1, 7, 7, 320}}; + std::array v61_dims = {{1, 7, 7, 960}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v61_dims.size(), v61_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v61); @@ -777,9 +902,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v62 = XNN_INVALID_VALUE_ID; - std::array v62_dims = {{1, 7, 7, 1280}}; + std::array v62_dims = {{1, 7, 7, 320}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-1, + /*scale=*/0.1037522703409195f, v62_dims.size(), v62_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v62); @@ -789,25 +916,25 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v63 = XNN_INVALID_VALUE_ID; - std::array v63_dims = {{1, 1, 
1, 1280}}; + std::array v63_dims = {{1, 7, 7, 1280}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.0235294122248888f, v63_dims.size(), v63_dims.data(), /*data=*/nullptr, -#if 0 XNN_INVALID_VALUE_ID, /*flags=*/0, &v63); -#else - 0, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v63); -#endif if (status != xnn_status_success) { std::cerr << "failed to create tensor v63" << std::endl; return nullptr; } -#if 0 + uint32_t v64 = XNN_INVALID_VALUE_ID; - std::array v64_dims = {{1, 1, 1, 1008}}; + std::array v64_dims = {{1, 1280}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-128, + /*scale=*/0.018486851826310158f, v64_dims.size(), v64_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v64); @@ -817,9 +944,11 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v65 = XNN_INVALID_VALUE_ID; - std::array v65_dims = {{1, 1008}}; + std::array v65_dims = {{1, 1001}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/-53, + /*scale=*/0.07755904644727707f, v65_dims.size(), v65_dims.data(), /*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &v65); @@ -829,24 +958,29 @@ xnn_subgraph_t QS8MobileNetV2() { } uint32_t v66 = XNN_INVALID_VALUE_ID; - std::array v66_dims = {{1, 1008}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + std::array v66_dims = {{1, 1001}}; + status = xnn_define_tensor_value( + subgraph, xnn_datatype_fp32, v66_dims.size(), v66_dims.data(), /*data=*/nullptr, - 0, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v66); + 1, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v66); if (status != xnn_status_success) { std::cerr << "failed to create tensor v66" << std::endl; return nullptr; 
} -#endif - + alignas(16) static std::array w67_data; uint32_t w67 = XNN_INVALID_VALUE_ID; std::array w67_dims = {{32, 3, 3, 3}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w67_dims.size(), w67_dims.data(), + std::array w67_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w67_scale.begin(), w67_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w67_scale.data(), + w67_dims.size(), 0, w67_dims.data(), /*data=*/w67_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w67); if (status != xnn_status_success) { @@ -857,9 +991,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w68_data; uint32_t w68 = XNN_INVALID_VALUE_ID; std::array w68_dims = {{32}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w68_dims.size(), w68_dims.data(), + std::array w68_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w68_scale.begin(), w68_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w68_scale.data(), + w68_dims.size(), 0, w68_dims.data(), /*data=*/w68_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w68); if (status != xnn_status_success) { @@ -870,9 +1010,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w69_data; uint32_t w69 = XNN_INVALID_VALUE_ID; std::array w69_dims = {{1, 3, 3, 32}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w69_dims.size(), w69_dims.data(), + std::array w69_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w69_scale.begin(), w69_scale.end(), 
std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w69_scale.data(), + w69_dims.size(), 3, w69_dims.data(), /*data=*/w69_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w69); if (status != xnn_status_success) { @@ -883,9 +1029,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w70_data; uint32_t w70 = XNN_INVALID_VALUE_ID; std::array w70_dims = {{32}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w70_dims.size(), w70_dims.data(), + std::array w70_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w70_scale.begin(), w70_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w70_scale.data(), + w70_dims.size(), 0, w70_dims.data(), /*data=*/w70_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w70); if (status != xnn_status_success) { @@ -896,9 +1048,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w71_data; uint32_t w71 = XNN_INVALID_VALUE_ID; std::array w71_dims = {{16, 1, 1, 32}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w71_dims.size(), w71_dims.data(), + std::array w71_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w71_scale.begin(), w71_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w71_scale.data(), + w71_dims.size(), 0, w71_dims.data(), /*data=*/w71_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w71); if (status != xnn_status_success) { @@ -909,9 +1067,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w72_data; uint32_t w72 = XNN_INVALID_VALUE_ID; std::array w72_dims = 
{{16}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w72_dims.size(), w72_dims.data(), + std::array w72_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w72_scale.begin(), w72_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w72_scale.data(), + w72_dims.size(), 0, w72_dims.data(), /*data=*/w72_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w72); if (status != xnn_status_success) { @@ -922,9 +1086,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w73_data; uint32_t w73 = XNN_INVALID_VALUE_ID; std::array w73_dims = {{96, 1, 1, 16}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w73_dims.size(), w73_dims.data(), + std::array w73_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w73_scale.begin(), w73_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w73_scale.data(), + w73_dims.size(), 0, w73_dims.data(), /*data=*/w73_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w73); if (status != xnn_status_success) { @@ -935,9 +1105,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w74_data; uint32_t w74 = XNN_INVALID_VALUE_ID; std::array w74_dims = {{96}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w74_dims.size(), w74_dims.data(), + std::array w74_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w74_scale.begin(), w74_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + 
/*scale=*/w74_scale.data(), + w74_dims.size(), 0, w74_dims.data(), /*data=*/w74_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w74); if (status != xnn_status_success) { @@ -948,9 +1124,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w75_data; uint32_t w75 = XNN_INVALID_VALUE_ID; std::array w75_dims = {{1, 3, 3, 96}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w75_dims.size(), w75_dims.data(), + std::array w75_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w75_scale.begin(), w75_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w75_scale.data(), + w75_dims.size(), 3, w75_dims.data(), /*data=*/w75_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w75); if (status != xnn_status_success) { @@ -961,9 +1143,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w76_data; uint32_t w76 = XNN_INVALID_VALUE_ID; std::array w76_dims = {{96}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w76_dims.size(), w76_dims.data(), + std::array w76_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w76_scale.begin(), w76_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w76_scale.data(), + w76_dims.size(), 0, w76_dims.data(), /*data=*/w76_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w76); if (status != xnn_status_success) { @@ -974,9 +1162,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w77_data; uint32_t w77 = XNN_INVALID_VALUE_ID; std::array w77_dims = {{24, 1, 1, 96}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, 
/*scale=*/1.0f, - w77_dims.size(), w77_dims.data(), + std::array w77_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w77_scale.begin(), w77_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w77_scale.data(), + w77_dims.size(), 0, w77_dims.data(), /*data=*/w77_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w77); if (status != xnn_status_success) { @@ -987,9 +1181,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w78_data; uint32_t w78 = XNN_INVALID_VALUE_ID; std::array w78_dims = {{24}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w78_dims.size(), w78_dims.data(), + std::array w78_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w78_scale.begin(), w78_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w78_scale.data(), + w78_dims.size(), 0, w78_dims.data(), /*data=*/w78_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w78); if (status != xnn_status_success) { @@ -1000,9 +1200,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w79_data; uint32_t w79 = XNN_INVALID_VALUE_ID; std::array w79_dims = {{144, 1, 1, 24}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w79_dims.size(), w79_dims.data(), + std::array w79_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w79_scale.begin(), w79_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w79_scale.data(), + w79_dims.size(), 0, w79_dims.data(), /*data=*/w79_data.data(), 
XNN_INVALID_VALUE_ID, /*flags=*/0, &w79); if (status != xnn_status_success) { @@ -1013,9 +1219,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w80_data; uint32_t w80 = XNN_INVALID_VALUE_ID; std::array w80_dims = {{144}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w80_dims.size(), w80_dims.data(), + std::array w80_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w80_scale.begin(), w80_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w80_scale.data(), + w80_dims.size(), 0, w80_dims.data(), /*data=*/w80_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w80); if (status != xnn_status_success) { @@ -1026,9 +1238,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w81_data; uint32_t w81 = XNN_INVALID_VALUE_ID; std::array w81_dims = {{1, 3, 3, 144}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w81_dims.size(), w81_dims.data(), + std::array w81_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w81_scale.begin(), w81_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w81_scale.data(), + w81_dims.size(), 3, w81_dims.data(), /*data=*/w81_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w81); if (status != xnn_status_success) { @@ -1039,9 +1257,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w82_data; uint32_t w82 = XNN_INVALID_VALUE_ID; std::array w82_dims = {{144}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w82_dims.size(), w82_dims.data(), + std::array w82_scale; + { + auto scalerng = 
std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w82_scale.begin(), w82_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w82_scale.data(), + w82_dims.size(), 0, w82_dims.data(), /*data=*/w82_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w82); if (status != xnn_status_success) { @@ -1052,9 +1276,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w83_data; uint32_t w83 = XNN_INVALID_VALUE_ID; std::array w83_dims = {{24, 1, 1, 144}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w83_dims.size(), w83_dims.data(), + std::array w83_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w83_scale.begin(), w83_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w83_scale.data(), + w83_dims.size(), 0, w83_dims.data(), /*data=*/w83_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w83); if (status != xnn_status_success) { @@ -1065,9 +1295,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w84_data; uint32_t w84 = XNN_INVALID_VALUE_ID; std::array w84_dims = {{24}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w84_dims.size(), w84_dims.data(), + std::array w84_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w84_scale.begin(), w84_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w84_scale.data(), + w84_dims.size(), 0, w84_dims.data(), /*data=*/w84_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w84); if (status != xnn_status_success) { @@ -1078,9 +1314,15 @@ 
xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w85_data; uint32_t w85 = XNN_INVALID_VALUE_ID; std::array w85_dims = {{144, 1, 1, 24}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w85_dims.size(), w85_dims.data(), + std::array w85_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w85_scale.begin(), w85_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w85_scale.data(), + w85_dims.size(), 0, w85_dims.data(), /*data=*/w85_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w85); if (status != xnn_status_success) { @@ -1091,9 +1333,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w86_data; uint32_t w86 = XNN_INVALID_VALUE_ID; std::array w86_dims = {{144}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w86_dims.size(), w86_dims.data(), + std::array w86_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w86_scale.begin(), w86_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w86_scale.data(), + w86_dims.size(), 0, w86_dims.data(), /*data=*/w86_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w86); if (status != xnn_status_success) { @@ -1104,9 +1352,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w87_data; uint32_t w87 = XNN_INVALID_VALUE_ID; std::array w87_dims = {{1, 3, 3, 144}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w87_dims.size(), w87_dims.data(), + std::array w87_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + 
std::generate(w87_scale.begin(), w87_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w87_scale.data(), + w87_dims.size(), 3, w87_dims.data(), /*data=*/w87_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w87); if (status != xnn_status_success) { @@ -1117,9 +1371,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w88_data; uint32_t w88 = XNN_INVALID_VALUE_ID; std::array w88_dims = {{144}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w88_dims.size(), w88_dims.data(), + std::array w88_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w88_scale.begin(), w88_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w88_scale.data(), + w88_dims.size(), 0, w88_dims.data(), /*data=*/w88_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w88); if (status != xnn_status_success) { @@ -1130,9 +1390,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w89_data; uint32_t w89 = XNN_INVALID_VALUE_ID; std::array w89_dims = {{32, 1, 1, 144}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w89_dims.size(), w89_dims.data(), + std::array w89_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w89_scale.begin(), w89_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w89_scale.data(), + w89_dims.size(), 0, w89_dims.data(), /*data=*/w89_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w89); if (status != xnn_status_success) { @@ -1143,9 +1409,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w90_data; uint32_t 
w90 = XNN_INVALID_VALUE_ID; std::array w90_dims = {{32}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w90_dims.size(), w90_dims.data(), + std::array w90_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w90_scale.begin(), w90_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w90_scale.data(), + w90_dims.size(), 0, w90_dims.data(), /*data=*/w90_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w90); if (status != xnn_status_success) { @@ -1156,9 +1428,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w91_data; uint32_t w91 = XNN_INVALID_VALUE_ID; std::array w91_dims = {{192, 1, 1, 32}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w91_dims.size(), w91_dims.data(), + std::array w91_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w91_scale.begin(), w91_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w91_scale.data(), + w91_dims.size(), 0, w91_dims.data(), /*data=*/w91_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w91); if (status != xnn_status_success) { @@ -1169,9 +1447,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w92_data; uint32_t w92 = XNN_INVALID_VALUE_ID; std::array w92_dims = {{192}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w92_dims.size(), w92_dims.data(), + std::array w92_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w92_scale.begin(), w92_scale.end(), std::ref(scalerng)); + } + status = 
xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w92_scale.data(), + w92_dims.size(), 0, w92_dims.data(), /*data=*/w92_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w92); if (status != xnn_status_success) { @@ -1182,9 +1466,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w93_data; uint32_t w93 = XNN_INVALID_VALUE_ID; std::array w93_dims = {{1, 3, 3, 192}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w93_dims.size(), w93_dims.data(), + std::array w93_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w93_scale.begin(), w93_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w93_scale.data(), + w93_dims.size(), 3, w93_dims.data(), /*data=*/w93_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w93); if (status != xnn_status_success) { @@ -1195,9 +1485,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w94_data; uint32_t w94 = XNN_INVALID_VALUE_ID; std::array w94_dims = {{192}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w94_dims.size(), w94_dims.data(), + std::array w94_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w94_scale.begin(), w94_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w94_scale.data(), + w94_dims.size(), 0, w94_dims.data(), /*data=*/w94_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w94); if (status != xnn_status_success) { @@ -1208,9 +1504,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w95_data; uint32_t w95 = XNN_INVALID_VALUE_ID; std::array w95_dims = {{32, 1, 1, 192}}; - status = 
xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w95_dims.size(), w95_dims.data(), + std::array w95_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w95_scale.begin(), w95_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w95_scale.data(), + w95_dims.size(), 0, w95_dims.data(), /*data=*/w95_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w95); if (status != xnn_status_success) { @@ -1221,9 +1523,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w96_data; uint32_t w96 = XNN_INVALID_VALUE_ID; std::array w96_dims = {{32}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w96_dims.size(), w96_dims.data(), + std::array w96_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w96_scale.begin(), w96_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w96_scale.data(), + w96_dims.size(), 0, w96_dims.data(), /*data=*/w96_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w96); if (status != xnn_status_success) { @@ -1234,9 +1542,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w97_data; uint32_t w97 = XNN_INVALID_VALUE_ID; std::array w97_dims = {{192, 1, 1, 32}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w97_dims.size(), w97_dims.data(), + std::array w97_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w97_scale.begin(), w97_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + 
/*scale=*/w97_scale.data(), + w97_dims.size(), 0, w97_dims.data(), /*data=*/w97_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w97); if (status != xnn_status_success) { @@ -1247,9 +1561,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w98_data; uint32_t w98 = XNN_INVALID_VALUE_ID; std::array w98_dims = {{192}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w98_dims.size(), w98_dims.data(), + std::array w98_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w98_scale.begin(), w98_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w98_scale.data(), + w98_dims.size(), 0, w98_dims.data(), /*data=*/w98_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w98); if (status != xnn_status_success) { @@ -1260,9 +1580,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w99_data; uint32_t w99 = XNN_INVALID_VALUE_ID; std::array w99_dims = {{1, 3, 3, 192}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w99_dims.size(), w99_dims.data(), + std::array w99_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w99_scale.begin(), w99_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w99_scale.data(), + w99_dims.size(), 3, w99_dims.data(), /*data=*/w99_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w99); if (status != xnn_status_success) { @@ -1273,9 +1599,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w100_data; uint32_t w100 = XNN_INVALID_VALUE_ID; std::array w100_dims = {{192}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, 
/*scale=*/1.0f, - w100_dims.size(), w100_dims.data(), + std::array w100_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w100_scale.begin(), w100_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w100_scale.data(), + w100_dims.size(), 0, w100_dims.data(), /*data=*/w100_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w100); if (status != xnn_status_success) { @@ -1286,9 +1618,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w101_data; uint32_t w101 = XNN_INVALID_VALUE_ID; std::array w101_dims = {{32, 1, 1, 192}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w101_dims.size(), w101_dims.data(), + std::array w101_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w101_scale.begin(), w101_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w101_scale.data(), + w101_dims.size(), 0, w101_dims.data(), /*data=*/w101_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w101); if (status != xnn_status_success) { @@ -1299,9 +1637,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w102_data; uint32_t w102 = XNN_INVALID_VALUE_ID; std::array w102_dims = {{32}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w102_dims.size(), w102_dims.data(), + std::array w102_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w102_scale.begin(), w102_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w102_scale.data(), + w102_dims.size(), 0, w102_dims.data(), 
/*data=*/w102_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w102); if (status != xnn_status_success) { @@ -1312,9 +1656,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w103_data; uint32_t w103 = XNN_INVALID_VALUE_ID; std::array w103_dims = {{192, 1, 1, 32}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w103_dims.size(), w103_dims.data(), + std::array w103_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w103_scale.begin(), w103_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w103_scale.data(), + w103_dims.size(), 0, w103_dims.data(), /*data=*/w103_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w103); if (status != xnn_status_success) { @@ -1325,9 +1675,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w104_data; uint32_t w104 = XNN_INVALID_VALUE_ID; std::array w104_dims = {{192}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w104_dims.size(), w104_dims.data(), + std::array w104_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w104_scale.begin(), w104_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w104_scale.data(), + w104_dims.size(), 0, w104_dims.data(), /*data=*/w104_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w104); if (status != xnn_status_success) { @@ -1338,9 +1694,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w105_data; uint32_t w105 = XNN_INVALID_VALUE_ID; std::array w105_dims = {{1, 3, 3, 192}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w105_dims.size(), 
w105_dims.data(), + std::array w105_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w105_scale.begin(), w105_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w105_scale.data(), + w105_dims.size(), 3, w105_dims.data(), /*data=*/w105_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w105); if (status != xnn_status_success) { @@ -1351,9 +1713,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w106_data; uint32_t w106 = XNN_INVALID_VALUE_ID; std::array w106_dims = {{192}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w106_dims.size(), w106_dims.data(), + std::array w106_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w106_scale.begin(), w106_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w106_scale.data(), + w106_dims.size(), 0, w106_dims.data(), /*data=*/w106_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w106); if (status != xnn_status_success) { @@ -1364,9 +1732,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w107_data; uint32_t w107 = XNN_INVALID_VALUE_ID; std::array w107_dims = {{64, 1, 1, 192}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w107_dims.size(), w107_dims.data(), + std::array w107_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w107_scale.begin(), w107_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w107_scale.data(), + w107_dims.size(), 0, w107_dims.data(), /*data=*/w107_data.data(), 
XNN_INVALID_VALUE_ID, /*flags=*/0, &w107); if (status != xnn_status_success) { @@ -1377,9 +1751,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w108_data; uint32_t w108 = XNN_INVALID_VALUE_ID; std::array w108_dims = {{64}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w108_dims.size(), w108_dims.data(), + std::array w108_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w108_scale.begin(), w108_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w108_scale.data(), + w108_dims.size(), 0, w108_dims.data(), /*data=*/w108_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w108); if (status != xnn_status_success) { @@ -1390,9 +1770,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w109_data; uint32_t w109 = XNN_INVALID_VALUE_ID; std::array w109_dims = {{384, 1, 1, 64}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w109_dims.size(), w109_dims.data(), + std::array w109_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w109_scale.begin(), w109_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w109_scale.data(), + w109_dims.size(), 0, w109_dims.data(), /*data=*/w109_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w109); if (status != xnn_status_success) { @@ -1403,9 +1789,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w110_data; uint32_t w110 = XNN_INVALID_VALUE_ID; std::array w110_dims = {{384}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w110_dims.size(), w110_dims.data(), + std::array 
w110_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w110_scale.begin(), w110_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w110_scale.data(), + w110_dims.size(), 0, w110_dims.data(), /*data=*/w110_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w110); if (status != xnn_status_success) { @@ -1416,9 +1808,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w111_data; uint32_t w111 = XNN_INVALID_VALUE_ID; std::array w111_dims = {{1, 3, 3, 384}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w111_dims.size(), w111_dims.data(), + std::array w111_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w111_scale.begin(), w111_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w111_scale.data(), + w111_dims.size(), 3, w111_dims.data(), /*data=*/w111_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w111); if (status != xnn_status_success) { @@ -1429,9 +1827,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w112_data; uint32_t w112 = XNN_INVALID_VALUE_ID; std::array w112_dims = {{384}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w112_dims.size(), w112_dims.data(), + std::array w112_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w112_scale.begin(), w112_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w112_scale.data(), + w112_dims.size(), 0, w112_dims.data(), /*data=*/w112_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w112); if 
(status != xnn_status_success) { @@ -1442,9 +1846,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w113_data; uint32_t w113 = XNN_INVALID_VALUE_ID; std::array w113_dims = {{64, 1, 1, 384}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w113_dims.size(), w113_dims.data(), + std::array w113_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w113_scale.begin(), w113_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w113_scale.data(), + w113_dims.size(), 0, w113_dims.data(), /*data=*/w113_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w113); if (status != xnn_status_success) { @@ -1455,9 +1865,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w114_data; uint32_t w114 = XNN_INVALID_VALUE_ID; std::array w114_dims = {{64}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w114_dims.size(), w114_dims.data(), + std::array w114_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w114_scale.begin(), w114_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w114_scale.data(), + w114_dims.size(), 0, w114_dims.data(), /*data=*/w114_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w114); if (status != xnn_status_success) { @@ -1468,9 +1884,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w115_data; uint32_t w115 = XNN_INVALID_VALUE_ID; std::array w115_dims = {{384, 1, 1, 64}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w115_dims.size(), w115_dims.data(), + std::array w115_scale; + { + auto scalerng = 
std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w115_scale.begin(), w115_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w115_scale.data(), + w115_dims.size(), 0, w115_dims.data(), /*data=*/w115_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w115); if (status != xnn_status_success) { @@ -1481,9 +1903,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w116_data; uint32_t w116 = XNN_INVALID_VALUE_ID; std::array w116_dims = {{384}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w116_dims.size(), w116_dims.data(), + std::array w116_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w116_scale.begin(), w116_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w116_scale.data(), + w116_dims.size(), 0, w116_dims.data(), /*data=*/w116_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w116); if (status != xnn_status_success) { @@ -1494,9 +1922,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w117_data; uint32_t w117 = XNN_INVALID_VALUE_ID; std::array w117_dims = {{1, 3, 3, 384}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w117_dims.size(), w117_dims.data(), + std::array w117_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w117_scale.begin(), w117_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w117_scale.data(), + w117_dims.size(), 3, w117_dims.data(), /*data=*/w117_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w117); if (status != xnn_status_success) { 
@@ -1507,9 +1941,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w118_data; uint32_t w118 = XNN_INVALID_VALUE_ID; std::array w118_dims = {{384}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w118_dims.size(), w118_dims.data(), + std::array w118_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w118_scale.begin(), w118_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w118_scale.data(), + w118_dims.size(), 0, w118_dims.data(), /*data=*/w118_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w118); if (status != xnn_status_success) { @@ -1520,9 +1960,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w119_data; uint32_t w119 = XNN_INVALID_VALUE_ID; std::array w119_dims = {{64, 1, 1, 384}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w119_dims.size(), w119_dims.data(), + std::array w119_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w119_scale.begin(), w119_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w119_scale.data(), + w119_dims.size(), 0, w119_dims.data(), /*data=*/w119_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w119); if (status != xnn_status_success) { @@ -1533,9 +1979,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w120_data; uint32_t w120 = XNN_INVALID_VALUE_ID; std::array w120_dims = {{64}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w120_dims.size(), w120_dims.data(), + std::array w120_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), 
std::ref(rng)); + std::generate(w120_scale.begin(), w120_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w120_scale.data(), + w120_dims.size(), 0, w120_dims.data(), /*data=*/w120_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w120); if (status != xnn_status_success) { @@ -1546,9 +1998,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w121_data; uint32_t w121 = XNN_INVALID_VALUE_ID; std::array w121_dims = {{384, 1, 1, 64}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w121_dims.size(), w121_dims.data(), + std::array w121_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w121_scale.begin(), w121_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w121_scale.data(), + w121_dims.size(), 0, w121_dims.data(), /*data=*/w121_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w121); if (status != xnn_status_success) { @@ -1559,9 +2017,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w122_data; uint32_t w122 = XNN_INVALID_VALUE_ID; std::array w122_dims = {{384}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w122_dims.size(), w122_dims.data(), + std::array w122_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w122_scale.begin(), w122_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w122_scale.data(), + w122_dims.size(), 0, w122_dims.data(), /*data=*/w122_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w122); if (status != xnn_status_success) { @@ -1572,9 +2036,15 @@ xnn_subgraph_t QS8MobileNetV2() 
{ alignas(16) static std::array w123_data; uint32_t w123 = XNN_INVALID_VALUE_ID; std::array w123_dims = {{1, 3, 3, 384}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w123_dims.size(), w123_dims.data(), + std::array w123_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w123_scale.begin(), w123_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w123_scale.data(), + w123_dims.size(), 3, w123_dims.data(), /*data=*/w123_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w123); if (status != xnn_status_success) { @@ -1585,9 +2055,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w124_data; uint32_t w124 = XNN_INVALID_VALUE_ID; std::array w124_dims = {{384}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w124_dims.size(), w124_dims.data(), + std::array w124_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w124_scale.begin(), w124_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w124_scale.data(), + w124_dims.size(), 0, w124_dims.data(), /*data=*/w124_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w124); if (status != xnn_status_success) { @@ -1598,9 +2074,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w125_data; uint32_t w125 = XNN_INVALID_VALUE_ID; std::array w125_dims = {{64, 1, 1, 384}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w125_dims.size(), w125_dims.data(), + std::array w125_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + 
std::generate(w125_scale.begin(), w125_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w125_scale.data(), + w125_dims.size(), 0, w125_dims.data(), /*data=*/w125_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w125); if (status != xnn_status_success) { @@ -1611,9 +2093,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w126_data; uint32_t w126 = XNN_INVALID_VALUE_ID; std::array w126_dims = {{64}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w126_dims.size(), w126_dims.data(), + std::array w126_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w126_scale.begin(), w126_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w126_scale.data(), + w126_dims.size(), 0, w126_dims.data(), /*data=*/w126_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w126); if (status != xnn_status_success) { @@ -1624,9 +2112,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w127_data; uint32_t w127 = XNN_INVALID_VALUE_ID; std::array w127_dims = {{384, 1, 1, 64}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w127_dims.size(), w127_dims.data(), + std::array w127_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w127_scale.begin(), w127_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w127_scale.data(), + w127_dims.size(), 0, w127_dims.data(), /*data=*/w127_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w127); if (status != xnn_status_success) { @@ -1637,9 +2131,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) 
static std::array w128_data; uint32_t w128 = XNN_INVALID_VALUE_ID; std::array w128_dims = {{384}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w128_dims.size(), w128_dims.data(), + std::array w128_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w128_scale.begin(), w128_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w128_scale.data(), + w128_dims.size(), 0, w128_dims.data(), /*data=*/w128_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w128); if (status != xnn_status_success) { @@ -1650,9 +2150,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w129_data; uint32_t w129 = XNN_INVALID_VALUE_ID; std::array w129_dims = {{1, 3, 3, 384}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w129_dims.size(), w129_dims.data(), + std::array w129_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w129_scale.begin(), w129_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w129_scale.data(), + w129_dims.size(), 3, w129_dims.data(), /*data=*/w129_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w129); if (status != xnn_status_success) { @@ -1663,9 +2169,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w130_data; uint32_t w130 = XNN_INVALID_VALUE_ID; std::array w130_dims = {{384}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w130_dims.size(), w130_dims.data(), + std::array w130_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w130_scale.begin(), 
w130_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w130_scale.data(), + w130_dims.size(), 0, w130_dims.data(), /*data=*/w130_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w130); if (status != xnn_status_success) { @@ -1676,9 +2188,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w131_data; uint32_t w131 = XNN_INVALID_VALUE_ID; std::array w131_dims = {{96, 1, 1, 384}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w131_dims.size(), w131_dims.data(), + std::array w131_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w131_scale.begin(), w131_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w131_scale.data(), + w131_dims.size(), 0, w131_dims.data(), /*data=*/w131_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w131); if (status != xnn_status_success) { @@ -1689,9 +2207,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w132_data; uint32_t w132 = XNN_INVALID_VALUE_ID; std::array w132_dims = {{96}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w132_dims.size(), w132_dims.data(), + std::array w132_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w132_scale.begin(), w132_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w132_scale.data(), + w132_dims.size(), 0, w132_dims.data(), /*data=*/w132_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w132); if (status != xnn_status_success) { @@ -1702,9 +2226,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w133_data; uint32_t 
w133 = XNN_INVALID_VALUE_ID; std::array w133_dims = {{576, 1, 1, 96}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w133_dims.size(), w133_dims.data(), + std::array w133_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w133_scale.begin(), w133_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w133_scale.data(), + w133_dims.size(), 0, w133_dims.data(), /*data=*/w133_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w133); if (status != xnn_status_success) { @@ -1715,9 +2245,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w134_data; uint32_t w134 = XNN_INVALID_VALUE_ID; std::array w134_dims = {{576}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w134_dims.size(), w134_dims.data(), + std::array w134_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w134_scale.begin(), w134_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w134_scale.data(), + w134_dims.size(), 0, w134_dims.data(), /*data=*/w134_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w134); if (status != xnn_status_success) { @@ -1728,9 +2264,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w135_data; uint32_t w135 = XNN_INVALID_VALUE_ID; std::array w135_dims = {{1, 3, 3, 576}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w135_dims.size(), w135_dims.data(), + std::array w135_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w135_scale.begin(), w135_scale.end(), std::ref(scalerng)); + } + 
status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w135_scale.data(), + w135_dims.size(), 3, w135_dims.data(), /*data=*/w135_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w135); if (status != xnn_status_success) { @@ -1741,9 +2283,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w136_data; uint32_t w136 = XNN_INVALID_VALUE_ID; std::array w136_dims = {{576}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w136_dims.size(), w136_dims.data(), + std::array w136_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w136_scale.begin(), w136_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w136_scale.data(), + w136_dims.size(), 0, w136_dims.data(), /*data=*/w136_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w136); if (status != xnn_status_success) { @@ -1754,9 +2302,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w137_data; uint32_t w137 = XNN_INVALID_VALUE_ID; std::array w137_dims = {{96, 1, 1, 576}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w137_dims.size(), w137_dims.data(), + std::array w137_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w137_scale.begin(), w137_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w137_scale.data(), + w137_dims.size(), 0, w137_dims.data(), /*data=*/w137_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w137); if (status != xnn_status_success) { @@ -1767,9 +2321,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w138_data; uint32_t w138 = XNN_INVALID_VALUE_ID; std::array 
w138_dims = {{96}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w138_dims.size(), w138_dims.data(), + std::array w138_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w138_scale.begin(), w138_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w138_scale.data(), + w138_dims.size(), 0, w138_dims.data(), /*data=*/w138_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w138); if (status != xnn_status_success) { @@ -1780,9 +2340,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w139_data; uint32_t w139 = XNN_INVALID_VALUE_ID; std::array w139_dims = {{576, 1, 1, 96}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w139_dims.size(), w139_dims.data(), + std::array w139_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w139_scale.begin(), w139_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w139_scale.data(), + w139_dims.size(), 0, w139_dims.data(), /*data=*/w139_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w139); if (status != xnn_status_success) { @@ -1793,9 +2359,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w140_data; uint32_t w140 = XNN_INVALID_VALUE_ID; std::array w140_dims = {{576}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w140_dims.size(), w140_dims.data(), + std::array w140_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w140_scale.begin(), w140_scale.end(), std::ref(scalerng)); + } + status = 
xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w140_scale.data(), + w140_dims.size(), 0, w140_dims.data(), /*data=*/w140_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w140); if (status != xnn_status_success) { @@ -1806,9 +2378,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w141_data; uint32_t w141 = XNN_INVALID_VALUE_ID; std::array w141_dims = {{1, 3, 3, 576}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w141_dims.size(), w141_dims.data(), + std::array w141_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w141_scale.begin(), w141_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w141_scale.data(), + w141_dims.size(), 3, w141_dims.data(), /*data=*/w141_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w141); if (status != xnn_status_success) { @@ -1819,9 +2397,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w142_data; uint32_t w142 = XNN_INVALID_VALUE_ID; std::array w142_dims = {{576}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w142_dims.size(), w142_dims.data(), + std::array w142_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w142_scale.begin(), w142_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w142_scale.data(), + w142_dims.size(), 0, w142_dims.data(), /*data=*/w142_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w142); if (status != xnn_status_success) { @@ -1832,9 +2416,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w143_data; uint32_t w143 = XNN_INVALID_VALUE_ID; std::array w143_dims = 
{{96, 1, 1, 576}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w143_dims.size(), w143_dims.data(), + std::array w143_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w143_scale.begin(), w143_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w143_scale.data(), + w143_dims.size(), 0, w143_dims.data(), /*data=*/w143_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w143); if (status != xnn_status_success) { @@ -1845,9 +2435,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w144_data; uint32_t w144 = XNN_INVALID_VALUE_ID; std::array w144_dims = {{96}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w144_dims.size(), w144_dims.data(), + std::array w144_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w144_scale.begin(), w144_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w144_scale.data(), + w144_dims.size(), 0, w144_dims.data(), /*data=*/w144_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w144); if (status != xnn_status_success) { @@ -1858,9 +2454,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w145_data; uint32_t w145 = XNN_INVALID_VALUE_ID; std::array w145_dims = {{576, 1, 1, 96}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w145_dims.size(), w145_dims.data(), + std::array w145_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w145_scale.begin(), w145_scale.end(), std::ref(scalerng)); + } + status = 
xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w145_scale.data(), + w145_dims.size(), 0, w145_dims.data(), /*data=*/w145_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w145); if (status != xnn_status_success) { @@ -1871,9 +2473,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w146_data; uint32_t w146 = XNN_INVALID_VALUE_ID; std::array w146_dims = {{576}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w146_dims.size(), w146_dims.data(), + std::array w146_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w146_scale.begin(), w146_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w146_scale.data(), + w146_dims.size(), 0, w146_dims.data(), /*data=*/w146_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w146); if (status != xnn_status_success) { @@ -1884,9 +2492,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w147_data; uint32_t w147 = XNN_INVALID_VALUE_ID; std::array w147_dims = {{1, 3, 3, 576}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w147_dims.size(), w147_dims.data(), + std::array w147_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w147_scale.begin(), w147_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w147_scale.data(), + w147_dims.size(), 3, w147_dims.data(), /*data=*/w147_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w147); if (status != xnn_status_success) { @@ -1897,9 +2511,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w148_data; uint32_t w148 = XNN_INVALID_VALUE_ID; std::array w148_dims = 
{{576}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w148_dims.size(), w148_dims.data(), + std::array w148_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w148_scale.begin(), w148_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w148_scale.data(), + w148_dims.size(), 0, w148_dims.data(), /*data=*/w148_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w148); if (status != xnn_status_success) { @@ -1910,9 +2530,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w149_data; uint32_t w149 = XNN_INVALID_VALUE_ID; std::array w149_dims = {{160, 1, 1, 576}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w149_dims.size(), w149_dims.data(), + std::array w149_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w149_scale.begin(), w149_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w149_scale.data(), + w149_dims.size(), 0, w149_dims.data(), /*data=*/w149_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w149); if (status != xnn_status_success) { @@ -1923,9 +2549,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w150_data; uint32_t w150 = XNN_INVALID_VALUE_ID; std::array w150_dims = {{160}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w150_dims.size(), w150_dims.data(), + std::array w150_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w150_scale.begin(), w150_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + 
subgraph, xnn_datatype_qcint32, + /*scale=*/w150_scale.data(), + w150_dims.size(), 0, w150_dims.data(), /*data=*/w150_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w150); if (status != xnn_status_success) { @@ -1936,9 +2568,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w151_data; uint32_t w151 = XNN_INVALID_VALUE_ID; std::array w151_dims = {{960, 1, 1, 160}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w151_dims.size(), w151_dims.data(), + std::array w151_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w151_scale.begin(), w151_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w151_scale.data(), + w151_dims.size(), 0, w151_dims.data(), /*data=*/w151_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w151); if (status != xnn_status_success) { @@ -1949,9 +2587,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w152_data; uint32_t w152 = XNN_INVALID_VALUE_ID; std::array w152_dims = {{960}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w152_dims.size(), w152_dims.data(), + std::array w152_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w152_scale.begin(), w152_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w152_scale.data(), + w152_dims.size(), 0, w152_dims.data(), /*data=*/w152_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w152); if (status != xnn_status_success) { @@ -1962,9 +2606,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w153_data; uint32_t w153 = XNN_INVALID_VALUE_ID; std::array w153_dims = {{1, 3, 3, 960}}; - status = 
xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w153_dims.size(), w153_dims.data(), + std::array w153_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w153_scale.begin(), w153_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w153_scale.data(), + w153_dims.size(), 3, w153_dims.data(), /*data=*/w153_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w153); if (status != xnn_status_success) { @@ -1975,9 +2625,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w154_data; uint32_t w154 = XNN_INVALID_VALUE_ID; std::array w154_dims = {{960}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w154_dims.size(), w154_dims.data(), + std::array w154_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w154_scale.begin(), w154_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w154_scale.data(), + w154_dims.size(), 0, w154_dims.data(), /*data=*/w154_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w154); if (status != xnn_status_success) { @@ -1988,9 +2644,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w155_data; uint32_t w155 = XNN_INVALID_VALUE_ID; std::array w155_dims = {{160, 1, 1, 960}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w155_dims.size(), w155_dims.data(), + std::array w155_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w155_scale.begin(), w155_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, 
xnn_datatype_qcint8, + /*scale=*/w155_scale.data(), + w155_dims.size(), 0, w155_dims.data(), /*data=*/w155_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w155); if (status != xnn_status_success) { @@ -2001,9 +2663,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w156_data; uint32_t w156 = XNN_INVALID_VALUE_ID; std::array w156_dims = {{160}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w156_dims.size(), w156_dims.data(), + std::array w156_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w156_scale.begin(), w156_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w156_scale.data(), + w156_dims.size(), 0, w156_dims.data(), /*data=*/w156_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w156); if (status != xnn_status_success) { @@ -2014,9 +2682,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w157_data; uint32_t w157 = XNN_INVALID_VALUE_ID; std::array w157_dims = {{960, 1, 1, 160}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w157_dims.size(), w157_dims.data(), + std::array w157_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w157_scale.begin(), w157_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w157_scale.data(), + w157_dims.size(), 0, w157_dims.data(), /*data=*/w157_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w157); if (status != xnn_status_success) { @@ -2027,9 +2701,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w158_data; uint32_t w158 = XNN_INVALID_VALUE_ID; std::array w158_dims = {{960}}; - status = xnn_define_quantized_tensor_value( - 
subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w158_dims.size(), w158_dims.data(), + std::array w158_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w158_scale.begin(), w158_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w158_scale.data(), + w158_dims.size(), 0, w158_dims.data(), /*data=*/w158_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w158); if (status != xnn_status_success) { @@ -2040,9 +2720,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w159_data; uint32_t w159 = XNN_INVALID_VALUE_ID; std::array w159_dims = {{1, 3, 3, 960}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w159_dims.size(), w159_dims.data(), + std::array w159_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w159_scale.begin(), w159_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w159_scale.data(), + w159_dims.size(), 3, w159_dims.data(), /*data=*/w159_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w159); if (status != xnn_status_success) { @@ -2053,9 +2739,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w160_data; uint32_t w160 = XNN_INVALID_VALUE_ID; std::array w160_dims = {{960}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w160_dims.size(), w160_dims.data(), + std::array w160_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w160_scale.begin(), w160_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + 
/*scale=*/w160_scale.data(), + w160_dims.size(), 0, w160_dims.data(), /*data=*/w160_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w160); if (status != xnn_status_success) { @@ -2066,9 +2758,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w161_data; uint32_t w161 = XNN_INVALID_VALUE_ID; std::array w161_dims = {{160, 1, 1, 960}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w161_dims.size(), w161_dims.data(), + std::array w161_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w161_scale.begin(), w161_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w161_scale.data(), + w161_dims.size(), 0, w161_dims.data(), /*data=*/w161_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w161); if (status != xnn_status_success) { @@ -2079,9 +2777,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w162_data; uint32_t w162 = XNN_INVALID_VALUE_ID; std::array w162_dims = {{160}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w162_dims.size(), w162_dims.data(), + std::array w162_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w162_scale.begin(), w162_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w162_scale.data(), + w162_dims.size(), 0, w162_dims.data(), /*data=*/w162_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w162); if (status != xnn_status_success) { @@ -2092,9 +2796,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w163_data; uint32_t w163 = XNN_INVALID_VALUE_ID; std::array w163_dims = {{960, 1, 1, 160}}; - status = xnn_define_quantized_tensor_value( - subgraph, 
xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w163_dims.size(), w163_dims.data(), + std::array w163_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w163_scale.begin(), w163_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w163_scale.data(), + w163_dims.size(), 0, w163_dims.data(), /*data=*/w163_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w163); if (status != xnn_status_success) { @@ -2105,9 +2815,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w164_data; uint32_t w164 = XNN_INVALID_VALUE_ID; std::array w164_dims = {{960}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w164_dims.size(), w164_dims.data(), + std::array w164_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w164_scale.begin(), w164_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w164_scale.data(), + w164_dims.size(), 0, w164_dims.data(), /*data=*/w164_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w164); if (status != xnn_status_success) { @@ -2118,9 +2834,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w165_data; uint32_t w165 = XNN_INVALID_VALUE_ID; std::array w165_dims = {{1, 3, 3, 960}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w165_dims.size(), w165_dims.data(), + std::array w165_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w165_scale.begin(), w165_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w165_scale.data(), + 
w165_dims.size(), 3, w165_dims.data(), /*data=*/w165_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w165); if (status != xnn_status_success) { @@ -2131,9 +2853,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w166_data; uint32_t w166 = XNN_INVALID_VALUE_ID; std::array w166_dims = {{960}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w166_dims.size(), w166_dims.data(), + std::array w166_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w166_scale.begin(), w166_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w166_scale.data(), + w166_dims.size(), 0, w166_dims.data(), /*data=*/w166_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w166); if (status != xnn_status_success) { @@ -2144,9 +2872,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w167_data; uint32_t w167 = XNN_INVALID_VALUE_ID; std::array w167_dims = {{320, 1, 1, 960}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w167_dims.size(), w167_dims.data(), + std::array w167_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w167_scale.begin(), w167_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w167_scale.data(), + w167_dims.size(), 0, w167_dims.data(), /*data=*/w167_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w167); if (status != xnn_status_success) { @@ -2157,9 +2891,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w168_data; uint32_t w168 = XNN_INVALID_VALUE_ID; std::array w168_dims = {{320}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, 
/*scale=*/1.0f, - w168_dims.size(), w168_dims.data(), + std::array w168_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w168_scale.begin(), w168_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w168_scale.data(), + w168_dims.size(), 0, w168_dims.data(), /*data=*/w168_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w168); if (status != xnn_status_success) { @@ -2170,9 +2910,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w169_data; uint32_t w169 = XNN_INVALID_VALUE_ID; std::array w169_dims = {{1280, 1, 1, 320}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, - w169_dims.size(), w169_dims.data(), + std::array w169_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w169_scale.begin(), w169_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint8, + /*scale=*/w169_scale.data(), + w169_dims.size(), 0, w169_dims.data(), /*data=*/w169_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w169); if (status != xnn_status_success) { @@ -2183,9 +2929,15 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w170_data; uint32_t w170 = XNN_INVALID_VALUE_ID; std::array w170_dims = {{1280}}; - status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, - w170_dims.size(), w170_dims.data(), + std::array w170_scale; + { + auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); + std::generate(w170_scale.begin(), w170_scale.end(), std::ref(scalerng)); + } + status = xnn_define_channelwise_quantized_tensor_value( + subgraph, xnn_datatype_qcint32, + /*scale=*/w170_scale.data(), + w170_dims.size(), 0, w170_dims.data(), 
/*data=*/w170_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w170); if (status != xnn_status_success) { @@ -2195,9 +2947,11 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w171_data; uint32_t w171 = XNN_INVALID_VALUE_ID; - std::array w171_dims = {{1008, 1, 1, 1280}}; + std::array w171_dims = {{1001, 1280}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint8, + /*zero_point=*/0, + /*scale=*/0.004167426843196154f, w171_dims.size(), w171_dims.data(), /*data=*/w171_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w171); @@ -2206,11 +2960,13 @@ xnn_subgraph_t QS8MobileNetV2() { return nullptr; } - alignas(16) static std::array w172_data; + alignas(16) static std::array w172_data; uint32_t w172 = XNN_INVALID_VALUE_ID; - std::array w172_dims = {{1008}}; + std::array w172_dims = {{1001}}; status = xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint32, /*zero_point=*/0, /*scale=*/1.0f, + subgraph, xnn_datatype_qint32, + /*zero_point=*/0, + /*scale=*/7.704259769525379e-05f, w172_dims.size(), w172_dims.data(), /*data=*/w172_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w172); @@ -2219,115 +2975,126 @@ xnn_subgraph_t QS8MobileNetV2() { return nullptr; } - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(-1.0f, +1.0f), std::ref(rng)); - std::generate(w67_data.begin(), w67_data.end(), std::ref(f32rng)); - std::generate(w68_data.begin(), w68_data.end(), std::ref(f32rng)); - std::generate(w69_data.begin(), w69_data.end(), std::ref(f32rng)); - std::generate(w70_data.begin(), w70_data.end(), std::ref(f32rng)); - std::generate(w71_data.begin(), w71_data.end(), std::ref(f32rng)); - std::generate(w72_data.begin(), w72_data.end(), std::ref(f32rng)); - std::generate(w73_data.begin(), w73_data.end(), std::ref(f32rng)); - std::generate(w74_data.begin(), w74_data.end(), 
std::ref(f32rng)); - std::generate(w75_data.begin(), w75_data.end(), std::ref(f32rng)); - std::generate(w76_data.begin(), w76_data.end(), std::ref(f32rng)); - std::generate(w77_data.begin(), w77_data.end(), std::ref(f32rng)); - std::generate(w78_data.begin(), w78_data.end(), std::ref(f32rng)); - std::generate(w79_data.begin(), w79_data.end(), std::ref(f32rng)); - std::generate(w80_data.begin(), w80_data.end(), std::ref(f32rng)); - std::generate(w81_data.begin(), w81_data.end(), std::ref(f32rng)); - std::generate(w82_data.begin(), w82_data.end(), std::ref(f32rng)); - std::generate(w83_data.begin(), w83_data.end(), std::ref(f32rng)); - std::generate(w84_data.begin(), w84_data.end(), std::ref(f32rng)); - std::generate(w85_data.begin(), w85_data.end(), std::ref(f32rng)); - std::generate(w86_data.begin(), w86_data.end(), std::ref(f32rng)); - std::generate(w87_data.begin(), w87_data.end(), std::ref(f32rng)); - std::generate(w88_data.begin(), w88_data.end(), std::ref(f32rng)); - std::generate(w89_data.begin(), w89_data.end(), std::ref(f32rng)); - std::generate(w90_data.begin(), w90_data.end(), std::ref(f32rng)); - std::generate(w91_data.begin(), w91_data.end(), std::ref(f32rng)); - std::generate(w92_data.begin(), w92_data.end(), std::ref(f32rng)); - std::generate(w93_data.begin(), w93_data.end(), std::ref(f32rng)); - std::generate(w94_data.begin(), w94_data.end(), std::ref(f32rng)); - std::generate(w95_data.begin(), w95_data.end(), std::ref(f32rng)); - std::generate(w96_data.begin(), w96_data.end(), std::ref(f32rng)); - std::generate(w97_data.begin(), w97_data.end(), std::ref(f32rng)); - std::generate(w98_data.begin(), w98_data.end(), std::ref(f32rng)); - std::generate(w99_data.begin(), w99_data.end(), std::ref(f32rng)); - std::generate(w100_data.begin(), w100_data.end(), std::ref(f32rng)); - std::generate(w101_data.begin(), w101_data.end(), std::ref(f32rng)); - std::generate(w102_data.begin(), w102_data.end(), std::ref(f32rng)); - std::generate(w103_data.begin(), 
w103_data.end(), std::ref(f32rng)); - std::generate(w104_data.begin(), w104_data.end(), std::ref(f32rng)); - std::generate(w105_data.begin(), w105_data.end(), std::ref(f32rng)); - std::generate(w106_data.begin(), w106_data.end(), std::ref(f32rng)); - std::generate(w107_data.begin(), w107_data.end(), std::ref(f32rng)); - std::generate(w108_data.begin(), w108_data.end(), std::ref(f32rng)); - std::generate(w109_data.begin(), w109_data.end(), std::ref(f32rng)); - std::generate(w110_data.begin(), w110_data.end(), std::ref(f32rng)); - std::generate(w111_data.begin(), w111_data.end(), std::ref(f32rng)); - std::generate(w112_data.begin(), w112_data.end(), std::ref(f32rng)); - std::generate(w113_data.begin(), w113_data.end(), std::ref(f32rng)); - std::generate(w114_data.begin(), w114_data.end(), std::ref(f32rng)); - std::generate(w115_data.begin(), w115_data.end(), std::ref(f32rng)); - std::generate(w116_data.begin(), w116_data.end(), std::ref(f32rng)); - std::generate(w117_data.begin(), w117_data.end(), std::ref(f32rng)); - std::generate(w118_data.begin(), w118_data.end(), std::ref(f32rng)); - std::generate(w119_data.begin(), w119_data.end(), std::ref(f32rng)); - std::generate(w120_data.begin(), w120_data.end(), std::ref(f32rng)); - std::generate(w121_data.begin(), w121_data.end(), std::ref(f32rng)); - std::generate(w122_data.begin(), w122_data.end(), std::ref(f32rng)); - std::generate(w123_data.begin(), w123_data.end(), std::ref(f32rng)); - std::generate(w124_data.begin(), w124_data.end(), std::ref(f32rng)); - std::generate(w125_data.begin(), w125_data.end(), std::ref(f32rng)); - std::generate(w126_data.begin(), w126_data.end(), std::ref(f32rng)); - std::generate(w127_data.begin(), w127_data.end(), std::ref(f32rng)); - std::generate(w128_data.begin(), w128_data.end(), std::ref(f32rng)); - std::generate(w129_data.begin(), w129_data.end(), std::ref(f32rng)); - std::generate(w130_data.begin(), w130_data.end(), std::ref(f32rng)); - std::generate(w131_data.begin(), 
w131_data.end(), std::ref(f32rng)); - std::generate(w132_data.begin(), w132_data.end(), std::ref(f32rng)); - std::generate(w133_data.begin(), w133_data.end(), std::ref(f32rng)); - std::generate(w134_data.begin(), w134_data.end(), std::ref(f32rng)); - std::generate(w135_data.begin(), w135_data.end(), std::ref(f32rng)); - std::generate(w136_data.begin(), w136_data.end(), std::ref(f32rng)); - std::generate(w137_data.begin(), w137_data.end(), std::ref(f32rng)); - std::generate(w138_data.begin(), w138_data.end(), std::ref(f32rng)); - std::generate(w139_data.begin(), w139_data.end(), std::ref(f32rng)); - std::generate(w140_data.begin(), w140_data.end(), std::ref(f32rng)); - std::generate(w141_data.begin(), w141_data.end(), std::ref(f32rng)); - std::generate(w142_data.begin(), w142_data.end(), std::ref(f32rng)); - std::generate(w143_data.begin(), w143_data.end(), std::ref(f32rng)); - std::generate(w144_data.begin(), w144_data.end(), std::ref(f32rng)); - std::generate(w145_data.begin(), w145_data.end(), std::ref(f32rng)); - std::generate(w146_data.begin(), w146_data.end(), std::ref(f32rng)); - std::generate(w147_data.begin(), w147_data.end(), std::ref(f32rng)); - std::generate(w148_data.begin(), w148_data.end(), std::ref(f32rng)); - std::generate(w149_data.begin(), w149_data.end(), std::ref(f32rng)); - std::generate(w150_data.begin(), w150_data.end(), std::ref(f32rng)); - std::generate(w151_data.begin(), w151_data.end(), std::ref(f32rng)); - std::generate(w152_data.begin(), w152_data.end(), std::ref(f32rng)); - std::generate(w153_data.begin(), w153_data.end(), std::ref(f32rng)); - std::generate(w154_data.begin(), w154_data.end(), std::ref(f32rng)); - std::generate(w155_data.begin(), w155_data.end(), std::ref(f32rng)); - std::generate(w156_data.begin(), w156_data.end(), std::ref(f32rng)); - std::generate(w157_data.begin(), w157_data.end(), std::ref(f32rng)); - std::generate(w158_data.begin(), w158_data.end(), std::ref(f32rng)); - std::generate(w159_data.begin(), 
w159_data.end(), std::ref(f32rng)); - std::generate(w160_data.begin(), w160_data.end(), std::ref(f32rng)); - std::generate(w161_data.begin(), w161_data.end(), std::ref(f32rng)); - std::generate(w162_data.begin(), w162_data.end(), std::ref(f32rng)); - std::generate(w163_data.begin(), w163_data.end(), std::ref(f32rng)); - std::generate(w164_data.begin(), w164_data.end(), std::ref(f32rng)); - std::generate(w165_data.begin(), w165_data.end(), std::ref(f32rng)); - std::generate(w166_data.begin(), w166_data.end(), std::ref(f32rng)); - std::generate(w167_data.begin(), w167_data.end(), std::ref(f32rng)); - std::generate(w168_data.begin(), w168_data.end(), std::ref(f32rng)); - std::generate(w169_data.begin(), w169_data.end(), std::ref(f32rng)); - std::generate(w170_data.begin(), w170_data.end(), std::ref(f32rng)); - std::generate(w171_data.begin(), w171_data.end(), std::ref(f32rng)); - std::generate(w172_data.begin(), w172_data.end(), std::ref(f32rng)); + auto qs8rng = std::bind(std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()), std::ref(rng)); + auto qc8rng = std::bind(std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()), std::ref(rng)); + auto qs32rng = std::bind(std::uniform_int_distribution(-10000, 10000), std::ref(rng)); + auto qc32rng = std::bind(std::uniform_int_distribution(-10000, 10000), std::ref(rng)); + std::generate(w67_data.begin(), w67_data.end(), std::ref(qc8rng)); + std::generate(w68_data.begin(), w68_data.end(), std::ref(qc32rng)); + std::generate(w69_data.begin(), w69_data.end(), std::ref(qc8rng)); + std::generate(w70_data.begin(), w70_data.end(), std::ref(qc32rng)); + std::generate(w71_data.begin(), w71_data.end(), std::ref(qc8rng)); + std::generate(w72_data.begin(), w72_data.end(), std::ref(qc32rng)); + std::generate(w73_data.begin(), w73_data.end(), std::ref(qc8rng)); + std::generate(w74_data.begin(), w74_data.end(), std::ref(qc32rng)); + std::generate(w75_data.begin(), 
w75_data.end(), std::ref(qc8rng)); + std::generate(w76_data.begin(), w76_data.end(), std::ref(qc32rng)); + std::generate(w77_data.begin(), w77_data.end(), std::ref(qc8rng)); + std::generate(w78_data.begin(), w78_data.end(), std::ref(qc32rng)); + std::generate(w79_data.begin(), w79_data.end(), std::ref(qc8rng)); + std::generate(w80_data.begin(), w80_data.end(), std::ref(qc32rng)); + std::generate(w81_data.begin(), w81_data.end(), std::ref(qc8rng)); + std::generate(w82_data.begin(), w82_data.end(), std::ref(qc32rng)); + std::generate(w83_data.begin(), w83_data.end(), std::ref(qc8rng)); + std::generate(w84_data.begin(), w84_data.end(), std::ref(qc32rng)); + std::generate(w85_data.begin(), w85_data.end(), std::ref(qc8rng)); + std::generate(w86_data.begin(), w86_data.end(), std::ref(qc32rng)); + std::generate(w87_data.begin(), w87_data.end(), std::ref(qc8rng)); + std::generate(w88_data.begin(), w88_data.end(), std::ref(qc32rng)); + std::generate(w89_data.begin(), w89_data.end(), std::ref(qc8rng)); + std::generate(w90_data.begin(), w90_data.end(), std::ref(qc32rng)); + std::generate(w91_data.begin(), w91_data.end(), std::ref(qc8rng)); + std::generate(w92_data.begin(), w92_data.end(), std::ref(qc32rng)); + std::generate(w93_data.begin(), w93_data.end(), std::ref(qc8rng)); + std::generate(w94_data.begin(), w94_data.end(), std::ref(qc32rng)); + std::generate(w95_data.begin(), w95_data.end(), std::ref(qc8rng)); + std::generate(w96_data.begin(), w96_data.end(), std::ref(qc32rng)); + std::generate(w97_data.begin(), w97_data.end(), std::ref(qc8rng)); + std::generate(w98_data.begin(), w98_data.end(), std::ref(qc32rng)); + std::generate(w99_data.begin(), w99_data.end(), std::ref(qc8rng)); + std::generate(w100_data.begin(), w100_data.end(), std::ref(qc32rng)); + std::generate(w101_data.begin(), w101_data.end(), std::ref(qc8rng)); + std::generate(w102_data.begin(), w102_data.end(), std::ref(qc32rng)); + std::generate(w103_data.begin(), w103_data.end(), std::ref(qc8rng)); + 
std::generate(w104_data.begin(), w104_data.end(), std::ref(qc32rng)); + std::generate(w105_data.begin(), w105_data.end(), std::ref(qc8rng)); + std::generate(w106_data.begin(), w106_data.end(), std::ref(qc32rng)); + std::generate(w107_data.begin(), w107_data.end(), std::ref(qc8rng)); + std::generate(w108_data.begin(), w108_data.end(), std::ref(qc32rng)); + std::generate(w109_data.begin(), w109_data.end(), std::ref(qc8rng)); + std::generate(w110_data.begin(), w110_data.end(), std::ref(qc32rng)); + std::generate(w111_data.begin(), w111_data.end(), std::ref(qc8rng)); + std::generate(w112_data.begin(), w112_data.end(), std::ref(qc32rng)); + std::generate(w113_data.begin(), w113_data.end(), std::ref(qc8rng)); + std::generate(w114_data.begin(), w114_data.end(), std::ref(qc32rng)); + std::generate(w115_data.begin(), w115_data.end(), std::ref(qc8rng)); + std::generate(w116_data.begin(), w116_data.end(), std::ref(qc32rng)); + std::generate(w117_data.begin(), w117_data.end(), std::ref(qc8rng)); + std::generate(w118_data.begin(), w118_data.end(), std::ref(qc32rng)); + std::generate(w119_data.begin(), w119_data.end(), std::ref(qc8rng)); + std::generate(w120_data.begin(), w120_data.end(), std::ref(qc32rng)); + std::generate(w121_data.begin(), w121_data.end(), std::ref(qc8rng)); + std::generate(w122_data.begin(), w122_data.end(), std::ref(qc32rng)); + std::generate(w123_data.begin(), w123_data.end(), std::ref(qc8rng)); + std::generate(w124_data.begin(), w124_data.end(), std::ref(qc32rng)); + std::generate(w125_data.begin(), w125_data.end(), std::ref(qc8rng)); + std::generate(w126_data.begin(), w126_data.end(), std::ref(qc32rng)); + std::generate(w127_data.begin(), w127_data.end(), std::ref(qc8rng)); + std::generate(w128_data.begin(), w128_data.end(), std::ref(qc32rng)); + std::generate(w129_data.begin(), w129_data.end(), std::ref(qc8rng)); + std::generate(w130_data.begin(), w130_data.end(), std::ref(qc32rng)); + std::generate(w131_data.begin(), w131_data.end(), std::ref(qc8rng)); 
+ std::generate(w132_data.begin(), w132_data.end(), std::ref(qc32rng)); + std::generate(w133_data.begin(), w133_data.end(), std::ref(qc8rng)); + std::generate(w134_data.begin(), w134_data.end(), std::ref(qc32rng)); + std::generate(w135_data.begin(), w135_data.end(), std::ref(qc8rng)); + std::generate(w136_data.begin(), w136_data.end(), std::ref(qc32rng)); + std::generate(w137_data.begin(), w137_data.end(), std::ref(qc8rng)); + std::generate(w138_data.begin(), w138_data.end(), std::ref(qc32rng)); + std::generate(w139_data.begin(), w139_data.end(), std::ref(qc8rng)); + std::generate(w140_data.begin(), w140_data.end(), std::ref(qc32rng)); + std::generate(w141_data.begin(), w141_data.end(), std::ref(qc8rng)); + std::generate(w142_data.begin(), w142_data.end(), std::ref(qc32rng)); + std::generate(w143_data.begin(), w143_data.end(), std::ref(qc8rng)); + std::generate(w144_data.begin(), w144_data.end(), std::ref(qc32rng)); + std::generate(w145_data.begin(), w145_data.end(), std::ref(qc8rng)); + std::generate(w146_data.begin(), w146_data.end(), std::ref(qc32rng)); + std::generate(w147_data.begin(), w147_data.end(), std::ref(qc8rng)); + std::generate(w148_data.begin(), w148_data.end(), std::ref(qc32rng)); + std::generate(w149_data.begin(), w149_data.end(), std::ref(qc8rng)); + std::generate(w150_data.begin(), w150_data.end(), std::ref(qc32rng)); + std::generate(w151_data.begin(), w151_data.end(), std::ref(qc8rng)); + std::generate(w152_data.begin(), w152_data.end(), std::ref(qc32rng)); + std::generate(w153_data.begin(), w153_data.end(), std::ref(qc8rng)); + std::generate(w154_data.begin(), w154_data.end(), std::ref(qc32rng)); + std::generate(w155_data.begin(), w155_data.end(), std::ref(qc8rng)); + std::generate(w156_data.begin(), w156_data.end(), std::ref(qc32rng)); + std::generate(w157_data.begin(), w157_data.end(), std::ref(qc8rng)); + std::generate(w158_data.begin(), w158_data.end(), std::ref(qc32rng)); + std::generate(w159_data.begin(), w159_data.end(), 
std::ref(qc8rng)); + std::generate(w160_data.begin(), w160_data.end(), std::ref(qc32rng)); + std::generate(w161_data.begin(), w161_data.end(), std::ref(qc8rng)); + std::generate(w162_data.begin(), w162_data.end(), std::ref(qc32rng)); + std::generate(w163_data.begin(), w163_data.end(), std::ref(qc8rng)); + std::generate(w164_data.begin(), w164_data.end(), std::ref(qc32rng)); + std::generate(w165_data.begin(), w165_data.end(), std::ref(qc8rng)); + std::generate(w166_data.begin(), w166_data.end(), std::ref(qc32rng)); + std::generate(w167_data.begin(), w167_data.end(), std::ref(qc8rng)); + std::generate(w168_data.begin(), w168_data.end(), std::ref(qc32rng)); + std::generate(w169_data.begin(), w169_data.end(), std::ref(qc8rng)); + std::generate(w170_data.begin(), w170_data.end(), std::ref(qc32rng)); + std::generate(w171_data.begin(), w171_data.end(), std::ref(qs8rng)); + std::generate(w172_data.begin(), w172_data.end(), std::ref(qs32rng)); + + status = xnn_define_convert( + subgraph, + v0, + v1, + 0); + if (status != xnn_status_success) { + std::cerr << "failed to create node #0" << std::endl; + return nullptr; + } status = xnn_define_convolution_2d( subgraph, @@ -2339,13 +3106,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/3, /*group_output_channels=*/32, /*output_min=*/0.0f, /*output_max=*/6.0f, - v0, + v1, w67, w68, - v1, + v2, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #0" << std::endl; + std::cerr << "failed to create node #1" << std::endl; return nullptr; } @@ -2358,13 +3125,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*depth_multiplier=*/1, /*input_channels=*/32, /*output_min=*/0.0f, /*output_max=*/6.0f, - v1, + v2, w69, w70, - v2, + v3, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #1" << std::endl; + std::cerr << "failed to create node #2" << std::endl; return nullptr; } @@ -2378,13 +3145,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/32, 
/*group_output_channels=*/16, /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - v2, + v3, w71, w72, - v3, + v4, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #2" << std::endl; + std::cerr << "failed to create node #3" << std::endl; return nullptr; } @@ -2398,13 +3165,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/16, /*group_output_channels=*/96, /*output_min=*/0.0f, /*output_max=*/6.0f, - v3, + v4, w73, w74, - v4, + v5, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #3" << std::endl; + std::cerr << "failed to create node #4" << std::endl; return nullptr; } @@ -2417,13 +3184,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*depth_multiplier=*/1, /*input_channels=*/96, /*output_min=*/0.0f, /*output_max=*/6.0f, - v4, + v5, w75, w76, - v5, + v6, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #4" << std::endl; + std::cerr << "failed to create node #5" << std::endl; return nullptr; } @@ -2437,13 +3204,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/96, /*group_output_channels=*/24, /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - v5, + v6, w77, w78, - v6, + v7, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #5" << std::endl; + std::cerr << "failed to create node #6" << std::endl; return nullptr; } @@ -2457,13 +3224,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/24, /*group_output_channels=*/144, /*output_min=*/0.0f, /*output_max=*/6.0f, - v6, + v7, w79, w80, - v7, + v8, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #6" << std::endl; + std::cerr << "failed to create node #7" << std::endl; return nullptr; } @@ -2476,13 +3243,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*depth_multiplier=*/1, /*input_channels=*/144, 
/*output_min=*/0.0f, /*output_max=*/6.0f, - v7, + v8, w81, w82, - v8, + v9, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #7" << std::endl; + std::cerr << "failed to create node #8" << std::endl; return nullptr; } @@ -2496,26 +3263,26 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/144, /*group_output_channels=*/24, /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - v8, + v9, w83, w84, - v9, + v10, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #8" << std::endl; + std::cerr << "failed to create node #9" << std::endl; return nullptr; } status = xnn_define_add2( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), - v9, - v6, + /*output_min=*/std::numeric_limits::min(), + /*output_max=*/std::numeric_limits::max(), v10, + v7, + v11, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #9" << std::endl; + std::cerr << "failed to create node #10" << std::endl; return nullptr; } @@ -2529,13 +3296,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/24, /*group_output_channels=*/144, /*output_min=*/0.0f, /*output_max=*/6.0f, - v10, + v11, w85, w86, - v11, + v12, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #10" << std::endl; + std::cerr << "failed to create node #11" << std::endl; return nullptr; } @@ -2548,13 +3315,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*depth_multiplier=*/1, /*input_channels=*/144, /*output_min=*/0.0f, /*output_max=*/6.0f, - v11, + v12, w87, w88, - v12, + v13, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #11" << std::endl; + std::cerr << "failed to create node #12" << std::endl; return nullptr; } @@ -2568,13 +3335,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/144, /*group_output_channels=*/32, 
/*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - v12, + v13, w89, w90, - v13, + v14, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #12" << std::endl; + std::cerr << "failed to create node #13" << std::endl; return nullptr; } @@ -2588,13 +3355,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/32, /*group_output_channels=*/192, /*output_min=*/0.0f, /*output_max=*/6.0f, - v13, + v14, w91, w92, - v14, + v15, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #13" << std::endl; + std::cerr << "failed to create node #14" << std::endl; return nullptr; } @@ -2607,13 +3374,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*depth_multiplier=*/1, /*input_channels=*/192, /*output_min=*/0.0f, /*output_max=*/6.0f, - v14, + v15, w93, w94, - v15, + v16, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #14" << std::endl; + std::cerr << "failed to create node #15" << std::endl; return nullptr; } @@ -2627,26 +3394,26 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/192, /*group_output_channels=*/32, /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - v15, + v16, w95, w96, - v16, + v17, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #15" << std::endl; + std::cerr << "failed to create node #16" << std::endl; return nullptr; } status = xnn_define_add2( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), - v16, - v13, + /*output_min=*/std::numeric_limits::min(), + /*output_max=*/std::numeric_limits::max(), v17, + v14, + v18, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #16" << std::endl; + std::cerr << "failed to create node #17" << std::endl; return nullptr; } @@ -2660,13 +3427,13 @@ xnn_subgraph_t QS8MobileNetV2() { 
/*group_input_channels=*/32, /*group_output_channels=*/192, /*output_min=*/0.0f, /*output_max=*/6.0f, - v17, + v18, w97, w98, - v18, + v19, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #17" << std::endl; + std::cerr << "failed to create node #18" << std::endl; return nullptr; } @@ -2679,13 +3446,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*depth_multiplier=*/1, /*input_channels=*/192, /*output_min=*/0.0f, /*output_max=*/6.0f, - v18, + v19, w99, w100, - v19, + v20, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #18" << std::endl; + std::cerr << "failed to create node #19" << std::endl; return nullptr; } @@ -2699,26 +3466,26 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/192, /*group_output_channels=*/32, /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - v19, + v20, w101, w102, - v20, + v21, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #19" << std::endl; + std::cerr << "failed to create node #20" << std::endl; return nullptr; } status = xnn_define_add2( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), - v20, - v17, + /*output_min=*/std::numeric_limits::min(), + /*output_max=*/std::numeric_limits::max(), v21, + v18, + v22, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #20" << std::endl; + std::cerr << "failed to create node #21" << std::endl; return nullptr; } @@ -2732,13 +3499,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/32, /*group_output_channels=*/192, /*output_min=*/0.0f, /*output_max=*/6.0f, - v21, + v22, w103, w104, - v22, + v23, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #21" << std::endl; + std::cerr << "failed to create node #22" << std::endl; return nullptr; } @@ -2751,13 +3518,13 @@ xnn_subgraph_t 
QS8MobileNetV2() { /*depth_multiplier=*/1, /*input_channels=*/192, /*output_min=*/0.0f, /*output_max=*/6.0f, - v22, + v23, w105, w106, - v23, + v24, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #22" << std::endl; + std::cerr << "failed to create node #23" << std::endl; return nullptr; } @@ -2771,13 +3538,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/192, /*group_output_channels=*/64, /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - v23, + v24, w107, w108, - v24, + v25, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #23" << std::endl; + std::cerr << "failed to create node #24" << std::endl; return nullptr; } @@ -2791,13 +3558,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/64, /*group_output_channels=*/384, /*output_min=*/0.0f, /*output_max=*/6.0f, - v24, + v25, w109, w110, - v25, + v26, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #24" << std::endl; + std::cerr << "failed to create node #25" << std::endl; return nullptr; } @@ -2810,13 +3577,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*depth_multiplier=*/1, /*input_channels=*/384, /*output_min=*/0.0f, /*output_max=*/6.0f, - v25, + v26, w111, w112, - v26, + v27, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #25" << std::endl; + std::cerr << "failed to create node #26" << std::endl; return nullptr; } @@ -2830,26 +3597,26 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/384, /*group_output_channels=*/64, /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - v26, + v27, w113, w114, - v27, + v28, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #26" << std::endl; + std::cerr << "failed to create node #27" << std::endl; return nullptr; } status = xnn_define_add2( subgraph, - 
/*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), - v27, - v24, + /*output_min=*/std::numeric_limits::min(), + /*output_max=*/std::numeric_limits::max(), v28, + v25, + v29, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #27" << std::endl; + std::cerr << "failed to create node #28" << std::endl; return nullptr; } @@ -2863,13 +3630,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/64, /*group_output_channels=*/384, /*output_min=*/0.0f, /*output_max=*/6.0f, - v28, + v29, w115, w116, - v29, + v30, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #28" << std::endl; + std::cerr << "failed to create node #29" << std::endl; return nullptr; } @@ -2882,13 +3649,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*depth_multiplier=*/1, /*input_channels=*/384, /*output_min=*/0.0f, /*output_max=*/6.0f, - v29, + v30, w117, w118, - v30, + v31, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #29" << std::endl; + std::cerr << "failed to create node #30" << std::endl; return nullptr; } @@ -2902,26 +3669,26 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/384, /*group_output_channels=*/64, /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - v30, + v31, w119, w120, - v31, + v32, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #30" << std::endl; + std::cerr << "failed to create node #31" << std::endl; return nullptr; } status = xnn_define_add2( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), - v31, - v28, + /*output_min=*/std::numeric_limits::min(), + /*output_max=*/std::numeric_limits::max(), v32, + v29, + v33, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #31" << std::endl; + std::cerr << "failed to create node 
#32" << std::endl; return nullptr; } @@ -2935,13 +3702,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/64, /*group_output_channels=*/384, /*output_min=*/0.0f, /*output_max=*/6.0f, - v32, + v33, w121, w122, - v33, + v34, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #32" << std::endl; + std::cerr << "failed to create node #33" << std::endl; return nullptr; } @@ -2954,13 +3721,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*depth_multiplier=*/1, /*input_channels=*/384, /*output_min=*/0.0f, /*output_max=*/6.0f, - v33, + v34, w123, w124, - v34, + v35, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #33" << std::endl; + std::cerr << "failed to create node #34" << std::endl; return nullptr; } @@ -2974,26 +3741,26 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/384, /*group_output_channels=*/64, /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - v34, + v35, w125, w126, - v35, + v36, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #34" << std::endl; + std::cerr << "failed to create node #35" << std::endl; return nullptr; } status = xnn_define_add2( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), - v35, - v32, + /*output_min=*/std::numeric_limits::min(), + /*output_max=*/std::numeric_limits::max(), v36, + v33, + v37, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #35" << std::endl; + std::cerr << "failed to create node #36" << std::endl; return nullptr; } @@ -3007,13 +3774,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/64, /*group_output_channels=*/384, /*output_min=*/0.0f, /*output_max=*/6.0f, - v36, + v37, w127, w128, - v37, + v38, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #36" << std::endl; + std::cerr << "failed 
to create node #37" << std::endl; return nullptr; } @@ -3026,13 +3793,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*depth_multiplier=*/1, /*input_channels=*/384, /*output_min=*/0.0f, /*output_max=*/6.0f, - v37, + v38, w129, w130, - v38, + v39, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #37" << std::endl; + std::cerr << "failed to create node #38" << std::endl; return nullptr; } @@ -3046,13 +3813,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/384, /*group_output_channels=*/96, /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - v38, + v39, w131, w132, - v39, + v40, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #38" << std::endl; + std::cerr << "failed to create node #39" << std::endl; return nullptr; } @@ -3066,13 +3833,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/96, /*group_output_channels=*/576, /*output_min=*/0.0f, /*output_max=*/6.0f, - v39, + v40, w133, w134, - v40, + v41, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #39" << std::endl; + std::cerr << "failed to create node #40" << std::endl; return nullptr; } @@ -3085,13 +3852,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*depth_multiplier=*/1, /*input_channels=*/576, /*output_min=*/0.0f, /*output_max=*/6.0f, - v40, + v41, w135, w136, - v41, + v42, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #40" << std::endl; + std::cerr << "failed to create node #41" << std::endl; return nullptr; } @@ -3105,26 +3872,26 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/576, /*group_output_channels=*/96, /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - v41, + v42, w137, w138, - v42, + v43, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #41" << std::endl; + std::cerr << "failed to 
create node #42" << std::endl; return nullptr; } status = xnn_define_add2( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), - v42, - v39, + /*output_min=*/std::numeric_limits::min(), + /*output_max=*/std::numeric_limits::max(), v43, + v40, + v44, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #42" << std::endl; + std::cerr << "failed to create node #43" << std::endl; return nullptr; } @@ -3138,13 +3905,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/96, /*group_output_channels=*/576, /*output_min=*/0.0f, /*output_max=*/6.0f, - v43, + v44, w139, w140, - v44, + v45, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #43" << std::endl; + std::cerr << "failed to create node #44" << std::endl; return nullptr; } @@ -3157,13 +3924,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*depth_multiplier=*/1, /*input_channels=*/576, /*output_min=*/0.0f, /*output_max=*/6.0f, - v44, + v45, w141, w142, - v45, + v46, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #44" << std::endl; + std::cerr << "failed to create node #45" << std::endl; return nullptr; } @@ -3177,26 +3944,26 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/576, /*group_output_channels=*/96, /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - v45, + v46, w143, w144, - v46, + v47, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #45" << std::endl; + std::cerr << "failed to create node #46" << std::endl; return nullptr; } status = xnn_define_add2( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), - v46, - v43, + /*output_min=*/std::numeric_limits::min(), + /*output_max=*/std::numeric_limits::max(), v47, + v44, + v48, /*flags=*/0); if (status != xnn_status_success) { - 
std::cerr << "failed to create node #46" << std::endl; + std::cerr << "failed to create node #47" << std::endl; return nullptr; } @@ -3210,13 +3977,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/96, /*group_output_channels=*/576, /*output_min=*/0.0f, /*output_max=*/6.0f, - v47, + v48, w145, w146, - v48, + v49, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #47" << std::endl; + std::cerr << "failed to create node #48" << std::endl; return nullptr; } @@ -3229,13 +3996,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*depth_multiplier=*/1, /*input_channels=*/576, /*output_min=*/0.0f, /*output_max=*/6.0f, - v48, + v49, w147, w148, - v49, + v50, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #48" << std::endl; + std::cerr << "failed to create node #49" << std::endl; return nullptr; } @@ -3249,13 +4016,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/576, /*group_output_channels=*/160, /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - v49, + v50, w149, w150, - v50, + v51, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #49" << std::endl; + std::cerr << "failed to create node #50" << std::endl; return nullptr; } @@ -3269,13 +4036,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/160, /*group_output_channels=*/960, /*output_min=*/0.0f, /*output_max=*/6.0f, - v50, + v51, w151, w152, - v51, + v52, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #50" << std::endl; + std::cerr << "failed to create node #51" << std::endl; return nullptr; } @@ -3288,13 +4055,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*depth_multiplier=*/1, /*input_channels=*/960, /*output_min=*/0.0f, /*output_max=*/6.0f, - v51, + v52, w153, w154, - v52, + v53, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #51" << std::endl; + 
std::cerr << "failed to create node #52" << std::endl; return nullptr; } @@ -3308,26 +4075,26 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/960, /*group_output_channels=*/160, /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - v52, + v53, w155, w156, - v53, + v54, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #52" << std::endl; + std::cerr << "failed to create node #53" << std::endl; return nullptr; } status = xnn_define_add2( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), - v53, - v50, + /*output_min=*/std::numeric_limits::min(), + /*output_max=*/std::numeric_limits::max(), v54, + v51, + v55, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #53" << std::endl; + std::cerr << "failed to create node #54" << std::endl; return nullptr; } @@ -3341,13 +4108,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/160, /*group_output_channels=*/960, /*output_min=*/0.0f, /*output_max=*/6.0f, - v54, + v55, w157, w158, - v55, + v56, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #54" << std::endl; + std::cerr << "failed to create node #55" << std::endl; return nullptr; } @@ -3360,13 +4127,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*depth_multiplier=*/1, /*input_channels=*/960, /*output_min=*/0.0f, /*output_max=*/6.0f, - v55, + v56, w159, w160, - v56, + v57, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #55" << std::endl; + std::cerr << "failed to create node #56" << std::endl; return nullptr; } @@ -3380,26 +4147,26 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/960, /*group_output_channels=*/160, /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - v56, + v57, w161, w162, - v57, + v58, /*flags=*/0); if (status != 
xnn_status_success) { - std::cerr << "failed to create node #56" << std::endl; + std::cerr << "failed to create node #57" << std::endl; return nullptr; } status = xnn_define_add2( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), - v57, - v54, + /*output_min=*/std::numeric_limits::min(), + /*output_max=*/std::numeric_limits::max(), v58, + v55, + v59, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #57" << std::endl; + std::cerr << "failed to create node #58" << std::endl; return nullptr; } @@ -3413,13 +4180,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/160, /*group_output_channels=*/960, /*output_min=*/0.0f, /*output_max=*/6.0f, - v58, + v59, w163, w164, - v59, + v60, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #58" << std::endl; + std::cerr << "failed to create node #59" << std::endl; return nullptr; } @@ -3432,13 +4199,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*depth_multiplier=*/1, /*input_channels=*/960, /*output_min=*/0.0f, /*output_max=*/6.0f, - v59, + v60, w165, w166, - v60, + v61, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #59" << std::endl; + std::cerr << "failed to create node #60" << std::endl; return nullptr; } @@ -3452,13 +4219,13 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/960, /*group_output_channels=*/320, /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - v60, + v61, w167, w168, - v61, + v62, /*flags=*/0); if (status != xnn_status_success) { - std::cerr << "failed to create node #60" << std::endl; + std::cerr << "failed to create node #61" << std::endl; return nullptr; } @@ -3472,20 +4239,9 @@ xnn_subgraph_t QS8MobileNetV2() { /*group_input_channels=*/320, /*group_output_channels=*/1280, /*output_min=*/0.0f, /*output_max=*/6.0f, - v61, + v62, w169, w170, - v62, - /*flags=*/0); - 
if (status != xnn_status_success) { - std::cerr << "failed to create node #61" << std::endl; - return nullptr; - } - - status = xnn_define_global_average_pooling_2d( - subgraph, - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), - v62, v63, /*flags=*/0); if (status != xnn_status_success) { @@ -3493,21 +4249,11 @@ xnn_subgraph_t QS8MobileNetV2() { return nullptr; } -#if 0 - // These last few ops cause a weird crash. - status = xnn_define_convolution_2d( + status = xnn_define_global_average_pooling_2d( subgraph, - /*padding_top=*/0, /*padding_right=*/0, /*padding_bottom=*/0, /*padding_left=*/0, - /*kernel_height=*/1, /*kernel_width=*/1, - /*subsampling_height=*/1, /*subsampling_width=*/1, - /*dilation_height=*/1, /*dilation_width=*/1, - /*groups=*/1, - /*group_input_channels=*/1280, - /*group_output_channels=*/1008, - /*output_min=*/-std::numeric_limits::infinity(), /*output_max=*/std::numeric_limits::infinity(), + /*output_min=*/std::numeric_limits::min(), + /*output_max=*/std::numeric_limits::max(), v63, - w171, - w172, v64, /*flags=*/0); if (status != xnn_status_success) { @@ -3515,27 +4261,29 @@ xnn_subgraph_t QS8MobileNetV2() { return nullptr; } - status = xnn_define_copy( + status = xnn_define_fully_connected( subgraph, + /*output_min=*/std::numeric_limits::min(), /*output_max=*/std::numeric_limits::max(), v64, + w171, + w172, v65, - 0); + /*flags=*/0); if (status != xnn_status_success) { std::cerr << "failed to create node #64" << std::endl; return nullptr; } - // Supposed to be softmax, qint8 not supported - status = xnn_define_copy( + status = xnn_define_convert( subgraph, v65, v66, - /*flags=*/0); + 0); if (status != xnn_status_success) { std::cerr << "failed to create node #65" << std::endl; return nullptr; } -#endif + return subgraph; } From 90aef2e3b656f0eb266a858f2a7e3dd6921878ee Mon Sep 17 00:00:00 2001 From: XNNPACK Team Date: Tue, 24 Sep 2024 19:36:13 -0700 Subject: [PATCH 50/50] Don't use API functions 
that are XNN_DEPRECATED: - It's bad programming practice - It emits fountains of warnings for CMake builds PiperOrigin-RevId: 678487141 --- bench/models/fp32-mobilenet-v1.cc | 4 +- bench/models/fp32-mobilenet-v2.cc | 70 +++++---- bench/models/fp32-mobilenet-v3-large.cc | 182 ++++++++++++++---------- bench/models/fp32-mobilenet-v3-small.cc | 168 ++++++++++++---------- bench/models/qs8-mobilenet-v2.cc | 74 +++++----- test/reshape-helpers.cc | 6 +- test/subgraph-tester.h | 20 +-- test/workspace.cc | 7 +- 8 files changed, 303 insertions(+), 228 deletions(-) diff --git a/bench/models/fp32-mobilenet-v1.cc b/bench/models/fp32-mobilenet-v1.cc index 1a4cbf42a71..49c3d59a73a 100644 --- a/bench/models/fp32-mobilenet-v1.cc +++ b/bench/models/fp32-mobilenet-v1.cc @@ -41,7 +41,7 @@ xnn_subgraph_t FP32MobileNetV1() { subgraph, xnn_datatype_fp32, v0_dims.size(), v0_dims.data(), /*data=*/nullptr, - 1, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0); + 0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0); if (status != xnn_status_success) { std::cerr << "failed to create tensor v0" << std::endl; return nullptr; @@ -389,7 +389,7 @@ xnn_subgraph_t FP32MobileNetV1() { subgraph, xnn_datatype_fp32, v29_dims.size(), v29_dims.data(), /*data=*/nullptr, - 0, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v29); + 1, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v29); if (status != xnn_status_success) { std::cerr << "failed to create tensor v29" << std::endl; return nullptr; diff --git a/bench/models/fp32-mobilenet-v2.cc b/bench/models/fp32-mobilenet-v2.cc index 44f23290e55..fdf58f842c0 100644 --- a/bench/models/fp32-mobilenet-v2.cc +++ b/bench/models/fp32-mobilenet-v2.cc @@ -2502,10 +2502,11 @@ xnn_subgraph_t FP32MobileNetV2() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v10_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + 
&v10_params, v9, v6, v10, @@ -2633,10 +2634,11 @@ xnn_subgraph_t FP32MobileNetV2() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v17_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v17_params, v16, v13, v17, @@ -2705,10 +2707,11 @@ xnn_subgraph_t FP32MobileNetV2() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v21_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v21_params, v20, v17, v21, @@ -2836,10 +2839,11 @@ xnn_subgraph_t FP32MobileNetV2() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v28_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v28_params, v27, v24, v28, @@ -2908,10 +2912,11 @@ xnn_subgraph_t FP32MobileNetV2() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v32_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v32_params, v31, v28, v32, @@ -2980,10 +2985,11 @@ xnn_subgraph_t FP32MobileNetV2() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v36_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + 
&v36_params, v35, v32, v36, @@ -3111,10 +3117,11 @@ xnn_subgraph_t FP32MobileNetV2() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v43_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v43_params, v42, v39, v43, @@ -3183,10 +3190,11 @@ xnn_subgraph_t FP32MobileNetV2() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v47_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v47_params, v46, v43, v47, @@ -3314,10 +3322,11 @@ xnn_subgraph_t FP32MobileNetV2() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v54_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v54_params, v53, v50, v54, @@ -3386,10 +3395,11 @@ xnn_subgraph_t FP32MobileNetV2() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v58_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v58_params, v57, v54, v58, diff --git a/bench/models/fp32-mobilenet-v3-large.cc b/bench/models/fp32-mobilenet-v3-large.cc index f5e014323d2..1dff3f8a40d 100644 --- a/bench/models/fp32-mobilenet-v3-large.cc +++ b/bench/models/fp32-mobilenet-v3-large.cc @@ -3498,10 +3498,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v5_params = { 
-std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v5_params, v4, v2, v5, @@ -3629,10 +3630,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v12_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v12_params, v11, v8, v12, @@ -3735,10 +3737,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v18_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v18_params, v17, w150, v18, @@ -3748,10 +3751,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v19_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v19_params, v14, v18, v19, @@ -3874,10 +3878,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v26_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v26_params, v25, w161, v26, @@ -3887,10 +3892,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = 
xnn_define_multiply2( + xnn_binary_params v27_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v27_params, v22, v26, v27, @@ -3920,10 +3926,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v29_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v29_params, v28, v20, v29, @@ -4026,10 +4033,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v35_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v35_params, v34, w172, v35, @@ -4039,10 +4047,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v36_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v36_params, v31, v35, v36, @@ -4072,10 +4081,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v38_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v38_params, v37, v29, v38, @@ -4243,10 +4253,11 @@ xnn_subgraph_t 
FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v49_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v49_params, v48, v43, v49, @@ -4335,10 +4346,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v55_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v55_params, v54, v49, v55, @@ -4427,10 +4439,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v61_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v61_params, v60, v55, v61, @@ -4553,10 +4566,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v69_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v69_params, v68, w207, v69, @@ -4566,10 +4580,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v70_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v70_params, v65, v69, v70, @@ 
-4712,10 +4727,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v79_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v79_params, v78, w218, v79, @@ -4725,10 +4741,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v80_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v80_params, v75, v79, v80, @@ -4758,10 +4775,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v82_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v82_params, v81, v71, v82, @@ -4884,10 +4902,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v90_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v90_params, v89, w229, v90, @@ -4897,10 +4916,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v91_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - 
/*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v91_params, v86, v90, v91, @@ -5043,10 +5063,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v100_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v100_params, v99, w240, v100, @@ -5056,10 +5077,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v101_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v101_params, v96, v100, v101, @@ -5089,10 +5111,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v103_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v103_params, v102, v92, v103, @@ -5215,10 +5238,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v111_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v111_params, v110, w251, v111, @@ -5228,10 +5252,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v112_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = 
xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v112_params, v107, v111, v112, @@ -5261,10 +5286,11 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v114_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v114_params, v113, v103, v114, diff --git a/bench/models/fp32-mobilenet-v3-small.cc b/bench/models/fp32-mobilenet-v3-small.cc index 804e36fd64a..e5d2d9a11cc 100644 --- a/bench/models/fp32-mobilenet-v3-small.cc +++ b/bench/models/fp32-mobilenet-v3-small.cc @@ -3110,10 +3110,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v7_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v7_params, v6, w119, v7, @@ -3123,10 +3124,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v8_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v8_params, v3, v7, v8, @@ -3274,10 +3276,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v16_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + 
xnn_binary_add, + &v16_params, v15, v12, v16, @@ -3400,10 +3403,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v24_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v24_params, v23, w142, v24, @@ -3413,10 +3417,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v25_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v25_params, v20, v24, v25, @@ -3559,10 +3564,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v34_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v34_params, v33, w153, v34, @@ -3572,10 +3578,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v35_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v35_params, v30, v34, v35, @@ -3605,10 +3612,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v37_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - 
/*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v37_params, v36, v26, v37, @@ -3731,10 +3739,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v45_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v45_params, v44, w164, v45, @@ -3744,10 +3753,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v46_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v46_params, v41, v45, v46, @@ -3777,10 +3787,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v48_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v48_params, v47, v37, v48, @@ -3903,10 +3914,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v56_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v56_params, v55, w175, v56, @@ -3916,10 +3928,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v57_params = { -std::numeric_limits::infinity(), 
std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v57_params, v52, v56, v57, @@ -4062,10 +4075,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v66_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v66_params, v65, w186, v66, @@ -4075,10 +4089,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v67_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v67_params, v62, v66, v67, @@ -4108,10 +4123,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v69_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v69_params, v68, v58, v69, @@ -4234,10 +4250,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v77_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v77_params, v76, w197, v77, @@ -4247,10 +4264,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_multiply2( + 
xnn_binary_params v78_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v78_params, v73, v77, v78, @@ -4393,10 +4411,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v87_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v87_params, v86, w208, v87, @@ -4406,10 +4425,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v88_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v88_params, v83, v87, v88, @@ -4439,10 +4459,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v90_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v90_params, v89, v79, v90, @@ -4565,10 +4586,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v98_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v98_params, v97, w219, v98, @@ -4578,10 +4600,11 @@ xnn_subgraph_t 
FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_multiply2( + xnn_binary_params v99_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_multiply, + &v99_params, v94, v98, v99, @@ -4611,10 +4634,11 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v101_params = { -std::numeric_limits::infinity(), std::numeric_limits::infinity() }; + status = xnn_define_binary( subgraph, - /*output_min=*/-std::numeric_limits::infinity(), - /*output_max=*/std::numeric_limits::infinity(), + xnn_binary_add, + &v101_params, v100, v90, v101, diff --git a/bench/models/qs8-mobilenet-v2.cc b/bench/models/qs8-mobilenet-v2.cc index d07d1309b96..ad7cff160fb 100644 --- a/bench/models/qs8-mobilenet-v2.cc +++ b/bench/models/qs8-mobilenet-v2.cc @@ -41,7 +41,7 @@ xnn_subgraph_t QS8MobileNetV2() { subgraph, xnn_datatype_fp32, v0_dims.size(), v0_dims.data(), /*data=*/nullptr, - 0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0); + 1, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0); if (status != xnn_status_success) { std::cerr << "failed to create tensor v0" << std::endl; return nullptr; @@ -963,7 +963,7 @@ xnn_subgraph_t QS8MobileNetV2() { subgraph, xnn_datatype_fp32, v66_dims.size(), v66_dims.data(), /*data=*/nullptr, - 1, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v66); + 0, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v66); if (status != xnn_status_success) { std::cerr << "failed to create tensor v66" << std::endl; return nullptr; @@ -3273,10 +3273,11 @@ xnn_subgraph_t QS8MobileNetV2() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v11_params = { std::numeric_limits::min(), std::numeric_limits::max() }; + status = xnn_define_binary( subgraph, - /*output_min=*/std::numeric_limits::min(), - /*output_max=*/std::numeric_limits::max(), + xnn_binary_add, + &v11_params, v10, 
v7, v11, @@ -3404,10 +3405,11 @@ xnn_subgraph_t QS8MobileNetV2() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v18_params = { std::numeric_limits::min(), std::numeric_limits::max() }; + status = xnn_define_binary( subgraph, - /*output_min=*/std::numeric_limits::min(), - /*output_max=*/std::numeric_limits::max(), + xnn_binary_add, + &v18_params, v17, v14, v18, @@ -3476,10 +3478,11 @@ xnn_subgraph_t QS8MobileNetV2() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v22_params = { std::numeric_limits::min(), std::numeric_limits::max() }; + status = xnn_define_binary( subgraph, - /*output_min=*/std::numeric_limits::min(), - /*output_max=*/std::numeric_limits::max(), + xnn_binary_add, + &v22_params, v21, v18, v22, @@ -3607,10 +3610,11 @@ xnn_subgraph_t QS8MobileNetV2() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v29_params = { std::numeric_limits::min(), std::numeric_limits::max() }; + status = xnn_define_binary( subgraph, - /*output_min=*/std::numeric_limits::min(), - /*output_max=*/std::numeric_limits::max(), + xnn_binary_add, + &v29_params, v28, v25, v29, @@ -3679,10 +3683,11 @@ xnn_subgraph_t QS8MobileNetV2() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v33_params = { std::numeric_limits::min(), std::numeric_limits::max() }; + status = xnn_define_binary( subgraph, - /*output_min=*/std::numeric_limits::min(), - /*output_max=*/std::numeric_limits::max(), + xnn_binary_add, + &v33_params, v32, v29, v33, @@ -3751,10 +3756,11 @@ xnn_subgraph_t QS8MobileNetV2() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v37_params = { std::numeric_limits::min(), std::numeric_limits::max() }; + status = xnn_define_binary( subgraph, - /*output_min=*/std::numeric_limits::min(), - /*output_max=*/std::numeric_limits::max(), + xnn_binary_add, + &v37_params, v36, v33, v37, @@ -3882,10 +3888,11 @@ xnn_subgraph_t QS8MobileNetV2() { return nullptr; } - status = xnn_define_add2( + 
xnn_binary_params v44_params = { std::numeric_limits::min(), std::numeric_limits::max() }; + status = xnn_define_binary( subgraph, - /*output_min=*/std::numeric_limits::min(), - /*output_max=*/std::numeric_limits::max(), + xnn_binary_add, + &v44_params, v43, v40, v44, @@ -3954,10 +3961,11 @@ xnn_subgraph_t QS8MobileNetV2() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v48_params = { std::numeric_limits::min(), std::numeric_limits::max() }; + status = xnn_define_binary( subgraph, - /*output_min=*/std::numeric_limits::min(), - /*output_max=*/std::numeric_limits::max(), + xnn_binary_add, + &v48_params, v47, v44, v48, @@ -4085,10 +4093,11 @@ xnn_subgraph_t QS8MobileNetV2() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v55_params = { std::numeric_limits::min(), std::numeric_limits::max() }; + status = xnn_define_binary( subgraph, - /*output_min=*/std::numeric_limits::min(), - /*output_max=*/std::numeric_limits::max(), + xnn_binary_add, + &v55_params, v54, v51, v55, @@ -4157,10 +4166,11 @@ xnn_subgraph_t QS8MobileNetV2() { return nullptr; } - status = xnn_define_add2( + xnn_binary_params v59_params = { std::numeric_limits::min(), std::numeric_limits::max() }; + status = xnn_define_binary( subgraph, - /*output_min=*/std::numeric_limits::min(), - /*output_max=*/std::numeric_limits::max(), + xnn_binary_add, + &v59_params, v58, v55, v59, diff --git a/test/reshape-helpers.cc b/test/reshape-helpers.cc index c824a026598..8805b21fc29 100644 --- a/test/reshape-helpers.cc +++ b/test/reshape-helpers.cc @@ -115,9 +115,9 @@ xnn_runtime_t SetupBinary(const std::vector &input0_dims, return nullptr; } - const float output_min = -std::numeric_limits::infinity(); - const float output_max = std::numeric_limits::infinity(); - if (xnn_define_add2(subgraph, output_min, output_max, input0_id, input1_id, + struct xnn_binary_params params = {-std::numeric_limits::infinity(), + std::numeric_limits::infinity()}; + if (xnn_define_binary(subgraph, 
xnn_binary_add, ¶ms, input0_id, input1_id, output_id, /*flags=*/0) != xnn_status_success) { return nullptr; } diff --git a/test/subgraph-tester.h b/test/subgraph-tester.h index c57c96caf81..24f64368e34 100644 --- a/test/subgraph-tester.h +++ b/test/subgraph-tester.h @@ -327,9 +327,10 @@ class SubgraphTester { } SubgraphTester& AddAddition(uint32_t input_id1, uint32_t input_id2, uint32_t output_id) { + struct xnn_binary_params params = {-std::numeric_limits::infinity(), + std::numeric_limits::infinity()}; const xnn_status status = - xnn_define_add2(subgraph_.get(), -std::numeric_limits::infinity(), - std::numeric_limits::infinity(), input_id1, + xnn_define_binary(subgraph_.get(), xnn_binary_add, ¶ms, input_id1, input_id2, output_id, 0 /* flags */); EXPECT_EQ(status, xnn_status_success); @@ -403,9 +404,10 @@ class SubgraphTester { } SubgraphTester& AddDivide(uint32_t input_id1, uint32_t input_id2, uint32_t output_id) { + struct xnn_binary_params params = {-std::numeric_limits::infinity(), + std::numeric_limits::infinity()}; const xnn_status status = - xnn_define_divide(subgraph_.get(), -std::numeric_limits::infinity(), - std::numeric_limits::infinity(), input_id1, + xnn_define_binary(subgraph_.get(), xnn_binary_divide, ¶ms, input_id1, input_id2, output_id, 0 /* flags */); EXPECT_EQ(status, xnn_status_success); @@ -482,9 +484,10 @@ class SubgraphTester { } SubgraphTester& AddMultiply(uint32_t input_id1, uint32_t input_id2, uint32_t output_id) { + struct xnn_binary_params params = {-std::numeric_limits::infinity(), + std::numeric_limits::infinity()}; const xnn_status status = - xnn_define_multiply2(subgraph_.get(), -std::numeric_limits::infinity(), - std::numeric_limits::infinity(), input_id1, + xnn_define_binary(subgraph_.get(), xnn_binary_multiply, ¶ms, input_id1, input_id2, output_id, 0 /* flags */); EXPECT_EQ(status, xnn_status_success); @@ -499,9 +502,10 @@ class SubgraphTester { } SubgraphTester& AddSubtract(uint32_t input_id1, uint32_t input_id2, uint32_t 
output_id) { + struct xnn_binary_params params = {-std::numeric_limits::infinity(), + std::numeric_limits::infinity()}; const xnn_status status = - xnn_define_subtract(subgraph_.get(), -std::numeric_limits::infinity(), - std::numeric_limits::infinity(), input_id1, + xnn_define_binary(subgraph_.get(), xnn_binary_subtract, ¶ms, input_id1, input_id2, output_id, 0 /* flags */); EXPECT_EQ(status, xnn_status_success); diff --git a/test/workspace.cc b/test/workspace.cc index aa05c3a783c..71e92f2ded5 100644 --- a/test/workspace.cc +++ b/test/workspace.cc @@ -89,10 +89,11 @@ void DefineGraphWithStaticData(xnn_subgraph_t* subgraph, std::array d XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id); ASSERT_NE(output_id, XNN_INVALID_VALUE_ID); + struct xnn_binary_params params = {-std::numeric_limits::infinity(), + std::numeric_limits::infinity()}; ASSERT_EQ(xnn_status_success, - xnn_define_add2(*subgraph, -std::numeric_limits::infinity(), - std::numeric_limits::infinity(), input_id, - static_value_id, output_id, /*flags=*/0)); + xnn_define_binary(*subgraph, xnn_binary_add, ¶ms, input_id, + static_value_id, output_id, /*flags=*/0)); } void DefineGraphWithPersistentTensors(xnn_subgraph_t* subgraph, std::array dims)